In [254]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

Question 1 - Determine the five most common journals and the total articles for each

In [255]:
# Parse the CSV once and keep this frame untouched as the raw reference copy.
wellcome_trust_raw = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='ISO-8859-1')
# Work on an independent copy so the cleaning steps below never modify the raw
# frame; copying avoids reading and parsing the same file from disk a second time.
wellcome_trust = wellcome_trust_raw.copy()

In [256]:
#Quick sanity check of the loaded frame's (rows, columns) dimensions
wellcome_trust.shape

(2127, 5)

In [257]:
#Preview the first five rows to see the column names and how the values are formatted
wellcome_trust.head(5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [258]:
# The analysis only relies on the journal title, article title, and cost columns,
# so discard any row where at least one of those three values is missing.
wellcome_trust = wellcome_trust.dropna(
    subset=[
        'Journal title',
        'Article title',
        'COST (£) charged to Wellcome (inc VAT when charged)',
    ],
    how='any',
)

In [259]:
#Check to see missing values per column - confirm if our dropna() above worked.

def missing(x):
    """Return how many entries of ``x`` are null.

    ``x`` is expected to expose ``isnull()`` (it is called with each column
    when passed to ``DataFrame.apply(..., axis=0)``).

    Uses the vectorised ``.sum()`` on the boolean mask rather than the Python
    builtin ``sum``, which would iterate over the mask one element at a time.
    """
    return x.isnull().sum()

# Print a heading followed by the null count of every column (one print call,
# newline-separated, so the text output is the same as two consecutive prints).
print("Missing values per column:", wellcome_trust.apply(missing, axis=0), sep="\n")

Missing values per column:
PMID/PMCID                                             198
Publisher                                                0
Journal title                                            0
Article title                                            0
COST (£) charged to Wellcome (inc VAT when charged)      0
dtype: int64


In [260]:
# Upper-case every journal title so differently-cased spellings of the same
# journal end up as the same value; the result goes into a separate column so
# the original 'Journal title' is preserved.
wellcome_trust['Journal title clean'] = wellcome_trust['Journal title'].astype(str).str.upper()

In [261]:
# Also trim leading/trailing whitespace, so padded titles collapse onto the
# same value as their unpadded counterparts.
wellcome_trust['Journal title clean'] = wellcome_trust['Journal title clean'].astype(str).str.strip()

In [262]:
#Show in alphabetical order to compare against value_counts() below
#(sort_values returns a new sorted frame for display; wellcome_trust itself is not reordered)
wellcome_trust.sort_values(by='Journal title clean')

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Journal title clean
439,PMC3378987\n,Elsevier,Academy of Nutrition and Dietetics,Parent support and parent mediated behaviours ...,£2379.54,ACADEMY OF NUTRITION AND DIETETICS
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59,ACS CHEMICAL BIOLOGY
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,Chemical proteomic analysis reveals the drugab...,£1294.78,ACS CHEMICAL BIOLOGY
21,,American Chemical Society,ACS Chemical Biology,Discovery of ?2 Adrenergic Receptor Ligands Us...,£947.07,ACS CHEMICAL BIOLOGY
20,: PMC3805332,American Chemical Society,ACS Chemical Biology,Synthesis of alpha-glucan in mycobacteria invo...,£2286.73,ACS CHEMICAL BIOLOGY
19,PMID: 24015914 PMC3833349,American Chemical Society,ACS Chemical Biology,Discovery of an allosteric inhibitor binding s...,£1267.76,ACS CHEMICAL BIOLOGY
22,PMCID:\n PMC3656742\n,American Chemical Society,ACS Chemical Neuroscience,Continuous online microdialysis using microflu...,£1186.80,ACS CHEMICAL NEUROSCIENCE
34,23373658,American Chemical Society Publications,ACS Nano,Skin dendritic cell targeting via microneedle ...,£693.39,ACS NANO
23,PMCID: 3584654,AMERICAN CHEMICAL SOCIETY,ACS NANO,HYDROXY-TERMINATED CONJUGATED POLYMER NANOPART...,£642.89,ACS NANO
927,PMCID:\n PMC3727331\n,International Union of Crystallography,"Acta Crystallographica Section D, Biological ...",Clustering procedures for the optimal selectio...,£771.42,"ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL ..."


In [263]:
# Raise pandas' display row limit so the long listing below is not truncated.
# Note: this is a global display option and stays in effect for later cells.
pd.options.display.max_rows = 3000
# The 100 most frequent cleaned titles; compare with the alphabetical listing
# to spot spellings of the same journal that need to be combined.
wellcome_trust['Journal title clean'].value_counts().head(100)

PLOS ONE                                                   190
JOURNAL OF BIOLOGICAL CHEMISTRY                             53
NEUROIMAGE                                                  29
NUCLEIC ACIDS RESEARCH                                      26
PLOS PATHOGENS                                              24
PLOS GENETICS                                               24
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES             22
PLOS NEGLECTED TROPICAL DISEASES                            20
HUMAN MOLECULAR GENETICS                                    19
NATURE COMMUNICATIONS                                       19
MOVEMENT DISORDERS                                          15
BMC PUBLIC HEALTH                                           14
BRAIN                                                       14
JOURNAL OF NEUROSCIENCE                                     13
DEVELOPMENTAL CELL                                          12
BIOCHEMICAL JOURNAL                                    

In [264]:
# Fold alternative spellings/abbreviations into one canonical journal title.
# Each key is a variant found in the data; its value is the title it is merged into.
wellcome_trust['Journal title clean'] = wellcome_trust['Journal title clean'].replace({
    'PLOSONE': 'PLOS ONE',
    'PLOS 1': 'PLOS ONE',
    'PNAS': 'PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES',
    'NEUROIMAGE: CLINICAL': 'NEUROIMAGE',
})

In [265]:
#Now that we've combined, get our top 5 journals
#(value_counts() orders by frequency, highest first, so head(5) gives the five most common titles)
wellcome_trust['Journal title clean'].value_counts().head(5)

PLOS ONE                                           206
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          34
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     28
NUCLEIC ACIDS RESEARCH                              26
Name: Journal title clean, dtype: int64

In [266]:
# Restrict the data to the rows whose cleaned title is one of the five most
# common journals identified by the value_counts() ranking.
wellcome_trust_top_journals = wellcome_trust.loc[
    wellcome_trust['Journal title clean'].isin([
        'PLOS ONE',
        'JOURNAL OF BIOLOGICAL CHEMISTRY',
        'NEUROIMAGE',
        'PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES',
        'NUCLEIC ACIDS RESEARCH',
    ])
]

In [267]:
# Total number of articles recorded for each of the top journals
# (count of non-null article titles per cleaned journal title).
wellcome_trust_top_journals.groupby('Journal title clean')[['Article title']].count()

Unnamed: 0_level_0,Article title
Journal title clean,Unnamed: 1_level_1
JOURNAL OF BIOLOGICAL CHEMISTRY,53
NEUROIMAGE,34
NUCLEIC ACIDS RESEARCH,26
PLOS ONE,206
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES,28


Question 2 - Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal

In [276]:
# Convert the cost values to text and trim surrounding whitespace, so the
# currency-symbol removal and float conversion that follow see clean strings.
wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)'] = (
    wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)']
    .astype(str)
    .str.strip()
)

In [269]:
#Define function to strip £ symbol when there is one
def remove_pound_symbol(x):
    """Return the string ``x`` with every '£' character removed.

    Strings without a '£' are returned unchanged.

    The symbol is removed wherever it occurs. The previous version dropped
    the first character whenever a '£' appeared anywhere in the string, so a
    value such as '1500.00£' lost its leading digit and kept the symbol.
    """
    return x.replace('£', '')

In [279]:
# Run remove_pound_symbol over every cost string to drop the currency symbol.
wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)'] = (
    wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)']
    .map(remove_pound_symbol)
)

In [280]:
# With the symbol and whitespace gone, convert the cost strings to floating-point
# numbers so that numeric statistics can be computed on them.
wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)'] = (
    wellcome_trust['COST (£) charged to Wellcome (inc VAT when charged)']
    .astype('float64')
)

In [287]:
# Rebuild the top-five-journal subset from the current frame, so it carries the
# cost column in its numeric form for the statistics below.
wellcome_trust_top_journals_stats = wellcome_trust.loc[
    wellcome_trust['Journal title clean'].isin([
        'PLOS ONE',
        'JOURNAL OF BIOLOGICAL CHEMISTRY',
        'NEUROIMAGE',
        'PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES',
        'NUCLEIC ACIDS RESEARCH',
    ])
]

In [288]:
# Mean cost per article for each of the top journals.
# NOTE(review): several of the displayed means are far above the corresponding
# medians, which suggests a few extreme cost values are skewing them — TODO
# confirm against the data and decide whether those rows should be excluded.
wellcome_trust_top_journals_stats.groupby('Journal title clean')[
    ['COST (£) charged to Wellcome (inc VAT when charged)']
].mean()

Unnamed: 0_level_0,COST (£) charged to Wellcome (inc VAT when charged)
Journal title clean,Unnamed: 1_level_1
JOURNAL OF BIOLOGICAL CHEMISTRY,20264.633962
NEUROIMAGE,2050.756176
NUCLEIC ACIDS RESEARCH,1149.0
PLOS ONE,40664.626019
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES,36473.828929


In [289]:
# Standard deviation of the cost per article for each of the top journals.
wellcome_trust_top_journals_stats.groupby('Journal title clean')[
    ['COST (£) charged to Wellcome (inc VAT when charged)']
].std()

Unnamed: 0_level_0,COST (£) charged to Wellcome (inc VAT when charged)
Journal title clean,Unnamed: 1_level_1
JOURNAL OF BIOLOGICAL CHEMISTRY,137165.488398
NEUROIMAGE,472.211498
NUCLEIC ACIDS RESEARCH,442.940447
PLOS ONE,193764.150303
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES,188833.640483


In [290]:
# Median cost per article for each of the top journals.
wellcome_trust_top_journals_stats.groupby('Journal title clean')[
    ['COST (£) charged to Wellcome (inc VAT when charged)']
].median()

Unnamed: 0_level_0,COST (£) charged to Wellcome (inc VAT when charged)
Journal title clean,Unnamed: 1_level_1
JOURNAL OF BIOLOGICAL CHEMISTRY,1314.53
NEUROIMAGE,2289.245
NUCLEIC ACIDS RESEARCH,852.0
PLOS ONE,899.74
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES,733.125
