In [1]:
import numpy as np
import pandas as pd
import re
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
trust = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding = 'ISO-8859-1')

In [19]:
trust.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [20]:
trust.columns.values

array(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title',
       'COST (£) charged to Wellcome (inc VAT when charged)'], dtype=object)

In [21]:
trust.rename(columns={'COST (£) charged to Wellcome (inc VAT when charged)': 'Cost'}, inplace= True)

In [22]:
trust.columns.values

array(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title', 'Cost'], dtype=object)

In [23]:
# this function obtains the 7-digit id that starts with 3 
def get_pmcid(x):
    pmcid = re.findall('3\d{6}', str(x))
    if len(pmcid) > 0:
        return pmcid[- 1]

# this function obtains the 8-digit id that starts with 2 
def get_pmid(x):
    pmid = re.findall('2\d{7}', str(x))
    if len(pmid) > 0:
        return pmid[0]

In [24]:
trust['PMID'] = trust['PMID/PMCID'].apply(get_pmid)
trust['PMCID'] = trust['PMID/PMCID'].apply(get_pmcid)

In [25]:
trust.tail()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,Cost,PMID,PMCID
2122,2901593,Wolters Kluwer Health,Circulation Research,Mechanistic Links Between Na+ Channel (SCN5A) ...,£1334.15,,
2123,3748854,Wolters Kluwer Health,AIDS,Evaluation of an empiric risk screening score ...,£1834.77,,3748854.0
2124,3785148,Wolters Kluwer Health,Pediatr Infect Dis J,Topical umbilical cord care for prevention of ...,£1834.77,,3785148.0
2125,PMCID:\n PMC3647051\n,Wolters Kluwer N.V./Lippinott,AIDS,Grassroots Community Organisations' Contributi...,£2374.52,,3647051.0
2126,PMID: 23846567 (Epub July 2013),Wolters Kluwers,Journal of Acquired Immune Deficiency Syndromes,A novel community health worker tool outperfor...,£2034.75,23846567.0,3846567.0


In [26]:
# now drop the old column
del trust['PMID/PMCID']

In [27]:
# Find duplicate ID_PMC
id_pm_count = trust.groupby('PMID')['PMID'].count()
id_pm_count[id_pm_count > 1]

PMID
22735079    2
Name: PMID, dtype: int64

In [28]:
# Find duplicate ID_PMC
id_pmc_count = trust.groupby('PMCID')['PMCID'].count()
id_pmc_count[id_pmc_count > 1]

PMCID
3173209    2
3381227    2
3401426    2
3405234    2
3413714    2
3435256    2
3528370    2
3529206    2
3597274    2
3599138    2
3599488    2
3613719    2
3647051    2
3676342    2
3708033    2
3746156    2
Name: PMCID, dtype: int64

In [29]:
trust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2127 entries, 0 to 2126
Data columns (total 6 columns):
Publisher        2127 non-null object
Journal title    2126 non-null object
Article title    2127 non-null object
Cost             2127 non-null object
PMID             452 non-null object
PMCID            1788 non-null object
dtypes: object(6)
memory usage: 99.8+ KB


In [30]:
trust.drop_duplicates(subset=['PMID', 'PMCID'],  inplace=True)


In [31]:
trust['Cost'] = trust['Cost'].astype(str)
trust['Cost'] = trust['Cost'].str.strip('£')
trust['Cost'] = trust['Cost'].str.strip('$')

In [32]:
trust['Cost'] = pd.to_numeric(trust['Cost'] , errors='coerce')

In [33]:
trust.to_csv('trust4.csv')

In [34]:
trust[trust.Cost == 999999.00 ].count()

Publisher        36
Journal title    36
Article title    36
Cost             36
PMID              6
PMCID            35
dtype: int64

In [35]:
# There are few rows with cost value as 999999. drop the rows 
trust.drop(trust[trust.Cost == 999999.00 ].index, inplace=True)

In [42]:
# There is one row with value as 192645.000000 drop that row.
trust.drop(trust[trust.Cost == 192645.000000 ].index, inplace=True)

In [43]:
trust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1778 entries, 0 to 2126
Data columns (total 6 columns):
Publisher        1778 non-null object
Journal title    1778 non-null object
Article title    1778 non-null object
Cost             1778 non-null float64
PMID             445 non-null object
PMCID            1740 non-null object
dtypes: float64(1), object(5)
memory usage: 97.2+ KB


In [37]:
trust['Journal title'] = trust['Journal title'].str.lower()
trust['Journal title'] = trust['Journal title'].apply(lambda x: str(x).replace('journal of ',''))
trust['Journal title'] = trust['Journal title'].apply(lambda x: str(x).replace('journal',''))


In [38]:
def change_journal(x):
    if str(x).replace(' ','') == 'plosone' :
        return str(x).replace(' ','')
    else:
        return str(x)
    

In [39]:
trust['Journal title'] = trust['Journal title'].apply(change_journal)

# The five most common journals and the total articles for each:

In [40]:
trust['Journal title'].value_counts().head(5)

plosone                 185
biological chemistry     47
neuroimage               26
plos pathogens           22
plos genetics            22
Name: Journal title, dtype: int64

In [41]:
top_5_Journal = pd.DataFrame(trust['Journal title'].value_counts().head(5)).index.tolist()

# calculate the mean, median, and standard deviation of the open-access cost  for Top 5 journal . 

In [44]:
top5_cost = pd.DataFrame(trust[trust['Journal title'].isin(top_5_Journal) ].Cost)

top5_cost.Cost.describe()

count     301.000000
mean     1214.419169
std       472.821435
min       122.310000
25%       879.360000
50%      1044.550000
75%      1541.480000
max      2503.340000
Name: Cost, dtype: float64

The Average(Mean) Cost is 1214.419169

The Median Cost is 1044.550000

The standard deviation is 472.821435
