In [77]:

# cleanup Journal Title, Publisher and Cost fields
# add new field Subject from Journal Title

# Determine the five most common journals 
# Count total number of articles for each journal
# Calc mean, median and mode for open-access cost per article for each journal and update it with the missing values.
# Identify the open access prices paid by subject area.

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from difflib import SequenceMatcher

def similar(a, b): return SequenceMatcher(None, a, b).ratio()

#This function expects a raw string. The purpose of this function is to take the string
#text and remove double dashes, punctuactions and special characters.
def text_cleaner(text):
    text = re.sub(r'--',' ',text)
    text = re.sub(r'_','',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub('[^a-zA-z\s]', "", text)
    text = ' '.join(text.split())
    return text

df = pd.read_csv("WELLCOME_APCspend2013_forThinkful.csv",encoding='ISO-8859-1')

# Add 4 columns - Conversion Rate, Cost, Subject and Open Access Cost
df.insert(5,'Conversion Rate',0.00)
df.insert(6,'Cost',0.00)
df.insert(7,'Open Access Cost',0.00)

# Or we can also use Rename method
#df.rename(columns = {'COST (£) charged to Wellcome (inc VAT when charged)':'Open Access Cost'}, inplace = True) 


df.head(5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Conversion Rate,Cost,Open Access Cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00,0.0,0.0,0.0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04,0.0,0.0,0.0
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,0.0,0.0,0.0
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,0.0,0.0,0.0
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88,0.0,0.0,0.0


In [78]:
df=df.rename(columns = {'PMID/PMCID':'PMCID'})
df.head()

Unnamed: 0,PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Conversion Rate,Cost,Open Access Cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00,0.0,0.0,0.0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04,0.0,0.0,0.0
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,0.0,0.0,0.0
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,0.0,0.0,0.0
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88,0.0,0.0,0.0


In [79]:
df.columns
df.shape

(2127, 8)

In [80]:
# Columns with NAN using True/False
# False means it doesn't have a NAN
df.isnull().any()

# For entire DataFrame
df.isnull().any().any()

# Total Number of Missing NA
df.isnull().sum().sum()  #200

# Columns with NAN using Integer
df.isnull().sum()


PMCID                                                  199
Publisher                                                0
Journal title                                            1
Article title                                            0
COST (£) charged to Wellcome (inc VAT when charged)      0
Conversion Rate                                          0
Cost                                                     0
Open Access Cost                                         0
dtype: int64

In [81]:
#Adding A Default Value or Filling the Missing Data
#df_with_0 = df.fillna(0)

# Fill it with the mean
#df['DURATION'].mean()
#df_with_mean = df.DURATION.fillna(df['DURATION'].mean())

## Droping NA
df.head()

df.isnull().sum().sum() #200

df.shape



(2127, 8)

In [82]:
# Drop rows with NaN in a specific column . 
# here we are removing Missing values in Journal title column
df = df.dropna(subset=['Journal title'])

In [83]:
df.shape


(2126, 8)

In [84]:
# Cleanup Journal Title

# Strip the extra spaces
df['Journal title'] = df['Journal title'].str.strip()

#Convert to uppercase
df['Journal title'] = df['Journal title'].str.upper()

# Replace & with and 
df['Journal title'] = df['Journal title'].str.replace('&','and')

df['Journal title'] = df['Journal title'].str.replace('\(','')
df['Journal title'] = df['Journal title'].str.replace('\)','')
df['Journal title'] = df['Journal title'].str.replace('\n','')


df.head(5)



Unnamed: 0,PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Conversion Rate,Cost,Open Access Cost
0,,CUP,PSYCHOLOGICAL MEDICINE,Reduced parahippocampal cortical thickness in ...,£0.00,0.0,0.0,0.0
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,£2381.04,0.0,0.0,0.0
2,23043264 PMC3506128,ACS,J MED CHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,0.0,0.0,0.0
3,23438330 PMC3646402,ACS,J MED CHEM,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,0.0,0.0,0.0
4,23438216 PMC3601604,ACS,J ORG CHEM,Regioselective opening of myo-inositol orthoes...,£685.88,0.0,0.0,0.0


In [85]:
#process and clean the Journal title column 
# Create a list to store the data
get_title = []
for row in df['Journal title']:
    clean_title = text_cleaner(row) #for each row process and clean the text
    get_title.append(clean_title) #store the clean text in a list
# Create a column from the list
df['clean_title'] = get_title

In [86]:
df.head()

Unnamed: 0,PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Conversion Rate,Cost,Open Access Cost,clean_title
0,,CUP,PSYCHOLOGICAL MEDICINE,Reduced parahippocampal cortical thickness in ...,£0.00,0.0,0.0,0.0,PSYCHOLOGICAL MEDICINE
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,£2381.04,0.0,0.0,0.0,BIOMACROMOLECULES
2,23043264 PMC3506128,ACS,J MED CHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,0.0,0.0,0.0,J MED CHEM
3,23438330 PMC3646402,ACS,J MED CHEM,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,0.0,0.0,0.0,J MED CHEM
4,23438216 PMC3601604,ACS,J ORG CHEM,Regioselective opening of myo-inositol orthoes...,£685.88,0.0,0.0,0.0,J ORG CHEM


In [87]:
# Calculate Conversion Rate based on £ or $
df['Conversion Rate'] = df['COST (£) charged to Wellcome (inc VAT when charged)'].str[0].apply(lambda x: 1 if x == '£' else 0.77)

# Remove £ and $ from the Cost field using Replace method
df['Cost']= df['COST (£) charged to Wellcome (inc VAT when charged)'].str.replace('£','')
df['Cost']= df['Cost'].str.replace('$','')
df['Cost']= df['Cost'].str.replace('999999','0')


# Convert Cost to Float64
df['Cost'] = df['Cost'].astype('float64')

#Convert the Cost to £
df['Open Access Cost'] = df['Cost'] * df['Conversion Rate']

df.head()

Unnamed: 0,PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),Conversion Rate,Cost,Open Access Cost,clean_title
0,,CUP,PSYCHOLOGICAL MEDICINE,Reduced parahippocampal cortical thickness in ...,£0.00,1.0,0.0,0.0,PSYCHOLOGICAL MEDICINE
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,£2381.04,1.0,2381.04,2381.04,BIOMACROMOLECULES
2,23043264 PMC3506128,ACS,J MED CHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,1.0,642.56,642.56,J MED CHEM
3,23438330 PMC3646402,ACS,J MED CHEM,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,1.0,669.64,669.64,J MED CHEM
4,23438216 PMC3601604,ACS,J ORG CHEM,Regioselective opening of myo-inositol orthoes...,£685.88,1.0,685.88,685.88,J ORG CHEM


In [88]:
df.dtypes

PMCID                                                   object
Publisher                                               object
Journal title                                           object
Article title                                           object
COST (£) charged to Wellcome (inc VAT when charged)     object
Conversion Rate                                        float64
Cost                                                   float64
Open Access Cost                                       float64
clean_title                                             object
dtype: object

In [89]:
# View unique values
newdf = pd.DataFrame(columns=['Journal title'])
newdf = df['Journal title'].unique()

for a in newdf:
    for b in df['Journal title']:
        ratio = similar(str(a),str(b)) 
        if ratio >= 0.8 and ratio < 1:
            df['Journal title'] = df['Journal title'].str.replace(b,a)
            #print("b = " + b + " Replaced by a = " + a)
           


In [90]:
stats = df.groupby('Journal title')['Journal title'].count().sort_values(ascending=False)
stats


Journal title
PLOS 1  ONE                                                      200
JOURNAL BIOLOGICAL CHEMISTRY                                      81
JOURNAL OF CELL PHYSIOLOGY                                        68
BMC GENOMICS.                                                     38
NEUROLMAGE                                                        31
NUCLEIC ACID RESEARCH                                             29
JOURNAL OF THE NEUROLOGICAL SCIENCES                              29
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES PNAS              29
THE FEBS JOURNAL                                                  27
DEVELOPMENTAL SCIENCE                                             26
PLOS 1 PATHOGENS                                                  24
JOURNAL OF VIROL                                                  23
PLOS NEGLECTED TROPICAL DISEASES                                  23
JOURNAL OF MICROSCOPY                                             23
HUMAN MOLECULAR GENE

In [91]:
#print('Mean =', np.mean(df['Open Access Cost']))

results = df.groupby(['Journal title']).agg({'Open Access Cost':['size','min','max','mean', 'median', 'sum']})

results

Unnamed: 0_level_0,Open Access Cost,Open Access Cost,Open Access Cost,Open Access Cost,Open Access Cost,Open Access Cost
Unnamed: 0_level_1,size,min,max,mean,median,sum
Journal title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ACADEMY OF NUTRITION AND DIETETICS,1,2379.54,2379.54,2379.540000,2379.540,2379.54
ACS CHEMICAL BIOLOGY,5,947.07,2286.73,1418.186000,1294.590,7090.93
ACS CHEMICAL NEUROSCIENCE,1,1186.80,1186.80,1186.800000,1186.800,1186.80
ACS NANO,2,642.89,693.39,668.140000,668.140,1336.28
"ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL CRYSTALLOGRAPHY",2,771.42,773.74,772.580000,772.580,1545.16
ACTA CRYSTALLOGRAPHICA SECTION F: STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS,2,785.60,807.67,796.635000,796.635,1593.27
"ACTA CRYSTALLOGRAPHICA, SECTION D",1,757.18,757.18,757.180000,757.180,757.18
ACTA CRYSTALLOGRAPHY D,1,774.19,774.19,774.190000,774.190,774.19
ACTA DERMATO VENEREOLOGICA,1,653.96,653.96,653.960000,653.960,653.96
ACTA F,2,750.16,754.90,752.530000,752.530,1505.06


In [94]:
results.duplicated()

Journal title
ACADEMY OF NUTRITION AND DIETETICS                                                         False
ACS CHEMICAL BIOLOGY                                                                       False
ACS CHEMICAL NEUROSCIENCE                                                                  False
ACS NANO                                                                                   False
ACTA CRYSTALLOGRAPHICA SECTION D,  BIOLOGICAL CRYSTALLOGRAPHY                              False
ACTA CRYSTALLOGRAPHICA SECTION F: STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS    False
ACTA CRYSTALLOGRAPHICA, SECTION D                                                          False
ACTA CRYSTALLOGRAPHY D                                                                     False
ACTA DERMATO VENEREOLOGICA                                                                 False
ACTA F                                                                                     False
ACTA OPTHALMOLOG