In [44]:
import pandas as pd
import numpy as np
import re
import difflib
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import statsmodels.api as sm
import statsmodels.formula.api as smf


Open the data set. Use ISO encoding, for the pound sterling symbol.

In [2]:
welcome = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv',encoding='iso8859_15')


Look at types

In [3]:
welcome.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

Look at unique data in each column.

In [4]:
welcome['Publisher'].value_counts()

Elsevier                                                      387
Public Library of Science                                     278
Wiley                                                         136
Springer                                                       81
Oxford University Press                                        77
OUP                                                            56
Wiley-Blackwell                                                56
ASBMB                                                          46
Nature Publishing Group                                        45
BioMed Central                                                 40
BMC                                                            26
Nature                                                         24
Frontiers                                                      23
BMJ                                                            23
Royal Society                                                  22
Cambridge 

In [5]:
welcome['Journal title'].value_counts()

PLoS One                                                                                 92
PLoS ONE                                                                                 62
Journal of Biological Chemistry                                                          48
Nucleic Acids Research                                                                   21
Proceedings of the National Academy of Sciences                                          19
Human Molecular Genetics                                                                 18
PLoS Neglected Tropical Diseases                                                         18
Nature Communications                                                                    17
PLoS Genetics                                                                            15
PLoS Pathogens                                                                           15
Neuroimage                                                                      

In [6]:
#welcome['Article title'].unique()

In [7]:
#welcome['COST (£) charged to Wellcome (inc VAT when charged)'].unique()

Rename column 'COST (£) charged to Wellcome (inc VAT when charged)' to 'cost'. 
Rename other columns to one string to make copy and paste faster.

In [8]:
welcome.rename(index=str, columns={'PMID/PMCID':'PMID_PMCID',\
              'Journal title':'journal_title', \
              'Article title':'article_title', \
              'COST (£) charged to Wellcome (inc VAT when charged)':'cost'}, 
               inplace=True)

Remove the sterling pound symbol.

In [23]:
welcome.cost = welcome.cost.apply(lambda x: x.strip('£'))
welcome.cost = welcome.cost.apply(lambda x: x.strip('$'))
welcome.cost = welcome.cost.astype(float)

Looking at unique values (above) there are instances of tailing white space.  
Strip the occasional white space from all columns.
First, replace empty NaN with empty strings.

In [24]:
welcome = welcome.replace(np.nan, '', regex=True)

In [27]:
welcome.PMID_PMCID = welcome.PMID_PMCID.apply(lambda x: x.strip())
welcome.Publisher = welcome.Publisher.apply(lambda x: x.strip())
welcome.journal_title = welcome.journal_title.apply(lambda x: x.strip())
welcome.article_title = welcome.article_title.apply(lambda x: x.strip())


Journal titles are Inconsistent. We need a list of good journal title names that can be queried. I did the following:  
1) The national center for biotechnology information has a text file of all cited journals - ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt.  
2) In bash - created a txt file and added the data.  cat > J_Medline.txt  
3) Using awk - selected the titles and medline abbreviations. The abbreviations and full titles are on different lines. awk '($1=="JournalTitle:") {$1=""; print $0} ($1=="MedAbbr:") {$1=""; print $0}' J_Medline.txt > titles_Abbrev.txt  
4) Cleaned up the file using sed (removed all punctuation). sed 's|[ ] [ ) ( :;.,]||g' filename  
5) Moved the abbreviation and full title to the same line. awk '!(NR%2){print$0";"p}{p=$0}' titles_Abbrev.txt > titles_pubmed.txt  
6) Made a csv file of unique journal title names (from the welcome database). used pandas  
7) Cleaned up the file using sed (removed all punctuation). sed 's|[ ] [ ) ( :;.,]||g' filename  
8) Made a script in the CLI, using the bash visual editor. cf. string-math.py (note I imported fuzzywuzzy to match stings.  
9) Ran the script to match the Medline full titles and abbreviations to the welcome data titles. The script prints into a csv file. python string-match.py > pubmed_titles.csv    
10) Cleaned up the file using sed (removed all '[ ]' and '( )' and single quote marks).  sed 's|[][)('"'"']||g' pubmed_titles.csv > pubmed_titles_clean.csv    
11) Read the csv file into a pandas dataframe, then visually inspected records where the fuzzywuzzy score was less than 100. Removed obvious bad records.  
11) Match matched the journal titles from the pubmed_titles.csv to the welcome data, returned the Medline abbreviation.  
12) Ran the script to match welcome titles to the Medline abbreviation. python string-match-abbr.py > pubmed_titles_abbr.csv

Below is the analysis of pubmed_titles. Notes and observations:  
1) Scores of minimum (50) to 70. n=8. No matches.  
2) Scores of 71 to 80, n=5. No matches.  
3) Scores of 86 to 88, n=53. Four matches at scores 87 and 88.  
4) Scores of 89 to 90. n=161. 40 matches.  
5) Scores of 91 to 95 n=173. 131 matches.  
6) Scores of 96 to 99 n=41. 41 matches.  
7) Score of 100. n=542, exact match.  

About 77% match unique titles

In [28]:
pubmed_titles = pd.read_table('pubmed_titles_clean.csv', sep=',',\
                            names=['journal_title', 'full_title', 'score',\
                                  'file_index', 'medline_abbr'], \
                              encoding='iso8859_15')
pubmed_titles.journal_title = pubmed_titles.journal_title.apply(lambda x: x.strip())
pubmed_titles.full_title = pubmed_titles.full_title.apply(lambda x: x.strip())
pubmed_titles.medline_abbr = pubmed_titles.medline_abbr.apply(lambda x: x.strip())
#pd.set_option('display.max_rows', 200)
#pubmed_titles[pubmed_titles.score.between(100,100)].sort_values('score')

In [29]:
abbr_titles = pd.read_table('pubmed_titles_abbr_clean.csv', sep=',',\
                            names=['journal_title', 'medline_abbr', 'score',\
                                  'file_index', 'medline_abbr_2'], \
                              encoding='iso8859_15')
abbr_titles.journal_title = abbr_titles.journal_title.apply(lambda x: x.strip())
abbr_titles.medline_abbr_2 = abbr_titles.medline_abbr_2.apply(lambda x: x.strip())
abbr_titles.medline_abbr = abbr_titles.medline_abbr.apply(lambda x: x.strip())

Merge pubmed_titles with the welcome data set. Do the same with abbr_titles.

In [30]:
welcome_new = welcome.merge(pubmed_titles, how='outer', on='journal_title', validate='m:m')
welcome_new.drop_duplicates(inplace=True)
welcome_new = welcome_new.merge(abbr_titles, how='outer', on='journal_title', validate='m:m')
welcome_new.drop_duplicates(inplace=True)


In [69]:
welcome_new['good_title'] = 'open'
pmid_res = pd.read_csv('pmid_res.txt')
pmid_res.set_index('indx', inplace=True)
pmc_res = pd.read_csv('pmc_res_clean.txt')
pmc_res.set_index('indx', inplace=True)
#mask = (welcome_new.index == pmid_res.indx)

number = pmid_res.index
for i in number.astype(str):
    x = int(i)
    welcome_new.at[x, 'good_title'] = pmid_res.abbr.loc[x]
    
number_2 = pmc_res.index
for i in number_2.astype(str):
    x = int(i)
    welcome_new.at[x, 'good_title'] = pmc_res.abbr.loc[x]
    #welcome_new.good_title.loc[x] = pmc_res.abbr.loc[x]    

welcome_new.good_title = np.where(welcome_new['score_y']==100,\
                                  welcome_new['journal_title'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x']==100,\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(96, 99),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(92,92) &\
                                  welcome_new['score_y'].between(92,92),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(91,91) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(86,86),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])  
welcome_new.good_title = np.where(welcome_new['score_x'].between(93,93) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(74,74),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(95,95),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(87,87) &\
                                  welcome_new['score_y'].between(86,86),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(87,87) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(81,81),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(87,87) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(77,77),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(98,98),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(93,93) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(86,86) &\
                                  welcome_new['score_y'].between(94,94),\
                                  welcome_new['medline_abbr_y'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(92,92) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(94,94) &\
                                  welcome_new['score_y'].between(95,95),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(85,85),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(95,95),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(88,88) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(95,95) &\
                                  welcome_new['score_y'].between(91,91),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(86,86) &\
                                  welcome_new['score_y'].between(93,93),\
                                  welcome_new['medline_abbr_y'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(93,93),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(86,86) &\
                                  welcome_new['score_y'].between(90,90),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(90,90) &\
                                  welcome_new['score_y'].between(86,86),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['score_x'].between(93,93) &\
                                  welcome_new['score_y'].between(93,93),\
                                  welcome_new['medline_abbr_x'],\
                                  welcome_new['good_title'])
welcome_new.good_title = np.where(welcome_new['journal_title']=='Proceedings of the National Academy of Sciences',
                                  'Proc Natl Acad Sci U S A',\
                                  welcome_new['good_title'])

Google lookup 39 with no PMCID or PMID  
1372:Viral suppression following switch to second-line antiretroviral therapy: associations with NRTI resistance and 'sub-therapeutic' drug concentrations prior to switch,  
1485:Beyond the Medical Text: Health and Illness in Early Medieval Italian Sources,  
1483:A Modern History of the Stomach,  
424:alpha2delta-1 gene deletion affects somatosensory neuron function and delays mechanical hypersensitivity in response to peripheral nerve damage,  
981:The impact on a serogroup a meningococcal conjugate vaccine (PsA-TT) on serogroup A menigococcal meningitis and carriage in Chad,  
434:Effect of limb lengthening on internodal length and conduction velocity of peripheral nerve,  
432:A distinct contribution of short wavelength sensitive cones to light evoked activity in the mouse pretectal olivary nucleus (PON),  
1934:Determinants of Enrolment in Voluntary Health Insurance: Evidences from a Mixed Method Study, Kerala, India 	,  
2083:A Visual Assay to Monitor T6SS-mediated Bacterial Competition,  
2071:Parenthood, child-rearing and fertility in England, 1850-1914,  
2062:We Cannot Be Greek Now: Age Difference, Corruption of Youth and the Making of Sexual Inversion 	,  
2000:Use of antibiotic prophylaxis in elective inguinal hernia repair in adults in London and south-east England: a cross sectional survey,  
1315:Reading therapy strengthens top-down connectivity in patients with pure alexia 	,  
1313:Atypical basic movement kinematics in autism spectrum conditions 	,  
1311:The role of human ventral visual cortex in motion perception,  
2096:Acute alcohol-related dysfunction as a predictor of employment status in a longitudinal study of working age men in Izhevsk, Russia. 	,  
1753:Subjective wellbeing: a primer for poverty analysts,  
2180:Proteome-wide analyses of human hepatocytes during differentiation and de-differentiation,  
2198:Length of carotid stenosis predicts peri-procedural stroke or death and restenosis in patients randomized to endovascular treatment or endarterectomy,  
195:Developmental Trajectories of Verbal and Nonverbal Skills in Individuals with a History of Specific Language Impairment: From Childhood to Adolescence,  
183:The association between breastfeeding and HIV on postpartum maternal weight changes over 24 months in rural South Africa,  
1158:Meningococcal carriage in the African meningitis belt.,  
1232:Behavioral genetics and population health interventions for alcohol problems: at odds or oddly in agreement?,  
1096:Prevalence and risk factors for self-reported asthma in an adult Indian population: a cross-sectional survey,  
1057:Bridging the gap between computation and clinical biology: validation of cable theory in humans,  
1054:Exploration, Novelty, Surprise and Free Energy Minimisation,  
1791:Re-axpession of IGF-II is important for Beta Cell Regeneration and Adult Mice,  
1792:Localisation of RNAs into the Germ Plasm of Viellogenic Xenopus Oocytes,  
1793:Cytosolic entry of Shiga-like toxin A chain from the yeast endoplasmic reticulum requires catalytica,  
1868:Laboratory Science in Tropical Medicine,  
1869:Assessment of in vivo metabolism in failing hearts using hyperpolarised 13C magnetic resonance 	,  
1790:Time to pregnancy: a computational method for using the duration of non-conception for predicting   conception 	,  
1789:Sleep-wake sensitive mechanisms of adenosine release in the Basal forebrain of rodents: an in vitro study,  
1787:Cloned defective interfering influenza virus protects ferrets from pandemic 2009 influenza A virus,  
546:Stuck in ruins, or up and coming? 	,  
548:Pharmacy, money and public health in Dakar.,  
1504:Spatial attention, precision and Bayesian inference: a study of saccadic response speed,  
1503:Inter- and Intra-hemispheric connectivity differences when reading Japanese Kanji and Hiragana,  
1095:Socio-economic patterning of tobacco use in Indian states  

In [71]:
welcome_new.at[1372, 'good_title'] = 'J Infect Dis'
welcome_new.at[1485, 'good_title'] = 'Soc Hist Med'
welcome_new.at[1483, 'good_title'] = 'Soc Hist Med'
welcome_new.at[424, 'good_title'] = 'J Neurosci'
welcome_new.at[981, 'good_title'] = 'Lancet'
welcome_new.at[434, 'good_title'] = 'J Neurosci'
welcome_new.at[432, 'good_title'] = 'J Neurosci'
welcome_new.at[1934, 'good_title'] = 'Int J Fi Re'
welcome_new.at[2071, 'good_title'] = 'Hist Fam'
welcome_new.at[2062, 'good_title'] = 'Engl Stud'
welcome_new.at[2000, 'good_title'] = 'Hernia'
welcome_new.at[1315, 'good_title'] = 'Brain'
welcome_new.at[1313, 'good_title'] = 'Brain'
welcome_new.at[1311, 'good_title'] = 'Brain'
welcome_new.at[2096, 'good_title'] = 'Addiction'
welcome_new.at[1753, 'good_title'] = 'J Poverty Soc Justice'
welcome_new.at[2180, 'good_title'] = 'Hepatology'
welcome_new.at[2198, 'good_title'] = 'Int J Stroke'
welcome_new.at[195, 'good_title'] = 'J Speech Lang Hear Res'
welcome_new.at[183, 'good_title'] = 'Trop Med Int Health'
welcome_new.at[1158, 'good_title'] = 'J Infect'
welcome_new.at[1232, 'good_title'] = 'Genet Med'
welcome_new.at[1096, 'good_title'] = 'Int J Tuberc Lung Dis'
welcome_new.at[1057, 'good_title'] = 'Front Physiol'
welcome_new.at[1054, 'good_title'] = 'Front Psychol'
welcome_new.at[1791:1793, 'good_title'] = 'PLoS One'
welcome_new.at[1868, 'good_title'] = 'Pub Ser Rev UK sci tech'
welcome_new.at[1869, 'good_title'] = 'J Cardi Mag Res'
welcome_new.at[1790, 'good_title'] = 'PLoS One'
welcome_new.at[1789, 'good_title'] = 'PLoS One'
welcome_new.at[1787, 'good_title'] = 'PLoS One'
welcome_new.at[546, 'good_title'] = 'Africa'
welcome_new.at[548, 'good_title'] = 'Africa'
welcome_new.at[1504, 'good_title'] = 'Cereb Cortex'
welcome_new.at[1503, 'good_title'] = 'Cereb Cortex'
welcome_new.at[1095, 'good_title'] = 'Int J Tuberc Lung Dis'
welcome_new.at[122, 'good_title'] = 'J Exp Psychol Anim Behav Process'
welcome_new.at[123, 'good_title'] = 'J Exp Psychol Hum Percept Perform'
welcome_new.at[124, 'good_title'] = 'J Exp Psychol Hum Percept Perform'
welcome_new.at[128, 'good_title'] = 'J Exp Psychol Hum Percept Perform'
welcome_new.at[130, 'good_title'] = 'J Biol Chem'
welcome_new.at[131, 'good_title'] = 'J Biol Chem'
welcome_new.at[132, 'good_title'] = 'J Biol Chem'
welcome_new.at[133, 'good_title'] = 'J Biol Chem'
welcome_new.at[133, 'good_title'] = 'J Biol Chem'
welcome_new.at[134, 'good_title'] = 'Mol Cell Proteomics'
welcome_new.at[201, 'good_title'] = 'Evidence, Ethos and Experiment: the Anthropology and History of Medical Research in Africa'
welcome_new.at[212, 'good_title'] = 'Evidence, Ethos and Experiment: the Anthropology and History of Medical Research in Africa'
welcome_new.at[316, 'good_title'] = 'Implement Sci'
welcome_new.at[317, 'good_title'] = 'Parasit Vectors'
welcome_new.at[342, 'good_title'] = 'Reproduction'
welcome_new.at[385:390, 'good_title'] = 'J Neurol Neurosurg Psychiatry'
welcome_new.at[407, 'good_title'] = 'Sex Transm Infect'
welcome_new.at[544, 'good_title'] = 'J Cell Sci'
welcome_new.at[547, 'good_title'] = 'Africa'
welcome_new.at[561, 'good_title'] = 'G3'
welcome_new.at[649, 'good_title'] = 'Biochim Biophys Acta'
welcome_new.at[661, 'good_title'] = 'Biochim Biophys Acta'
welcome_new.at[213, 'good_title'] = 'Evidence, Ethos and Experiment: the Anthropology and History of Medical Research in Africa'
welcome_new.at[310, 'good_title'] = 'BMC Genomics'
welcome_new.at[863, 'good_title'] = 'N Biotechnol'
welcome_new.at[890:894, 'good_title'] = 'Neuroimage Clin'
welcome_new.at[947, 'good_title'] = 'Protein Expr Purif'
welcome_new.at[972, 'good_title'] = 'Stud Hist Philos Biol Biomed Sci'
welcome_new.at[985, 'good_title'] = 'Vet J'
welcome_new.at[1019, 'good_title'] = 'curr Biol'
welcome_new.at[1097:1098, 'good_title'] = 'Acta Crystallogr D Biol Crystallogr'
welcome_new.at[1102, 'good_title'] = 'J Synchrotron Radiat'
welcome_new.at[1104, 'good_title'] = 'Acta Crystallogr D Biol Crystallogr'
welcome_new.at[1105:1106, 'good_title'] ='Acta Crystallogr Sect F Struct Biol Cryst Commun'
welcome_new.at[1148, 'good_title'] = 'Popul Space Place'
welcome_new.at[1200, 'good_title'] = 'Fungal Disease in Britain and the United States 1850-2000'
welcome_new.at[1204, 'good_title'] = 'Tissue Eng Part A'
welcome_new.at[1213, 'good_title'] = 'Proc Natl Acad Sci U S A'
welcome_new.at[1473, 'good_title'] = 'Clin Infect Dis'
welcome_new.at[1477, 'good_title'] = 'J Gerontol A Biol Sci Med Sci.'
welcome_new.at[1526, 'good_title'] = 'Protein Eng Des Sel'
welcome_new.at[1527:1528, 'good_title'] = 'QJM'
welcome_new.at[1751, 'good_title'] = 'Proc Natl Acad Sci U S A'
welcome_new.at[1776, 'good_title'] = 'Proc Natl Acad Sci U S A'
welcome_new.at[1788, 'good_title'] = 'PLoS One'
welcome_new.at[1892, 'good_title'] = 'J R Soc Interface'
welcome_new.at[1895, 'good_title'] = 'Philos Trans R Soc Lond B Biol Sci'
welcome_new.at[1900:1907, 'good_title'] = 'Philos Trans R Soc Lond B Biol Sci'
welcome_new.at[1917, 'good_title'] = 'Health'
welcome_new.at[1969, 'good_title'] = 'Anal Bioanal Chem'
welcome_new.at[2042, 'good_title'] = 'Azania'
welcome_new.at[2054, 'good_title'] = 'Azania'
welcome_new.at[2073:2076, 'good_title'] = 'J Med Chem'
welcome_new.at[2085:2086, 'good_title'] = 'Philos Trans R Soc Lond B Biol Sci'
welcome_new.at[2128:2129, 'good_title'] = 'Birth Defects Res A Clin Mol Teratol.'
welcome_new.at[2162:2163, 'good_title'] = 'FEBS J'
welcome_new.at[2174, 'good_title'] = 'Genesis'
welcome_new.at[2194, 'good_title'] = 'Hum Mutat'
welcome_new.at[2215, 'good_title'] = 'J R Stat Soc Ser C Appl Stat'
welcome_new.at[2267, 'good_title'] = 'Child Care Health Dev'
welcome_new.at[2277, 'good_title'] = 'Child Care Health Dev'
welcome_new.at[2278:2282, 'good_title'] = 'Dev World Bioeth'
welcome_new.at[2284, 'good_title'] = 'Influenza Other Respir Viruses'
welcome_new.at[2334, 'good_title'] = 'J Acquir Immune Defic Syndr'
welcome_new.at[806, 'good_title'] = 'J Cyst Fibros'
welcome_new.at[2365, 'good_title'] = 'Acta Crystallogr D Biol Crystallogr'
welcome_new.at[2393, 'good_title'] = 'Influenza Other Respir Viruses'
welcome_new.at[1752, 'good_title'] = 'Proc Natl Acad Sci U S A'
pd.set_option('display.max_colwidth', -1)

welcome_new[welcome_new.good_title=='nan']

Unnamed: 0,PMID_PMCID,Publisher,journal_title,article_title,cost,full_title,score_x,file_index_x,medline_abbr_x,medline_abbr_y,score_y,file_index_y,medline_abbr_2,good_title


In [72]:
x = welcome_new.groupby('good_title')['cost']
results = x.describe()
results.sort_values(['count', 'mean'], ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
good_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PLoS One,208.0,45085.396442,203999.556408,122.31,852.5050,899.740,1044.7325,999999.00
J Biol Chem,65.0,16773.726769,123860.190202,265.67,1166.6000,1324.570,1586.3600,999999.00
Proc Natl Acad Sci U S A,37.0,850.920000,465.164990,206.32,660.5900,751.840,831.0900,2691.68
Neuroimage,31.0,2212.181290,273.193244,1747.16,2030.8950,2326.430,2408.6800,2518.23
Nucleic Acids Res,29.0,1162.344828,442.150934,710.00,852.0000,852.000,1704.0000,2184.00
PLoS Negl Trop Dis,25.0,1712.532000,587.680631,1283.76,1427.0300,1525.000,1750.0100,3600.00
PLoS Genet,24.0,84839.435000,281865.707794,1394.05,1484.8625,1718.390,1778.2250,999999.00
PLoS Pathog,24.0,84775.044583,281885.544090,1254.02,1440.0000,1600.520,1760.3100,999999.00
Hum Mol Genet,20.0,51921.887500,223154.227034,1700.00,2040.0000,2040.000,2100.0000,999999.00
Nat Commun,19.0,55600.705263,228698.044027,910.80,2562.0000,3642.000,3780.0000,999999.00


Separate out good matches for titles and abbreviations.  
1) Select all abbreviation score (score_y) == 100
2) Select remaining pubmed titles score (score_x) ==100  
3) Select remaining pubmed titles score (score_x) between 96 and 99

In [None]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 300)
welcome_abbr_match_100 = welcome_new[welcome_new.score_y==100]
welcome_title_match_100 = welcome_new[welcome_new.score_x==100]
welcome_title_match_96 = welcome_new[welcome_new.score_x.between(96,99)]\
.sort_values('score_x')
welcome_title_match_91 = welcome_new[welcome_new.score_x.between(91,95)]\
.sort_values('score_x')
welcome_title_match_89 = welcome_new[welcome_new.score_x.between(89,90)]\
.sort_values('score_x')

welcome_title_match_86 = welcome_new[welcome_new.score_x.between(86,88)]\
.sort_values('score_x')

welcome_title_match_50 = welcome_new[welcome_new.score_x.between(50,85)]\
.sort_values('score_x')


In [None]:
pmid = pd.read_csv('PMID.csv', skipinitialspace=True)
pmid = pmid.replace(np.nan, '', regex=True)

pmid.rename(index=str, columns={'Unnamed: 0':'id'}, inplace=True)

replacements = {'PMCID':'PMC'} #, '(\\b)(\\d{7})(\\b)':'PMC3, ',':' ', '(\r\n)':' ', '(\\D)':' '}
pmid.id = pmid.id.replace(replacements, regex=True)
pmid.id[40]==r'3\\d{6}'
#pmid.where(pmid.id!='\\d{7}', other=nan, inplace=False, axis=None, level=None, errors='raise', try_cast=False, raise_on_error=None)[source]
#pmid.id = pmid.id.str.strip()
#pmid['first_part'], pmid['second_part'] = pmid.id.str.split('PMC', 1).str
#pmid.to_csv('numbers.csv', index=False)
#pmid[0:50]

In [None]:
data = {1378:24048963, 1376:22301630, 1374:23945372, 1370:23493728, 1336:23884064,
1496: 23658422, 1271:23328632, 722:23932517, 568:2766312, 440:24285889, 428:23739958,
1077:23847615, 634:23907068, 608:23341602, 604:23650371, 602:23319650, 598:23213245,
1911:23023652, 1866:23166732, 1784:23527131, 1783:24147036, 1898:23282992, 1976:23052214,
2095:21624095, 1782:23844111, 2044:22897899, 2037:22618994, 1781:23239765, 1517:23328711,
1516:22735079, 1492:23162054, 1490:23396536, 2097:23734913, 2286:22364555, 2320:23670821,
2179:22807091, 2178:23775568, 2173:21255266, 2172:23738518, 729:22155499, 730:20800751,
732:23620154, 731:24064150, 775:23137753, 735:23541370, 734:21680110, 733:22591621,
208:23589301, 414:23709760, 1231:23703681, 221:22738332, 2296:22961729, 2295:22730171,
2294:21472932, 806:23642644, 1099:22993091, 1525:23981980, 1489:22345357, 617:2294692}

pmid_list = pd.DataFrame.from_dict(data, orient='index', columns=['pmid'])


In [None]:
pmc_data = {1380:3749005,1382:3583273,1338:3381638,1334:3540040,1284:3604800,1482:3635503,1484:3481955,1752:3606973,2335:3707567,1512:3600839,1495:3476336,
1494:3570213,1184:3547901,420:3676539,422:23426678,805:3793856,982:3627205,980:3525981,578:3529010,576:3479458,574:3670340,572:3704016,570:3511132,
438:3586675,430:3655688,426:3724995,580:3479523,582:3538261,586:3666720,611:3738233,610:3607403,600:3503170,594:3746881,592:3752214,590:3587277,
584:3557024,1929:3682634,1923:3524560,1908:3785133,1867:3388079,1863:3712914,1809:3692470,1961:3549237,2084:3613719,1320:3501977,1319:3634199,
1318:3159156,1316:3692042,1312:3754462,1310:3692041,1309:3673465,1308:3673458,1767:3526116,1488:3496338,1487:3546802,1486:3413389,2104:3749465,
2266:3853540,2287:3465775,2330:3819359,2146:3638371,1307:3286332,1303:3639724,660:3793867,560:3644702,774:3438445,913:3562439,202:3734580,198:3597819,
494:3687256,493:3533127,1160:3627817,1159:3770928,1157:3558801,1155:3717178,1241:3465389,1208:3571806,965:3526787,1129:3563216,1018:3763376,1000:3759846,
970:3694306,969:3664939,968:3651934,967:3542428,1049:3677134,1100:3087623,1073:3715722,1072:3557415,1069:3676342,1046:3689257,1067:3564010,1065:3502006,
1056:3759789,1053:3644713,1050:3521128,1066:3444764,1893:3679597,1894:3758187,1956:3492749,2041:3627851,2043:3778840,2147:3698701,2331:3773237,
2332:3815011,1101:3374517,1110:3510731,1749:3612675,1550:3493395,1530:3475639,1505:3673173,1499:3170535,1497:3580272,1481:3819976,2305:3731578,
612:3549118,613:3606975,614:3612627,615:3491471,616:3529030,1864:3749971,1865:3460945,1207:3717731,1162:3427885,352:3686250}
pmc_list = pd.DataFrame.from_dict(pmc_data, orient='index', columns=['pmc'])

In [None]:
pmid_call = pd.read_table('pmid_res.txt', names=['abbr'])

In [None]:
pmc_get = pmc_list.pmc.apply(str)
for i in pmc_get:
    indx = list(pmc_get.loc[pmc_get == i].index)
    pmc = list(pmc_get.loc[pmc_get == i])
    r = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id='+i+'&retmode=text&rettype=mln-ta&api_key=3ffbbb6bd110815d69e4aa14b7c26d72ab09')
    print(indx, pmc, r.text)

    

In [None]:
titles = pd.read_csv('title_list.txt')
titles

In [None]:
welcome_titles = pd.DataFrame(welcome['Journal title'].unique())
welcome_titles.to_csv('welcome-titles.csv', index=False)