In [2]:
# import modules

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
import pandas as pd
import seaborn as sns

In [3]:
# import GEO reference data

# import table with accession, platform, and series
geoAPS = pd.read_csv('../geo_sample.csv')

# import table with datasets IDs
geoDS = pd.read_csv('../geo_series_table.csv', low_memory = False)
geoDS.columns = ['Series', 'Title', 'Series Type', 'Taxonomy', 
                 'Sample Count', 'Datasets', 'Supplementary Types', 
                 'Supplementary Links', 'PubMed ID', 'SRA Accession', 
                 'Contact', 'Release Date']

# add datasets column by merging
allData = pd.merge(geoAPS, geoDS, how = 'outer', on = 'Series')
geoReference = allData[['Series', 'Accession', 'Platform', 'Datasets']]

geoReference

Unnamed: 0,Series,Accession,Platform,Datasets
0,GSE506,GSM1,GPL4,
1,GSE506,GSM2,GPL4,
2,GSE462,GSM3,GPL5,
3,GSE462,GSM4,GPL5,
4,GSE462,GSM5,GPL5,
...,...,...,...,...
3280011,GSE136775,,,
3280012,GSE136776,,,
3280013,GSE137458,,,
3280014,GSE137562,,,


In [4]:
# import SRA reference data
sraReference = pd.read_csv('../sraIDfull.csv', error_bad_lines = False, low_memory=False, quoting=3)
sraReference = sraReference[['SRAStudy', 'Run', 'Experiment', 
                             'BioProject', 'Submission', 'Sample']]
sraReference

Unnamed: 0,SRAStudy,Run,Experiment,BioProject,Submission,Sample
0,ERP000767,ERR3550121,ERX3556726,PRJEB2600,ERA2143351,ERS3773255
1,ERP000767,ERR3550142,ERX3556747,PRJEB2600,ERA2143351,ERS3773255
2,ERP000767,ERR3550147,ERX3556752,PRJEB2600,ERA2143351,ERS3773263
3,ERP000767,ERR3550125,ERX3556730,PRJEB2600,ERA2143351,ERS3773263
4,ERP000767,ERR3550137,ERX3556742,PRJEB2600,ERA2143351,ERS3773252
...,...,...,...,...,...,...
7676058,SRP002493,SRR049674,SRX020916,,SRA017695,SRS073299
7676059,SRP002493,SRR049675,SRX021028,,SRA017695,SRS073299
7676060,SRP002493,SRR049677,SRX021029,,SRA017695,SRS073299
7676061,SRP002064,SRR042496,SRX020084,PRJNA46359,DIEGO,SRS011854


In [5]:
# import data scraped from PubMed XML files
pmcData = pd.read_csv('../data_tables/preFilterMatrix.csv')
pmcData

# temporary mini dataframe...
# pmcData = pmcData.iloc[141499:144406]
# pmcData

Unnamed: 0,journal,pmc_ID,accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1
1,Alzheimers_Res_Ther,PMC3706879,GSE45534
2,Alzheimers_Res_Ther,PMC3706879,GSE45534
3,Alzheimers_Res_Ther,PMC3706879,GSE45534
4,Alzheimers_Res_Ther,PMC3706879,GSE45534
...,...,...,...
144401,Dis_Markers,PMC3834650,GDS1059
144402,Dis_Markers,PMC3834650,GDS1059
144403,Dis_Markers,PMC3834650,GDS1
144404,Case_Rep_Hematol,PMC6462343,PRJNA437812


In [6]:
# define function returning corresponding metadata or Study/Series ID given any information on a dataset
def grabRelated(table, key, key_col, out_col):
    indices = table.index[table[key_col] == key].tolist()
    if(len(indices) > 0):
        allHits = table.loc[indices][[out_col]]
        modeHits = allHits.mode()
        if(modeHits.empty):
            return 'NaN'
        else:
            return modeHits.iloc[0,0]
    else:
        return 'NaN'

# define functions performing QC on accessions, checking if they exist in the references
def sraChecksOut(acc, col):
    if(acc in sraReference[col].tolist()):
        return True
    else:
        return False
    
def geoChecksOut(acc, col):
    if(acc in geoReference[col].tolist()):
        return True
    else:
        return False

In [6]:
# check every accession against the corresponding reference, and convert to STUDY/SERIES
# add converted accessions to a new list, to be added as a column later

converted_acc = []

for acc in pmcData['accession']:
    style = ''
    # assign a style to the accession, corresponds to column names
    # SRA styles
    if('SRP' in acc or 'ERP' in acc or 'DRP' in acc):
        style = 'SRAStudy'
    elif('SRR' in acc or 'ERR' in acc or 'DRR' in acc):
        style = 'Run'
    elif('SRX' in acc or 'ERX' in acc or 'DRX' in acc):
        style = 'Experiment'
    elif('PRJNA' in acc or 'PRJD' in acc or 'PRJEB' in acc):
        style = 'BioProject'
    elif('SRA' in acc or 'ERA' in acc or 'DRA' in acc):
        style = 'Submission'
    elif('SRS' in acc or 'ERS' in acc or 'DRS' in acc):
        style = 'Sample'
    # GEO styles
    elif('GSE' in acc):
        style = 'Series'
    elif('GSM' in acc):
        style = 'Accession'
    elif('GPL' in acc):
        style = 'Platform'
    elif('GDS' in acc):
        style = 'Datasets'
    else:
        style = ''
        
    # for SRA accessions, check if they exist in the SRA reference
    # ...and add corresponding STUDY ID to new column
    if(style == 'SRAStudy' or style == 'Run' or style == 'Experiment' or 
       style == 'BioProject' or style == 'Submission' or style == 'Sample'):
        if(not sraChecksOut(acc, style)):
            converted_acc.append('NaN')
        else:
            converted_acc.append(grabRelated(sraReference, acc, style, 'SRAStudy'))
    # for GEO accessions, check if they exist in the GEO reference
    # ...and add corresponding SERIES ID to new column
    elif(style == 'Series' or style == 'Accession' or style == 'Platform' or style == 'Datasets'):
        if(not geoChecksOut(acc, style)):
            converted_acc.append('NaN')
        else:
            converted_acc.append(grabRelated(geoReference, acc, style, 'Series'))
    # something doesn't belong, record as missing
    else:
        converted_acc.append('NaN')

# add the converted accession list as a new column in PubMed data table
# pd.set_option('display.min_rows', 50)
pmcData.loc[:,'converted_accession'] = converted_acc
pmcData = pmcData.drop_duplicates()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
5,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
6,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
7,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
8,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350
9,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980
10,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679
11,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293
17,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871


In [8]:
# save 'checkpoint' to CSV file
# pmcData.to_csv('postFilterMatrix.csv')

In [7]:
# temporary cell saving time from big loop...

pmcData = pd.read_csv('../data_tables/postFilterMatrix.csv')
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191
78617,Dis_Markers,PMC3834650,GDS1,
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146


In [8]:
# Add a column tagging each accession as GEO or SRA

repoList = []

for i in pmcData['converted_accession']:
    if(type(i) == str):
        if('GSE' in i):
            repoList.append('GEO')
        elif('SRP' in i or 'ERP' in i or 'DRP' in i):
            repoList.append('SRA')
        else:
            repoList.append('NaN')
    else:
        repoList.append('NaN')
        
pmcData['repository'] = repoList
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO
...,...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO
78617,Dis_Markers,PMC3834650,GDS1,,
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA


In [9]:
# import SRA attribute data
# CAUTION: huge file! Time delay on import...

sraAttributes = pd.read_csv('../sra_complete_runs.csv', error_bad_lines = False, low_memory=False)
pd.set_option('display.max_columns', 50)
sraAttributes

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
0,ERR3550121,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556726,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
1,ERR3550142,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556747,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
2,ERR3550147,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556752,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
3,ERR3550125,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556730,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
4,ERR3550137,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556742,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773252,,simple,0,Leishmania donovani,188c9a6e-deba-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7676058,SRR049674,2010-05-27 09:26:35,2014-05-27 02:57:13,35888370,1794418500,0,50,1504,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX020916,VVWTA 0h A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,1C9EE0CEFAE2352A0321B24EAB64EE18,753ED95CDA0B112FD877F38960613605
7676059,SRR049675,2010-05-27 09:26:35,2014-05-27 02:58:17,40264881,2013244050,0,50,1696,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX021028,VVWTA 0.5A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,9F96F35BE762574A084985FD9C918C28,392587111A738BC44AE8878E649D052A
7676060,SRR049677,2010-05-27 09:26:35,2014-05-27 02:58:58,40852259,2042612950,0,50,1732,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX021029,VVWTA 1h A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,22DCD0A93DAA207187EF6C6F6A835A55,416EBCA2B1A6AB70E4F778F16C36FCE5
7676061,SRR042496,2010-05-11 15:16:41,2012-01-19 16:52:13,47760,16505980,0,345,42,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX020084,VirRnaMgMosqCeI,OTHER,RANDOM PCR,VIRAL RNA,SINGLE,0,0.0,LS454,454 GS FLX Titanium,SRP002064,PRJNA46359,2.0,46359,SRS011854,SAMN00012272,simple,1284618,environmental samples,VirRnaMgMosqCeI,,,,,,,no,,,,,SAN DIEGO STATE UNIVERSITY,SRA012164,,public,633E95BCDFC18AD9C8B5DEAAFAE4DD3F,00AB1B11657DAE26FFECDAD623148A3C


In [10]:
# import GEO attribute data and add Series column

geoAttributes = pd.read_csv('../geo_platforms_table.csv')
geoAttributes.rename(columns={'Accession':'Platform'}, inplace=True)
geoAttributes = pd.merge(geoAttributes, geoReference, how = 'left', on = 'Platform')
geoAttributes

Unnamed: 0,Platform,Title,Technology,Taxonomy,Data Rows,Samples Count,Series Count,Contact,Release Date,Series,Accession,Datasets
0,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM1,
1,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM2,
2,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM571,
3,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM572,
4,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM573,
...,...,...,...,...,...,...,...,...,...,...,...,...
2961228,GPL27549,Illumina HiSeq 2000 (Escherichia coli str. K-1...,high-throughput sequencing,Escherichia coli str. K-12 substr. DH10B,0,0,0,GEO,"Sep 30, 2019",,,
2961229,GPL27550,Illumina HiSeq 4000 (Solanum chacoense),high-throughput sequencing,Solanum chacoense,0,0,0,GEO,"Sep 30, 2019",,,
2961230,GPL27552,"Illumina HiSeq 2500 (Caenorhabditis elegans ,V...",high-throughput sequencing,"Caenorhabditis elegans ,Vibrio parahaemolyticus",0,0,0,GEO,"Sep 30, 2019",,,
2961231,GPL27554,Illumina HiSeq 2000 (Trichoderma atroviride),high-throughput sequencing,Trichoderma atroviride,0,0,0,GEO,"Sep 30, 2019",,,


In [11]:
# # temporary mini dataframe...
# pmcData = pmcData.iloc[1:100]
# pmcData

In [12]:
# add column for paper publish date

pmc_dates = pd.read_csv('../data_lists/postFilterDates.txt')

pmcData.loc[:,'pmc_date'] = pmc_dates['date'].tolist()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,2016
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,2016
...,...,...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO,2019
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013
78617,Dis_Markers,PMC3834650,GDS1,,,2013
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019


In [15]:
grabRelated(geoAttributes, 'GSE45534', 'Series', 'Taxonomy')

'Mus musculus'

In [28]:
slicedGEOAtt = geoAttributes[['Series', 'Taxonomy']]
slicedGEOAtt.columns = ['converted_accession', 'geoSpecies']
slicedGEOAtt = slicedGEOAtt.drop_duplicates(subset = ['geoSpecies'])

slicedSRAAtt = sraAttributes[['SRAStudy', 'ScientificName']]
slicedSRAAtt.columns = ['converted_accession', 'sraSpecies']
slicedSRAAtt = slicedSRAAtt.drop_duplicates(subset = ['sraSpecies'])

# slicedGEOAtt
# mergedGEO = pd.merge(pmcData, slicedGEOAtt, how = 'left', on = 'converted_accession')
# mergedSRA = pd.merge(mergedGEO, slicedSRAAtt, how = 'left', on = 'converted_accession')

mergedSRA = pd.merge(pmcData, slicedSRAAtt, how = 'left', on = 'converted_accession')

mergedSRA = mergedSRA.drop_duplicates()
mergedSRA



Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date,sraSpecies
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014,
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,2016,
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,2016,
...,...,...,...,...,...,...,...
318848,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO,2019,
318849,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013,
318850,Dis_Markers,PMC3834650,GDS1,,,2013,
318851,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019,


In [27]:
mergedGEO = pd.merge(mergedSRA, slicedGEOAtt, how = 'left', on = 'converted_accession')
mergedGEO.dropna(subset = ['converted_accession'])
mergedGEO

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date,sraSpecies,geoSpecies
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,,Homo sapiens
1,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,,Homo sapiens
2,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,,Mus musculus
3,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,,Mus musculus
4,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014,,Homo sapiens
...,...,...,...,...,...,...,...,...
97185616,Dis_Markers,PMC3834650,GDS1,,,2013,soil metagenome,"Caenorhabditis elegans ,Vibrio parahaemolyticus"
97185617,Dis_Markers,PMC3834650,GDS1,,,2013,soil metagenome,Trichoderma atroviride
97185618,Dis_Markers,PMC3834650,GDS1,,,2013,soil metagenome,Caenorhabditis elegans;Vibrio parahaemolyticus
97185619,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019,human gut metagenome,


In [25]:
mergedGEO = mergedGEO.dropna(subset = ['converted_accession'])
mergedGEO

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date,sraSpecies,geoSpecies
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,,Homo sapiens
1,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,,Homo sapiens
2,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,,Mus musculus
3,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,,Mus musculus
4,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014,,Homo sapiens
...,...,...,...,...,...,...,...,...
97166104,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013,,Homo sapiens
97166105,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013,,Homo sapiens
97166106,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013,,Homo sapiens
97185619,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019,human gut metagenome,


In [26]:
ds_date = []
hardware = []
lib_type = []
species = []
tput = []
readLength = []
access = []

for acc in pmcData['converted_accession']:
    if(type(acc) == str):
        if('GSE' in acc):
            ds_date.append(grabRelated(geoAttributes, acc, 'Series', 'Release Date'))
            hardware.append(grabRelated(geoAttributes, acc, 'Series', 'Technology'))
            lib_type.append('MISSING')
            species.append(grabRelated(geoAttributes, acc, 'Series', 'Taxonomy'))
            tput.append('MISSING')
            readLength.append('MISSING')
            access.append('MISSING')
        if('SRP' in acc or 'ERP' in acc or 'DRP' in acc):
            ds_date.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'ReleaseDate'))
            hardware.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'Model'))
            lib_type.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'LibraryStrategy'))
            species.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'ScientificName'))
            tput.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'bases'))
            readLength.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'avgLength'))
            access.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'Consent'))
    else:
        ds_date.append('NaN')
        hardware.append('NaN')
        lib_type.append('NaN')
        species.append('NaN')
        tput.append('NaN')
        readLength.append('NaN')
        access.append('NaN')
        
        
# # for reference, amt. of rows in pmcData
# print(len(pmcData['journal']))   

# # amt of items in each potential column
# for i in [ds_date, hardware, lib_type, species, tput, readLength, access]:
#     print(len(i))

pmcData.loc[:,'dataset_date'] = ds_date
pmcData.loc[:,'hardware'] = hardware
pmcData.loc[:,'library_type'] = lib_type
pmcData.loc[:,'species'] = species
pmcData.loc[:,'throughput'] = tput
pmcData.loc[:,'avg_readLength'] = readLength
pmcData.loc[:,'access_restrictions'] = access

pmcData

pmcData.to_csv('pmcFactorMatrix.csv')

ValueError: Must have equal len keys and value when setting with an iterable

In [27]:
# for reference, amt. of rows in pmcData
print(len(pmcData['journal']))   

# amt of items in each potential column
for i in [ds_date, hardware, lib_type, species, tput, readLength, access]:
    print(len(i))

78620
77730
77730
77730
77730
77730
77730
77730


In [41]:
# print(pmcData.iloc[77725:77735])

# print(grabRelated(sraAttributes, 'SRP072215', 'SRAStudy', 'ReleaseDate'))

sraAttributes.loc[sraAttributes['SRAStudy'] == 'SRP072215']

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
2483169,SRR3285514,2016-06-03 15:05:20,2016-04-25 10:51:25,15995925,4830769350,15995925,302,2633,,https://sra-download.ncbi.nlm.nih.gov/traces/s...,SRX1656384,TM6-lab-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,Illumina HiSeq 2500,SRP072215,PRJNA315473,,315473,SRS1356703,SAMN04562534,simple,7227,Drosophila melanogaster,TM6-lab,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,C326BEAC0B99C5773AD0D95D493FE20A,C245C1DA0B1E58BE849B8B9CD1052D84
2526521,SRR3285788,2016-06-03 15:05:20,2016-03-24 10:25:18,18682170,5642015340,18682170,302,2182,,https://sra-download.ncbi.nlm.nih.gov/traces/s...,SRX1656467,CyO-TM3-24759-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356730,SAMN04562528,simple,7227,Drosophila melanogaster,CyO-TM3-24759,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,1AA25CA1170DE7D41636D896D78DC690,C4C3D10727F466D48B4E623F74F7027C
2526522,SRR3285726,2016-06-03 15:05:20,2016-03-24 10:24:13,20594831,6219638962,20594831,302,2403,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX1656466,CyO-TM3-3251-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356709,SAMN04562526,simple,7227,Drosophila melanogaster,CyO-TM3-3251,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,140F93444C0D3A80228C6D6180F36E1E,11AA2460706F193FC938143F2701E6EC
2526523,SRR3285794,2016-06-03 15:05:20,2016-03-24 10:28:48,21715643,6558124186,21715643,302,2563,,https://sra-download.ncbi.nlm.nih.gov/traces/s...,SRX1656468,CyO-TM3-38418-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356729,SAMN04562529,simple,7227,Drosophila melanogaster,CyO-TM3-38418,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,33172E09A551D2A3A7DCBC380491C513,1B66F0E24FFDD9F7D89A32DD187BBD12
2526524,SRR3285619,2016-06-03 15:05:20,2016-03-24 10:30:15,31273523,9444603946,31273523,302,3433,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX1656455,CyO-TM3-504-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356710,SAMN04562525,simple,7227,Drosophila melanogaster,CyO-TM3-504,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,F6B7B76FB0AA622353CF2C7A3C5FDADE,2CD6C85846D1102B6301F4B128F52C5A
2526525,SRR3457432,2016-06-03 15:05:20,2016-04-27 11:59:52,23242094,7019112388,23242094,302,2683,,https://sra-download.ncbi.nlm.nih.gov/traces/s...,SRX1656380,CyO-TM3-mp-22239-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356700,SAMN04562527,simple,7227,Drosophila melanogaster,CyO-TM3-mp-22239,,,,,pooled male and female,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,F27E5FE642A20CEC7941C3F93EE58AF8,FECF114C8023B351F33B93781480EFC4
2526600,SRR3285502,2016-06-03 15:05:20,2016-03-24 10:31:05,44837839,13541027378,44837839,302,5630,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX1656370,TM3-120-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356698,SAMN04562513,simple,7227,Drosophila melanogaster,TM3-120,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,FBB53AE85FCAB70BCB5EAF8FA387687B,E0D8DD73D5E9E8BF6373F313D0FA5946
2526601,SRR3285505,2016-06-03 15:05:20,2016-03-23 11:38:08,18611330,5620621660,18611330,302,2149,,https://sra-download.ncbi.nlm.nih.gov/traces/s...,SRX1656373,TM3-1614-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356691,SAMN04562516,simple,7227,Drosophila melanogaster,TM3-1614,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,7048EA0D8B31B2FC104239B4D7B970E1,C86AE95AFFEEC9753F75B9F7209A8306
2526602,SRR3285580,2016-06-03 15:05:20,2016-04-25 10:50:05,23242094,7019112388,23242094,302,2683,,https://sra-download.st-va.ncbi.nlm.nih.gov/so...,SRX1656374,TM3-1679-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356696,SAMN04562517,simple,7227,Drosophila melanogaster,TM3-1679,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,F27E5FE642A20CEC7941C3F93EE58AF8,FECF114C8023B351F33B93781480EFC4
2526603,SRR3285507,2016-06-03 15:05:20,2016-03-24 12:08:08,21940558,6626048516,21940558,302,2540,,https://sra-download.st-va.ncbi.nlm.nih.gov/so...,SRX1656375,TM3-2053-1,WGS,RANDOM,GENOMIC,PAIRED,0,0.0,ILLUMINA,NextSeq 500,SRP072215,PRJNA315473,,315473,SRS1356695,SAMN04562518,simple,7227,Drosophila melanogaster,TM3-2053,,,,,male,,no,,,,,STOWERS INSTITUTE FOR MEDICAL RESEARCH,SRA393358,,public,6605D8305CA8800A8B2B82E569D4FF5A,93BF226A4401AEB46F51319679DCA3C3


In [29]:
foo = pmcData.iloc[1:77731]

foo.loc[:,'dataset_date'] = ds_date
foo.loc[:,'hardware'] = hardware
foo.loc[:,'library_type'] = lib_type
foo.loc[:,'species'] = species
foo.loc[:,'throughput'] = tput
foo.loc[:,'avg_readLength'] = readLength
foo.loc[:,'access_restrictions'] = access

foo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date,dataset_date,hardware,library_type,species,throughput,avg_readLength,access_restrictions
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,"Sep 28, 2000",SAGE NlaIII,MISSING,Homo sapiens,MISSING,MISSING,MISSING
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014,"May 25, 2004",in situ oligonucleotide,MISSING,Mus musculus,MISSING,MISSING,MISSING
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,2016,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,2016,"May 06, 2013",in situ oligonucleotide,MISSING,Rattus norvegicus,MISSING,MISSING,MISSING
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,2016,"Mar 11, 2002",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77726,G3__Bethesda_,PMC5873915,ERP020955,,,2018,,,MISSING,,MISSING,MISSING,MISSING
77727,G3__Bethesda_,PMC5873901,SRP109316,SRP109316,SRA,2018,"Mar 16, 2009",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
77728,G3__Bethesda_,PMC5873907,PRJNA413446,SRP123654,SRA,2018,,,,,,,
77729,G3__Bethesda_,PMC5873907,PRJNA315473,SRP072215,SRA,2018,2018-03-13 01:47:12,Illumina MiSeq,AMPLICON,human gut metagenome,15187858,602,public


In [49]:
pmcData.loc[:,'dataset_date'] = ds_date
pmcData.loc[:,'hardware'] = hardware
pmcData.loc[:,'library_type'] = lib_type
pmcData.loc[:,'species'] = species
pmcData.loc[:,'throughput'] = tput
pmcData.loc[:,'avg_readLength'] = readLength
pmcData.loc[:,'access_restrictions'] = access


pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,dataset_date,hardware,library_type,species,throughput,avg_readLength,access_restrictions
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,"May 25, 2004",in situ oligonucleotide,MISSING,Mus musculus,MISSING,MISSING,MISSING
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,"May 06, 2013",in situ oligonucleotide,MISSING,Rattus norvegicus,MISSING,MISSING,MISSING
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,"Mar 11, 2002",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO,,,MISSING,,MISSING,MISSING,MISSING
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO,,,MISSING,,MISSING,MISSING,MISSING
10,Alzheimers_Res_Ther,PMC5513091,GSE93678,GSE93678,GEO,"Apr 16, 2013",high-throughput sequencing,MISSING,Mus musculus,MISSING,MISSING,MISSING


In [30]:
pd.set_option('display.min_rows', 100)
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,dataset_date,hardware,library_type,species,avg_thoroughput
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,"May 25, 2004",in situ oligonucleotide,expressionArray;RNA-Seq,Mus musculus,
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,"May 06, 2013",in situ oligonucleotide,expressionArray;RNA-Seq,Rattus norvegicus,
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,"Mar 11, 2002",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO,,,expressionArray;RNA-Seq,,
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO,,,expressionArray;RNA-Seq,,
10,Alzheimers_Res_Ther,PMC5513091,GSE93678,GSE93678,GEO,"Apr 16, 2013",high-throughput sequencing,expressionArray;RNA-Seq,Mus musculus,


In [27]:
for i in [ds_date, hardware, lib_type, species, tput, readLength]:
    print(len(i))

99
99
99
99
28
28


In [None]:
pmcData.to_csv('pmcMatrix_withTechSpecies.csv')