This script produces a table of PMC paper IDs matched to the SRA or GEO datasets they contain, and metadata about those datasets.

In [67]:
# import required modules
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
import pandas as pd
import seaborn as sns

In [68]:
# define function returning corresponding metadata or Study/Series ID given any information on a dataset
def grabRelated(table, key, key_col, out_col):
    indices = table.index[table[key_col] == key].tolist()
    if(len(indices) > 0):
        allHits = table.loc[indices][[out_col]]
        modeHits = allHits.mode()
        if(modeHits.empty):
            return 'NaN'
        else:
            return modeHits.iloc[0,0]
    else:
        return 'NaN'

# define functions to perform QC on accessions, checking if they exist in the references
def sraChecksOut(acc, col):
    if(acc in sraReference[col].tolist()):
        return True
    else:
        return False
    
def geoChecksOut(acc, col):
    if(acc in geoReference[col].tolist()):
        return True
    else:
        return False

STEP I: Clean the data

In [69]:
# import GEO reference data

# import table with accession, platform, and series
geoAPS = pd.read_csv('../geo_sample.csv')

# import table with datasets IDs
geoDS = pd.read_csv('../geo_series_table.csv', low_memory = False)
geoDS.columns = ['Series', 'Title', 'Series Type', 'Taxonomy', 
                 'Sample Count', 'Datasets', 'Supplementary Types', 
                 'Supplementary Links', 'PubMed ID', 'SRA Accession', 
                 'Contact', 'Release Date']

# add datasets column by merging
allData = pd.merge(geoAPS, geoDS, how = 'outer', on = 'Series')
geoReference = allData[['Series', 'Accession', 'Platform', 'Datasets']]

geoReference

Unnamed: 0,Series,Accession,Platform,Datasets
0,GSE506,GSM1,GPL4,
1,GSE506,GSM2,GPL4,
2,GSE462,GSM3,GPL5,
3,GSE462,GSM4,GPL5,
4,GSE462,GSM5,GPL5,
...,...,...,...,...
3280011,GSE136775,,,
3280012,GSE136776,,,
3280013,GSE137458,,,
3280014,GSE137562,,,


In [70]:
# import SRA reference data
sraReference = pd.read_csv('../sraIDfull.csv', error_bad_lines = False, low_memory=False, quoting=3)
sraReference = sraReference[['SRAStudy', 'Run', 'Experiment', 
                             'BioProject', 'Submission', 'Sample']]
sraReference

Unnamed: 0,SRAStudy,Run,Experiment,BioProject,Submission,Sample
0,ERP000767,ERR3550121,ERX3556726,PRJEB2600,ERA2143351,ERS3773255
1,ERP000767,ERR3550142,ERX3556747,PRJEB2600,ERA2143351,ERS3773255
2,ERP000767,ERR3550147,ERX3556752,PRJEB2600,ERA2143351,ERS3773263
3,ERP000767,ERR3550125,ERX3556730,PRJEB2600,ERA2143351,ERS3773263
4,ERP000767,ERR3550137,ERX3556742,PRJEB2600,ERA2143351,ERS3773252
...,...,...,...,...,...,...
7676058,SRP002493,SRR049674,SRX020916,,SRA017695,SRS073299
7676059,SRP002493,SRR049675,SRX021028,,SRA017695,SRS073299
7676060,SRP002493,SRR049677,SRX021029,,SRA017695,SRS073299
7676061,SRP002064,SRR042496,SRX020084,PRJNA46359,DIEGO,SRS011854


In [71]:
# import data scraped from PubMed XML files
pmcData = pd.read_csv('../data_tables/preFilterMatrix.csv')
pmcData

Unnamed: 0,journal,pmc_ID,accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1
1,Alzheimers_Res_Ther,PMC3706879,GSE45534
2,Alzheimers_Res_Ther,PMC3706879,GSE45534
3,Alzheimers_Res_Ther,PMC3706879,GSE45534
4,Alzheimers_Res_Ther,PMC3706879,GSE45534
...,...,...,...
144401,Dis_Markers,PMC3834650,GDS1059
144402,Dis_Markers,PMC3834650,GDS1059
144403,Dis_Markers,PMC3834650,GDS1
144404,Case_Rep_Hematol,PMC6462343,PRJNA437812


In [6]:
# check every accession against the corresponding reference, and convert to STUDY/SERIES
# add converted accessions to a new list, to be added as a column later
# CAUTION: long input list, loop takes several hours to run

converted_acc = []

for acc in pmcData['accession']:
    style = ''
    # assign a style to the accession, corresponds to column names
    # SRA styles
    if('SRP' in acc or 'ERP' in acc or 'DRP' in acc):
        style = 'SRAStudy'
    elif('SRR' in acc or 'ERR' in acc or 'DRR' in acc):
        style = 'Run'
    elif('SRX' in acc or 'ERX' in acc or 'DRX' in acc):
        style = 'Experiment'
    elif('PRJNA' in acc or 'PRJD' in acc or 'PRJEB' in acc):
        style = 'BioProject'
    elif('SRA' in acc or 'ERA' in acc or 'DRA' in acc):
        style = 'Submission'
    elif('SRS' in acc or 'ERS' in acc or 'DRS' in acc):
        style = 'Sample'
    # GEO styles
    elif('GSE' in acc):
        style = 'Series'
    elif('GSM' in acc):
        style = 'Accession'
    elif('GPL' in acc):
        style = 'Platform'
    elif('GDS' in acc):
        style = 'Datasets'
    else:
        style = ''
        
    # for SRA accessions, check if they exist in the SRA reference
    # ...and add corresponding STUDY ID to new column
    if(style == 'SRAStudy' or style == 'Run' or style == 'Experiment' or 
       style == 'BioProject' or style == 'Submission' or style == 'Sample'):
        if(not sraChecksOut(acc, style)):
            converted_acc.append('NaN')
        else:
            converted_acc.append(grabRelated(sraReference, acc, style, 'SRAStudy'))
    # for GEO accessions, check if they exist in the GEO reference
    # ...and add corresponding SERIES ID to new column
    elif(style == 'Series' or style == 'Accession' or style == 'Platform' or style == 'Datasets'):
        if(not geoChecksOut(acc, style)):
            converted_acc.append('NaN')
        else:
            converted_acc.append(grabRelated(geoReference, acc, style, 'Series'))
    # something doesn't belong, record as missing
    else:
        converted_acc.append('NaN')

# add the converted accession list as a new column in PubMed data table
# pd.set_option('display.min_rows', 50)
pmcData.loc[:,'converted_accession'] = converted_acc
pmcData = pmcData.drop_duplicates()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
5,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
6,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
7,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
8,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350
9,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980
10,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679
11,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293
17,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871


In [8]:
# save 'checkpoint' to CSV file
# pmcData.to_csv('postFilterMatrix.csv')

In [72]:
# temporary cell saving time from big loop...

pmcData = pd.read_csv('../data_tables/postFilterMatrix.csv')
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191
78617,Dis_Markers,PMC3834650,GDS1,
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146


Step II: Add all desired factors about each SRA or GEO dataset

In [73]:
# Add a column tagging each accession as GEO or SRA

repoList = []

for i in pmcData['converted_accession']:
    if(type(i) == str):
        if('GSE' in i):
            repoList.append('GEO')
        elif('SRP' in i or 'ERP' in i or 'DRP' in i):
            repoList.append('SRA')
        else:
            repoList.append('NaN')
    else:
        repoList.append('NaN')
        
pmcData['repository'] = repoList
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO
...,...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO
78617,Dis_Markers,PMC3834650,GDS1,,
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA


In [74]:
# import SRA attribute data
# CAUTION: large file! Time delay on import...

sraAttributes = pd.read_csv('../sra_complete_runs.csv', error_bad_lines = False, low_memory=False)
pd.set_option('display.max_columns', 50)
sraAttributes

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
0,ERR3550121,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556726,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
1,ERR3550142,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556747,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
2,ERR3550147,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556752,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
3,ERR3550125,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556730,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
4,ERR3550137,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556742,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773252,,simple,0,Leishmania donovani,188c9a6e-deba-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7676058,SRR049674,2010-05-27 09:26:35,2014-05-27 02:57:13,35888370,1794418500,0,50,1504,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX020916,VVWTA 0h A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,1C9EE0CEFAE2352A0321B24EAB64EE18,753ED95CDA0B112FD877F38960613605
7676059,SRR049675,2010-05-27 09:26:35,2014-05-27 02:58:17,40264881,2013244050,0,50,1696,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX021028,VVWTA 0.5A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,9F96F35BE762574A084985FD9C918C28,392587111A738BC44AE8878E649D052A
7676060,SRR049677,2010-05-27 09:26:35,2014-05-27 02:58:58,40852259,2042612950,0,50,1732,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX021029,VVWTA 1h A,FL-cDNA,RT-PCR,VIRAL RNA,SINGLE,0,0.0,ABI_SOLID,AB SOLiD System 3.0,SRP002493,,2.0,0,SRS073299,SAMN00013848,simple,10245,Vaccinia virus,VACV,,,,,,,no,,,,,NIAID-RML-RTS,SRA017695,,public,22DCD0A93DAA207187EF6C6F6A835A55,416EBCA2B1A6AB70E4F778F16C36FCE5
7676061,SRR042496,2010-05-11 15:16:41,2012-01-19 16:52:13,47760,16505980,0,345,42,,https://sra-downloadb.be-md.ncbi.nlm.nih.gov/s...,SRX020084,VirRnaMgMosqCeI,OTHER,RANDOM PCR,VIRAL RNA,SINGLE,0,0.0,LS454,454 GS FLX Titanium,SRP002064,PRJNA46359,2.0,46359,SRS011854,SAMN00012272,simple,1284618,environmental samples,VirRnaMgMosqCeI,,,,,,,no,,,,,SAN DIEGO STATE UNIVERSITY,SRA012164,,public,633E95BCDFC18AD9C8B5DEAAFAE4DD3F,00AB1B11657DAE26FFECDAD623148A3C


In [75]:
# import GEO attribute data and add Series column

geoAttributes = pd.read_csv('../geo_platforms_table.csv')
geoAttributes.rename(columns={'Accession':'Platform'}, inplace=True)
geoAttributes = pd.merge(geoAttributes, geoReference, how = 'left', on = 'Platform')
geoAttributes

Unnamed: 0,Platform,Title,Technology,Taxonomy,Data Rows,Samples Count,Series Count,Contact,Release Date,Series,Accession,Datasets
0,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM1,
1,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM2,
2,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM571,
3,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM572,
4,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM573,
...,...,...,...,...,...,...,...,...,...,...,...,...
2961228,GPL27549,Illumina HiSeq 2000 (Escherichia coli str. K-1...,high-throughput sequencing,Escherichia coli str. K-12 substr. DH10B,0,0,0,GEO,"Sep 30, 2019",,,
2961229,GPL27550,Illumina HiSeq 4000 (Solanum chacoense),high-throughput sequencing,Solanum chacoense,0,0,0,GEO,"Sep 30, 2019",,,
2961230,GPL27552,"Illumina HiSeq 2500 (Caenorhabditis elegans ,V...",high-throughput sequencing,"Caenorhabditis elegans ,Vibrio parahaemolyticus",0,0,0,GEO,"Sep 30, 2019",,,
2961231,GPL27554,Illumina HiSeq 2000 (Trichoderma atroviride),high-throughput sequencing,Trichoderma atroviride,0,0,0,GEO,"Sep 30, 2019",,,


In [76]:
# np.unique(geoAttributes['Technology'].tolist()).tolist()

In [77]:
# add column for paper publish date
# this data was scraped from XML files on the hoffman2 cluster: /u/scratch/n/nikodm/pmcOA/

pmc_dates = pd.read_csv('../data_lists/postFilterDates.txt')

pmcData.loc[:,'pmc_date'] = pmc_dates['date'].tolist()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,2016
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,2016
...,...,...,...,...,...,...
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO,2019
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013
78617,Dis_Markers,PMC3834650,GDS1,,,2013
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019


In [86]:
# Get every factor we're interested in from our tables of GEO and SRA metadata...

# take a slice of the GEO and SRA attribute tables with only the info we want
# drop duplicate converted_accession entries to avoid unnecessarily large table
slicedGEOAtt = geoAttributes[['Series', 'Release Date', 'Technology', 'Taxonomy']]
slicedGEOAtt.columns = ['converted_accession', 'geoRelease', 'geoHardware', 'geoSpecies']
slicedGEOAtt = slicedGEOAtt.drop_duplicates(subset = ['converted_accession'])

slicedSRAAtt = sraAttributes[['SRAStudy', 'ReleaseDate', 'Model', 
                              'LibraryStrategy', 'ScientificName', 
                              'bases', 'avgLength', 'Consent']]
slicedSRAAtt.columns = ['converted_accession', 'sraRelease', 'sraHardware', 
                        'sraLibrary_strategy', 'sraSpecies', 
                        'sraBases', 'sraAvg_length', 'sraAccess']
slicedSRAAtt = slicedSRAAtt.drop_duplicates(subset = ['converted_accession'])

In [87]:
# merge SRA attributes onto pmcData table
mergedSRA = pd.merge(pmcData, slicedSRAAtt, how = 'left', on = 'converted_accession')
mergedSRA = mergedSRA.drop_duplicates()

# merge GEO attributes onto table of pmcData + SRA Attributes
allFactors = pd.merge(mergedSRA, slicedGEOAtt, how = 'left', on = 'converted_accession')
allFactors = allFactors.dropna(subset = ['converted_accession'])

In [88]:
# clean up columns with factor for both SRA and GEO, rearrange columns
allFactors['species'] = allFactors['sraSpecies'].fillna(allFactors['geoSpecies'])
allFactors = allFactors.drop(labels = ['sraSpecies', 'geoSpecies'], axis = 1)

allFactors['hardware'] = allFactors['sraHardware'].fillna(allFactors['geoHardware'])
allFactors = allFactors.drop(labels = ['sraHardware', 'geoHardware'], axis = 1)

allFactors['repository_date'] = allFactors['sraRelease'].fillna(allFactors['geoRelease'])
allFactors = allFactors.drop(labels = ['sraRelease', 'geoRelease'], axis = 1)

cols = ['journal', 'pmc_ID', 'accession', 'converted_accession', 'repository', 
        'pmc_date', 'repository_date', 'species', 
        'hardware', 'sraLibrary_strategy', 'sraAvg_length', 'sraBases', 'sraAccess']

allFactors = allFactors[cols]
allFactors

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,pmc_date,repository_date,species,hardware,sraLibrary_strategy,sraAvg_length,sraBases,sraAccess
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO,2013,"Sep 28, 2000",Homo sapiens,SAGE NlaIII,,,,
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,2013,"May 25, 2004",Mus musculus,in situ oligonucleotide,,,,
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,2014,"Nov 07, 2003",Homo sapiens,in situ oligonucleotide,,,,
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,2016,"May 06, 2013",Rattus norvegicus,in situ oligonucleotide,,,,
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,2016,"Mar 11, 2002",Homo sapiens,in situ oligonucleotide,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78614,Dis_Markers,PMC6589307,GSE14795,GSE14795,GEO,2019,"Mar 11, 2002",Homo sapiens,in situ oligonucleotide,,,,
78615,Dis_Markers,PMC6589301,GSE62254,GSE62254,GEO,2019,,,,,,,
78616,Dis_Markers,PMC3834650,GDS1059,GSE2191,GEO,2013,"Mar 16, 2009",Homo sapiens,in situ oligonucleotide,,,,
78618,Case_Rep_Hematol,PMC6462343,PRJNA437812,SRP135146,SRA,2019,2018-03-13 01:47:12,human gut metagenome,Illumina MiSeq,AMPLICON,602.0,17743348.0,public


In [81]:
# # old loop that takes forever, replaced by merge functions... holding onto just in case

# ds_date = []
# hardware = []
# lib_type = []
# species = []
# tput = []
# readLength = []
# access = []

# for acc in pmcData['converted_accession']:
#     if(type(acc) == str):
#         if('GSE' in acc):
#             ds_date.append(grabRelated(geoAttributes, acc, 'Series', 'Release Date'))
#             hardware.append(grabRelated(geoAttributes, acc, 'Series', 'Technology'))
#             lib_type.append('MISSING')
#             species.append(grabRelated(geoAttributes, acc, 'Series', 'Taxonomy'))
#             tput.append('MISSING')
#             readLength.append('MISSING')
#             access.append('MISSING')
#         if('SRP' in acc or 'ERP' in acc or 'DRP' in acc):
#             ds_date.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'ReleaseDate'))
#             hardware.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'Model'))
#             lib_type.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'LibraryStrategy'))
#             species.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'ScientificName'))
#             tput.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'bases'))
#             readLength.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'avgLength'))
#             access.append(grabRelated(sraAttributes, acc, 'SRAStudy', 'Consent'))
#     else:
#         ds_date.append('NaN')
#         hardware.append('NaN')
#         lib_type.append('NaN')
#         species.append('NaN')
#         tput.append('NaN')
#         readLength.append('NaN')
#         access.append('NaN')
        
        
# # # for reference, amt. of rows in pmcData
# # print(len(pmcData['journal']))   

# # # amt of items in each potential column
# # for i in [ds_date, hardware, lib_type, species, tput, readLength, access]:
# #     print(len(i))

# pmcData.loc[:,'dataset_date'] = ds_date
# pmcData.loc[:,'hardware'] = hardware
# pmcData.loc[:,'library_type'] = lib_type
# pmcData.loc[:,'species'] = species
# pmcData.loc[:,'throughput'] = tput
# pmcData.loc[:,'avg_readLength'] = readLength
# pmcData.loc[:,'access_restrictions'] = access

# pmcData

# pmcData.to_csv('pmcFactorMatrix.csv')