In [10]:
# import modules

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
import pandas as pd
import seaborn as sns

In [11]:
# import GEO reference data

# import table with accession, platform, and series
# Sample-level table (the reference built below uses its Series, Accession and Platform columns).
geoAPS = pd.read_csv('../geo_sample.csv')

# Series-level table; its header row is replaced positionally with these names.
geoSeriesColumns = [
    'Series', 'Title', 'Series Type', 'Taxonomy',
    'Sample Count', 'Datasets', 'Supplementary Types',
    'Supplementary Links', 'PubMed ID', 'SRA Accession',
    'Contact', 'Release Date',
]
geoDS = pd.read_csv('../geo_series_table.csv', low_memory=False)
geoDS.columns = geoSeriesColumns

# Outer join on 'Series' keeps rows from either table that have no partner in
# the other; only the four ID columns are carried into the reference.
allData = geoAPS.merge(geoDS, how='outer', on='Series')
geoReferenceColumns = ['Series', 'Accession', 'Platform', 'Datasets']
geoReference = allData[geoReferenceColumns]

geoReference

Unnamed: 0,Series,Accession,Platform,Datasets
0,GSE506,GSM1,GPL4,
1,GSE506,GSM2,GPL4,
2,GSE462,GSM3,GPL5,
3,GSE462,GSM4,GPL5,
4,GSE462,GSM5,GPL5,
...,...,...,...,...
3280011,GSE136775,,,
3280012,GSE136776,,,
3280013,GSE137458,,,
3280014,GSE137562,,,


In [12]:
# import SRA reference data
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`; use whichever keyword the installed pandas understands.
# 'warn' mirrors the old behaviour: malformed lines are reported and skipped.
pandasMajorMinor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandasMajorMinor >= (1, 3):
    badLineOptions = {'on_bad_lines': 'warn'}
else:
    badLineOptions = {'error_bad_lines': False}

# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
sraReference = pd.read_csv('../sraIDfull.csv', low_memory=False, quoting=3,
                           **badLineOptions)

# keep only the ID columns used to translate accessions into SRA studies
sraReference = sraReference[['SRAStudy', 'Run', 'Experiment',
                             'BioProject', 'Submission', 'Sample']]
sraReference

Unnamed: 0,SRAStudy,Run,Experiment,BioProject,Submission,Sample
0,ERP000767,ERR3550121,ERX3556726,PRJEB2600,ERA2143351,ERS3773255
1,ERP000767,ERR3550142,ERX3556747,PRJEB2600,ERA2143351,ERS3773255
2,ERP000767,ERR3550147,ERX3556752,PRJEB2600,ERA2143351,ERS3773263
3,ERP000767,ERR3550125,ERX3556730,PRJEB2600,ERA2143351,ERS3773263
4,ERP000767,ERR3550137,ERX3556742,PRJEB2600,ERA2143351,ERS3773252
...,...,...,...,...,...,...
7676058,SRP002493,SRR049674,SRX020916,,SRA017695,SRS073299
7676059,SRP002493,SRR049675,SRX021028,,SRA017695,SRS073299
7676060,SRP002493,SRR049677,SRX021029,,SRA017695,SRS073299
7676061,SRP002064,SRR042496,SRX020084,PRJNA46359,DIEGO,SRS011854


In [13]:
# import data scraped from PubMed XML files
# (columns shown in the output: journal, pmc_ID, accession; the same
# paper/accession pair can appear on several rows)
pmcData = pd.read_csv('../data_tables/preFilterMatrix.csv')
pmcData

# temporary mini dataframe...
# pmcData = pmcData.iloc[141499:144406]
# pmcData

Unnamed: 0,journal,pmc_ID,accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1
1,Alzheimers_Res_Ther,PMC3706879,GSE45534
2,Alzheimers_Res_Ther,PMC3706879,GSE45534
3,Alzheimers_Res_Ther,PMC3706879,GSE45534
4,Alzheimers_Res_Ther,PMC3706879,GSE45534
...,...,...,...
144401,Dis_Markers,PMC3834650,GDS1059
144402,Dis_Markers,PMC3834650,GDS1059
144403,Dis_Markers,PMC3834650,GDS1
144404,Case_Rep_Hematol,PMC6462343,PRJNA437812


In [14]:
# define function returning corresponding metadata or Study/Series ID given any information on a dataset
def grabRelated(table, key, key_col, out_col):
    """Return the most common ``out_col`` value among rows where ``key_col == key``.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to search.
    key : object
        Value to look for in ``key_col``.
    key_col : str
        Name of the column that is compared against ``key``.
    out_col : str
        Name of the column whose most frequent value is returned.

    Returns
    -------
    object
        The mode of ``out_col`` over the matching rows (the first one, in
        pandas' sorted order, when several values tie), or the string
        ``'NaN'`` when no row matches or every matching value is missing.
    """
    # Select with a boolean mask instead of collecting index labels and
    # re-selecting by label: label-based selection drags in unrelated rows
    # whenever the index contains duplicate labels.
    hits = table.loc[table[key_col] == key, out_col]

    # mode() ignores missing values, so it is empty both when nothing matched
    # and when all matches are missing.
    modeHits = hits.mode()
    if modeHits.empty:
        return 'NaN'
    return modeHits.iloc[0]

# define functions performing QC on accessions, checking if they exist in the references
def sraChecksOut(acc, col):
    """Return True when ``acc`` occurs in column ``col`` of ``sraReference``.

    Parameters
    ----------
    acc : str
        Accession to look for.
    col : str
        Name of the ``sraReference`` column to search.

    Returns
    -------
    bool
        True if at least one entry of the column equals ``acc``.
    """
    # Compare inside pandas rather than converting the whole column to a
    # Python list on every call.
    return bool((sraReference[col] == acc).any())
    
def geoChecksOut(acc, col):
    """Return True when ``acc`` occurs in column ``col`` of ``geoReference``.

    Parameters
    ----------
    acc : str
        Accession to look for.
    col : str
        Name of the ``geoReference`` column to search.

    Returns
    -------
    bool
        True if at least one entry of the column equals ``acc``.
    """
    # Compare inside pandas rather than converting the whole column to a
    # Python list on every call.
    return bool((geoReference[col] == acc).any())

In [15]:
# The two helpers below have been generalized into grabRelated; the originals are kept, commented out, for reference only.
        # def corrSRAStudy(acc, col):
        #     indices = sraReference.index[sraReference[col] == acc].tolist()
        #     if(len(indices) > 0):
        #         allSRAStudies = sraReference.loc[indices][['SRAStudy']]
        #         modeSRA = allSRAStudies.mode()
        #         if(modeSRA.empty):
        #             return 'NaN'
        #         else:
        #             return modeSRA.iloc[0,0]
        #     else:
        #         return 'NaN'

        # def corrGEOSeries(acc, col):
        #     indices = geoReference.index[geoReference[col] == acc].tolist()
        #     if(len(indices) > 0):
        #         allGEOSeries = geoReference.loc[indices][['Series']]
        #         modeGEO = allGEOSeries.mode()
        #         if(modeGEO.empty):
        #             return 'NaN'
        #         else:
        #             return modeGEO.iloc[0,0]
        #     else:
        #         return 'NaN'

In [6]:
# check every accession against the corresponding reference, and convert to STUDY/SERIES
# add converted accessions to a new list, to be added as a column later

# Accession prefixes for each "style". Every style name doubles as the column
# of the matching reference table in which accessions of that kind are found.
sraStylePrefixes = {
    'SRAStudy': ('SRP', 'ERP', 'DRP'),
    'Run': ('SRR', 'ERR', 'DRR'),
    'Experiment': ('SRX', 'ERX', 'DRX'),
    'BioProject': ('PRJNA', 'PRJD', 'PRJEB'),
    'Submission': ('SRA', 'ERA', 'DRA'),
    'Sample': ('SRS', 'ERS', 'DRS'),
}
geoStylePrefixes = {
    'Series': ('GSE',),
    'Accession': ('GSM',),
    'Platform': ('GPL',),
    'Datasets': ('GDS',),
}


def convertAccession(acc):
    """Translate one accession string into its SRA study / GEO series ID.

    The accession's prefix selects the reference table and column to search.
    Returns the string 'NaN' when the prefix is not recognised or the
    accession does not exist in the corresponding reference column.
    """
    # for SRA accessions, check if they exist in the SRA reference
    # ...and return the corresponding STUDY ID
    for style, prefixes in sraStylePrefixes.items():
        if acc.startswith(prefixes):
            if sraChecksOut(acc, style):
                return grabRelated(sraReference, acc, style, 'SRAStudy')
            return 'NaN'

    # for GEO accessions, check if they exist in the GEO reference
    # ...and return the corresponding SERIES ID
    for style, prefixes in geoStylePrefixes.items():
        if acc.startswith(prefixes):
            if geoChecksOut(acc, style):
                return grabRelated(geoReference, acc, style, 'Series')
            return 'NaN'

    # something doesn't belong, record as missing
    return 'NaN'


converted_acc = []

# The same accession is cited on many rows; resolve each distinct one only
# once, because every lookup scans a reference table with millions of rows.
conversionCache = {}

for acc in pmcData['accession']:
    # missing values arrive as float NaN, which has no string methods
    if not isinstance(acc, str):
        converted_acc.append('NaN')
        continue
    if acc not in conversionCache:
        conversionCache[acc] = convertAccession(acc)
    converted_acc.append(conversionCache[acc])

# add the converted accession list as a new column in PubMed data table
# pd.set_option('display.min_rows', 50)
pmcData.loc[:,'converted_accession'] = converted_acc
pmcData = pmcData.drop_duplicates()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
5,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
6,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
7,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
8,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350
9,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980
10,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679
11,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293
17,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871


In [8]:
# save 'checkpoint' to CSV file
# pmcData.to_csv('postFilterMatrix.csv')

In [45]:
# temporary cell saving time from big loop: reload the saved result of the
# accession conversion instead of recomputing it
# NOTE(review): the (commented-out) checkpoint cell writes 'postFilterMatrix.csv'
# to the working directory, while this reads from '../data_tables/' - confirm
# the file is moved there by hand.

pmcData = pd.read_csv('../data_tables/postFilterMatrix.csv')
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871


In [46]:
# Add a column tagging each accession as GEO or SRA

sraStudyMarkers = ('SRP', 'ERP', 'DRP')
repoList = []

for convertedAcc in pmcData['converted_accession']:
    # Anything that is not a string, or is a string matching neither
    # repository, keeps the 'NaN' placeholder.
    repoTag = 'NaN'
    if type(convertedAcc) == str:
        if 'GSE' in convertedAcc:
            repoTag = 'GEO'
        elif any(marker in convertedAcc for marker in sraStudyMarkers):
            repoTag = 'SRA'
    repoList.append(repoTag)

pmcData['repository'] = repoList
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository
0,Alzheimers_Res_Ther,PMC3707052,GSM1,GSE506,GEO
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO


In [42]:
# import SRA attribute data
# CAUTION: huge file! Time delay on import...

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
# of `on_bad_lines`; use whichever keyword the installed pandas understands.
# 'warn' mirrors the old behaviour: malformed lines are reported and skipped.
pandasMajorMinor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandasMajorMinor >= (1, 3):
    badLineOptions = {'on_bad_lines': 'warn'}
else:
    badLineOptions = {'error_bad_lines': False}

sraAttributes = pd.read_csv('../sra_complete_runs.csv', low_memory=False,
                            **badLineOptions)

# the table is wide; let pandas display up to 50 columns
pd.set_option('display.max_columns', 50)
sraAttributes

Unnamed: 0,Run,ReleaseDate,LoadDate,spots,bases,spots_with_mates,avgLength,size_MB,AssemblyName,download_path,Experiment,LibraryName,LibraryStrategy,LibrarySelection,LibrarySource,LibraryLayout,InsertSize,InsertDev,Platform,Model,SRAStudy,BioProject,Study_Pubmed_id,ProjectID,Sample,BioSample,SampleType,TaxID,ScientificName,SampleName,g1k_pop_code,source,g1k_analysis_group,Subject_ID,Sex,Disease,Tumor,Affection_Status,Analyte_Type,Histological_Type,Body_Site,CenterName,Submission,dbgap_study_accession,Consent,RunHash,ReadHash
0,ERR3550121,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556726,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
1,ERR3550142,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556747,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773255,,simple,0,Leishmania infantum,0c4ac31a-dec5-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
2,ERR3550147,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556752,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
3,ERR3550125,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556730,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773263,,simple,0,Leishmania donovani,0e2d3a2c-dec6-11e9-a5d1-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
4,ERR3550137,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556742,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773252,,simple,0,Leishmania donovani,188c9a6e-deba-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
5,ERR3550158,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556763,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773252,,simple,0,Leishmania donovani,188c9a6e-deba-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
6,ERR3550157,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556762,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773247,,simple,0,Leishmania infantum,1f43052a-deb3-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
7,ERR3550134,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556739,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773247,,simple,0,Leishmania infantum,1f43052a-deb3-11e9-ba7b-68b599768938,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
8,ERR3550135,2019-09-25 19:35:40,,0,0,0,0,0,,,ERX3556740,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773260,,simple,0,Leishmania donovani,28ec979a-dec6-11e9-b6f4-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,
9,ERR3550145,2019-09-25 19:35:41,,0,0,0,0,0,,,ERX3556750,,WGS,RANDOM,GENOMIC,PAIRED,250,0.0,ILLUMINA,Illumina HiSeq 2000,ERP000767,PRJEB2600,,204335,ERS3773260,,simple,0,Leishmania donovani,28ec979a-dec6-11e9-b6f4-68b59976a384,,,,,,,no,,,,,UOC-CCG,ERA2143351,,public,,


In [19]:
# import GEO attribute data and add Series column

# The platforms table calls its ID column 'Accession'; rename it to 'Platform'
# so it can be left-joined to the GEO reference, which adds the Series,
# Accession and Datasets columns to every platform row.
platformTable = pd.read_csv('../geo_platforms_table.csv')
geoAttributes = (
    platformTable
    .rename(columns={'Accession': 'Platform'})
    .merge(geoReference, how='left', on='Platform')
)
geoAttributes

Unnamed: 0,Platform,Title,Technology,Taxonomy,Data Rows,Samples Count,Series Count,Contact,Release Date,Series,Accession,Datasets
0,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM1,
1,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE506,GSM2,
2,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM571,
3,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM572,
4,GPL4,SAGE:10:NlaIII:Homo sapiens,SAGE NlaIII,Homo sapiens,265577,580,58,GEO,"Sep 28, 2000",GSE10,GSM573,
...,...,...,...,...,...,...,...,...,...,...,...,...
2961228,GPL27549,Illumina HiSeq 2000 (Escherichia coli str. K-1...,high-throughput sequencing,Escherichia coli str. K-12 substr. DH10B,0,0,0,GEO,"Sep 30, 2019",,,
2961229,GPL27550,Illumina HiSeq 4000 (Solanum chacoense),high-throughput sequencing,Solanum chacoense,0,0,0,GEO,"Sep 30, 2019",,,
2961230,GPL27552,"Illumina HiSeq 2500 (Caenorhabditis elegans ,V...",high-throughput sequencing,"Caenorhabditis elegans ,Vibrio parahaemolyticus",0,0,0,GEO,"Sep 30, 2019",,,
2961231,GPL27554,Illumina HiSeq 2000 (Trichoderma atroviride),high-throughput sequencing,Trichoderma atroviride,0,0,0,GEO,"Sep 30, 2019",,,


In [48]:
# temporary mini dataframe...
# Take an explicit copy: new columns are later assigned to this frame with
# .loc, and assigning into a bare slice of another frame triggers pandas'
# SettingWithCopyWarning.
pmcData = pmcData.iloc[1:100].copy()
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,dataset_date
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO,
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO,
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO,
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO,
10,Alzheimers_Res_Ther,PMC5513091,GSE93678,GSE93678,GEO,


In [43]:
ds_date = []
hardware = []
lib_type = []
species = []
tput = []
readLength = []
access = []

# Same order as the tuples produced by lookupAttributes below.
attributeLists = (ds_date, hardware, lib_type, species, tput, readLength, access)
missingAttributes = ('NaN',) * len(attributeLists)


def lookupAttributes(acc):
    """Collect the metadata values for one converted accession string.

    Returns a 7-tuple ordered as (dataset date, hardware, library type,
    species, throughput, average read length, access restrictions).
    GEO series take date/technology/taxonomy from ``geoAttributes`` and
    'MISSING' for the rest; SRA studies take every field from
    ``sraAttributes``; any other string yields 'NaN' in every position.
    """
    if 'GSE' in acc:
        # Filter the large table once, then take each column's mode from the
        # small subset instead of re-scanning the full table per column.
        hits = geoAttributes[geoAttributes['Series'] == acc]
        # NOTE(review): geoAttributes is built from the platforms table, so
        # 'Release Date' here looks like the platform's release date rather
        # than the series' own - confirm this is the intended dataset date.
        return (
            grabRelated(hits, acc, 'Series', 'Release Date'),
            grabRelated(hits, acc, 'Series', 'Technology'),
            'MISSING',
            grabRelated(hits, acc, 'Series', 'Taxonomy'),
            'MISSING',
            'MISSING',
            'MISSING',
        )
    elif 'SRP' in acc or 'ERP' in acc or 'DRP' in acc:
        hits = sraAttributes[sraAttributes['SRAStudy'] == acc]
        return (
            grabRelated(hits, acc, 'SRAStudy', 'ReleaseDate'),
            grabRelated(hits, acc, 'SRAStudy', 'Model'),
            grabRelated(hits, acc, 'SRAStudy', 'LibraryStrategy'),
            grabRelated(hits, acc, 'SRAStudy', 'ScientificName'),
            grabRelated(hits, acc, 'SRAStudy', 'bases'),
            grabRelated(hits, acc, 'SRAStudy', 'avgLength'),
            grabRelated(hits, acc, 'SRAStudy', 'Consent'),
        )
    # a string that belongs to neither repository still gets one entry per
    # list, so the lists always stay aligned with the rows of pmcData
    return missingAttributes


# many rows share a converted accession; look each distinct one up only once
attributeCache = {}

for acc in pmcData['converted_accession']:
    if type(acc) == str:
        if acc not in attributeCache:
            attributeCache[acc] = lookupAttributes(acc)
        values = attributeCache[acc]
    else:
        values = missingAttributes
    # exactly one append per list per row
    for column, value in zip(attributeLists, values):
        column.append(value)


# for reference, amt. of rows in pmcData
print(len(pmcData['journal']))

# amt of items in each potential column
for i in [ds_date, hardware, lib_type, species, tput, readLength, access]:
    print(len(i))

99
99
99
99
99
99
99
99


In [49]:
# New column name -> list of per-row values collected earlier; dict order is
# the order in which the columns are appended to the frame.
newColumns = {
    'dataset_date': ds_date,
    'hardware': hardware,
    'library_type': lib_type,
    'species': species,
    'throughput': tput,
    'avg_readLength': readLength,
    'access_restrictions': access,
}
for columnName, columnValues in newColumns.items():
    pmcData.loc[:, columnName] = columnValues


pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,dataset_date,hardware,library_type,species,throughput,avg_readLength,access_restrictions
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,"May 25, 2004",in situ oligonucleotide,MISSING,Mus musculus,MISSING,MISSING,MISSING
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,"May 06, 2013",in situ oligonucleotide,MISSING,Rattus norvegicus,MISSING,MISSING,MISSING
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,"Mar 11, 2002",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO,,,MISSING,,MISSING,MISSING,MISSING
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO,"Nov 07, 2003",in situ oligonucleotide,MISSING,Homo sapiens,MISSING,MISSING,MISSING
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO,,,MISSING,,MISSING,MISSING,MISSING
10,Alzheimers_Res_Ther,PMC5513091,GSE93678,GSE93678,GEO,"Apr 16, 2013",high-throughput sequencing,MISSING,Mus musculus,MISSING,MISSING,MISSING


In [30]:
# raise pandas' 'display.min_rows' option to 100, then display the frame
pd.set_option('display.min_rows', 100)
pmcData

Unnamed: 0,journal,pmc_ID,accession,converted_accession,repository,dataset_date,hardware,library_type,species,avg_thoroughput
1,Alzheimers_Res_Ther,PMC3706879,GSE45534,GSE45534,GEO,"May 25, 2004",in situ oligonucleotide,expressionArray;RNA-Seq,Mus musculus,
2,Alzheimers_Res_Ther,PMC4255636,GSE5281,GSE5281,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
3,Alzheimers_Res_Ther,PMC4731966,GSE67036,GSE67036,GEO,"May 06, 2013",in situ oligonucleotide,expressionArray;RNA-Seq,Rattus norvegicus,
4,Alzheimers_Res_Ther,PMC4731966,GSE1297,GSE1297,GEO,"Mar 11, 2002",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
5,Alzheimers_Res_Ther,PMC4731966,GSE48350,GSE48350,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
6,Alzheimers_Res_Ther,PMC4731966,GSE36980,GSE36980,GEO,,,expressionArray;RNA-Seq,,
7,Alzheimers_Res_Ther,PMC4731966,GSE12679,GSE12679,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
8,Alzheimers_Res_Ther,PMC4731966,GSE12293,GSE12293,GEO,"Nov 07, 2003",in situ oligonucleotide,expressionArray;RNA-Seq,Homo sapiens,
9,Alzheimers_Res_Ther,PMC4731966,GSE46871,GSE46871,GEO,,,expressionArray;RNA-Seq,,
10,Alzheimers_Res_Ther,PMC5513091,GSE93678,GSE93678,GEO,"Apr 16, 2013",high-throughput sequencing,expressionArray;RNA-Seq,Mus musculus,


In [27]:
# sanity check: print the length of each collected attribute list, one per line
attributeLengths = [len(values) for values in
                    (ds_date, hardware, lib_type, species, tput, readLength)]
print(*attributeLengths, sep='\n')

99
99
99
99
28
28


In [None]:
# write pmcData out as CSV (relative path, so it lands in the working directory)
pmcData.to_csv('pmcMatrix_withTechSpecies.csv')