In [49]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Process diamond output

In [55]:
df = pd.read_csv("../gene_hits.tsv", header=0, index_col=0, sep="\t")

# drop the extra header cols
df = df[df["Query accession"]!="Query accession"]

# Create a sample id and read number columns from query accession 
df["Sample ID"] = df["Query accession"].apply(lambda x: x.split(".")[0])
df["Read number"] = df["Query accession"].apply(lambda x: x.split(".")[1])


## Remove where missing gene names

In [56]:
percent_with_names = sum(df["Gene name"].isna()==False)/df.shape[0]

print(round(percent_with_names*100), "percent of hits matched a gene name on uniprot")

# Only keep where we have a gene name
df = df[df["Gene name"].isna()==False]

42 percent of hits matched a gene name on uniprot


## Get metacyc gene names

In [70]:
pathway_1 = pd.read_csv("../database-building/metacyc-nic-deg-1.txt", 
                        header=1, sep="\t")
pathway_2 = pd.read_csv("../database-building/metacyc-nic-deg-2.txt", 
                        header=1, sep="\t")
pathway_3 = pd.read_csv("../database-building/metacyc-nic-deg-3.txt", 
                        header=1, sep="\t")

metacyc_gene_names = set(pathway_1["Gene name"]) | set(pathway_2["Gene name"]) | set(pathway_3["Gene name"])

## Remove where gene names not in metacyc pathways

In [78]:
df = df[df["Gene name"].apply(lambda x: x in metacyc_gene_names)]

In [82]:
df["E-value"] = df["E-value"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
df = df.loc[df.groupby("Read number")["E-value"].idxmin()]

# Metadata

In [92]:
metadata = pd.read_csv("../metadata/metadata.csv")

In [147]:
SRA1_metadata = pd.read_csv("../metadata/PRJNA508385_SRA_metadata.txt")
SRA2_metadata = pd.read_csv("../metadata/PRJNA548383_SRA_metadata.txt")

# This one was formatted really weirdly... csv in a tsv
SRA3_metadata_weird = pd.read_csv("../metadata/PRJNA544061_SRA_metadata.txt")
sra3_metadata_data = [str(x) for x in SRA3_metadata_weird.index]
SRA3_metadata = [(x.split(",")[0][2:], x.split(",")[5]) for x in sra2_metadata_data]
SRA3_metadata = pd.DataFrame(SRA3_metadata, columns=["BioProject","BioSample"])

# CHANGE THIS WE NEED SAMPLE NAME NOT BIOSAMPLE

SRA_metadata = pd.concat([SRA1_metadata[["BioProject","BioSample"]],
                          SRA2_metadata[["BioProject","BioSample"]],
                          SRA3_metadata[["BioProject","BioSample"]]],
                        axis=0)

In [146]:
set(SRA_metadata["BioSample"])

{'SAMN12026608',
 'SAMN12026609',
 'SAMN12026610',
 'SAMN12026611',
 'SAMN12026612',
 'SAMN12026613',
 'SAMN12026614',
 'SAMN12026615',
 'SAMN12026616',
 'SAMN12026617',
 'SAMN12026618',
 'SAMN12026619',
 'SAMN12026620',
 'SAMN12026621',
 'SAMN12026622',
 'SAMN12026623',
 'SAMN12026624',
 'SAMN12026625',
 'SAMN12026626',
 'SAMN12026627',
 'SAMN12026628',
 'SAMN12026629',
 'SAMN12026630',
 'SAMN12026631',
 'SAMN12026632',
 'SAMN12026633',
 'SAMN12026634',
 'SAMN12026635',
 'SAMN12026636',
 'SAMN12026637',
 'SAMN12026638',
 'SAMN12026639',
 'SAMN12026640',
 'SAMN12026641',
 'SAMN12026642',
 'SAMN12026643',
 'SAMN12026644',
 'SAMN12026645',
 'SAMN12026646',
 'SAMN12026647',
 'SAMN12026648',
 'SAMN12026649',
 'SAMN12026650',
 'SAMN12026651',
 'SAMN12026652',
 'SAMN12026653',
 'SAMN12026654',
 'SAMN12026655',
 'SAMN12026656',
 'SAMN12026657',
 'SAMN12026658',
 'SAMN12026659',
 'SAMN12026660',
 'SAMN12026661',
 'SAMN12026662',
 'SAMN12026663',
 'SAMN12026664',
 'SAMN12026665',
 'SAMN12026666

In [138]:
pd.DataFrame(SRA2_metadata, 
             columns=["BioProject","BioSample"])

Unnamed: 0,BioProject,BioSample
0,SRR9668678,SAMN12026672
1,SRR9668679,SAMN12026679
2,SRR9668682,SAMN12026611
3,SRR9668685,SAMN12026612
4,SRR9668688,SAMN12026673
...,...,...
68,SRR9668738,SAMN12026637
69,SRR9668739,SAMN12026669
70,SRR9668744,SAMN12026653
71,SRR9668745,SAMN12026654


In [119]:
SRA2_metadata = pd.read_csv("../metadata/PRJNA544061_SRA_metadata.txt")

SRA2_metadata
#SRA2_metadata[["BioProject","BioSample"]]


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,"Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,BioSampleModel,Bytes,Center Name,collection_date,Conse\t""nt","DA""\tTASTOR\t""E filetype",DATASTORE provider,DATASTORE region,env_broad_scale,env_local_scale,"env_medi""\t""um","et""\t""hnicity",Experiment,\tgeo_loc_name_cou\tntry,...,Library Name,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,samp_collect_device,Sample Name,"SRA Study"""
"SRR9668678,WGS,295,1620853343,PRJNA544061,SAMN12026672,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",738192471,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",gs,s3,"ncbi""""",gs.US,ncbi.public,"s3.us-east-1""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429241,USA,"North""\t""America",...,D351_S37,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D351_S37,"SRP214147"""
"SRR9668679,WGS,295,1746006061,PRJNA544061,SAMN12026679,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",790761447,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,fastq,"sra""""",ncbi,s3,"gs""""",ncbi.public,gs.US,"s3.us-east-1""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429239,USA,"North""\t""America",...,D358_S44,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D358_S44,"SRP214147"""
"SRR9668682,WGS,294,1516442707,PRJNA544061,SAMN12026611,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",666788965,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",gs,s3,"ncbi""""",s3.us-east-1,gs.US,"ncbi.public""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429236,USA,"North""\t""America",...,E104_S4,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,E104_S4,"SRP214147"""
"SRR9668685,WGS,294,1542258574,PRJNA544061,SAMN12026612,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",668359185,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",s3,gs,"ncbi""""",gs.US,s3.us-east-1,"ncbi.public""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429233,USA,"North""\t""America",...,E105_S5,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,E105_S5,"SRP214147"""
"SRR9668688,WGS,296,1527570983,PRJNA544061,SAMN12026673,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",689973150,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,fastq,"sra""""",ncbi,gs,"s3""""",s3.us-east-1,gs.US,"ncbi.public""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429240,USA,"North""\t""America",...,D352_S38,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D352_S38,"SRP214147"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"SRR9668738,WGS,294,1773763634,PRJNA544061,SAMN12026637,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",786143508,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",gs,s3,"ncbi""""",ncbi.public,s3.us-east-1,"gs.US""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429184,USA,"North""\t""America",...,F323_S30,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,F323_S30,"SRP214147"""
"SRR9668739,WGS,295,1565391127,PRJNA544061,SAMN12026669,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",712488933,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",gs,s3,"ncbi""""",ncbi.public,gs.US,"s3.us-east-1""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Asian,SRX6429180,USA,"North Ame""\t""rica",...,D348_S34,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D348_S34,"SRP214147"""
"SRR9668744,WGS,295,1628499209,PRJNA544061,SAMN12026653,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",695063167,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,sra,"fastq""""",ncbi,gs,"s3""""",s3.us-east-1,gs.US,"ncbi.public""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429174,USA,"North""\t""America",...,D303_S11,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D303_S11,"SRP214147"""
"SRR9668745,WGS,293,1963459905,PRJNA544061,SAMN12026654,""MIMS.me,MIGS/MIMS/MIMARKS.human-oral"",839568466,THE\tOHIO\tSTATE\t""UNIVERSITY",2016,public,fastq,"sra""""",ncbi,s3,"gs""""",s3.us-east-1,ncbi.public,"gs.US""""","human""\toral\t""cavity","subgingival""\t""crevice","biofilm""\tmaterial\t""[ENVO:01000156]",Caucasian,SRX6429173,USA,"North""\t""America",...,D305_S13,PAIRED,RANDOM,METAGENOMIC,human oral metagenome,ILLUMINA,2020-07-11T00:00:00Z,endodontic paper-points,D305_S13,"SRP214147"""


# Get a count per sample

In [90]:
df.groupby("Sample ID").count()

Unnamed: 0_level_0,Query accession,Target accession,Sequence identity,Length,Mismatches,Gap openings,Query start,Query end,Target start,Target end,E-value,Bit score,Gene name,Read number
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SRR9641788,113,113,113,113,113,113,113,113,113,113,113,113,113,113
SRR9641789,160,160,160,160,160,160,160,160,160,160,160,160,160,160
SRR9641790,29,29,29,29,29,29,29,29,29,29,29,29,29,29
SRR9641791,106,106,106,106,106,106,106,106,106,106,106,106,106,106
SRR9641792,82,82,82,82,82,82,82,82,82,82,82,82,82,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR9668746,227,227,227,227,227,227,227,227,227,227,227,227,227,227
SRR9668747,452,452,452,452,452,452,452,452,452,452,452,452,452,452
SRR9668748,273,273,273,273,273,273,273,273,273,273,273,273,273,273
SRR9668749,356,356,356,356,356,356,356,356,356,356,356,356,356,356


In [124]:
a = ["a","b","c"]
a[2]

'c'