In [49]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

# Process diamond output

In [55]:
df = pd.read_csv("../gene_hits.tsv", header=0, index_col=0, sep="\t")

# drop the extra header cols
df = df[df["Query accession"]!="Query accession"]

# Create a sample id and read number columns from query accession 
df["Sample ID"] = df["Query accession"].apply(lambda x: x.split(".")[0])
df["Read number"] = df["Query accession"].apply(lambda x: x.split(".")[1])


## Remove where missing gene names

In [56]:
percent_with_names = sum(df["Gene name"].isna()==False)/df.shape[0]

print(round(percent_with_names*100), "percent of hits matched a gene name on uniprot")

# Only keep where we have a gene name
df = df[df["Gene name"].isna()==False]

42 percent of hits matched a gene name on uniprot


## Get metacyc gene names

In [70]:
pathway_1 = pd.read_csv("../database-building/metacyc-nic-deg-1.txt", 
                        header=1, sep="\t")
pathway_2 = pd.read_csv("../database-building/metacyc-nic-deg-2.txt", 
                        header=1, sep="\t")
pathway_3 = pd.read_csv("../database-building/metacyc-nic-deg-3.txt", 
                        header=1, sep="\t")

metacyc_gene_names = set(pathway_1["Gene name"]) | set(pathway_2["Gene name"]) | set(pathway_3["Gene name"])

## Remove where gene names not in metacyc pathways

In [78]:
df = df[df["Gene name"].apply(lambda x: x in metacyc_gene_names)]

In [82]:
df["E-value"] = df["E-value"].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
df = df.loc[df.groupby("Read number")["E-value"].idxmin()]

# Metadata

# Get a count per sample

In [90]:
df.groupby("Sample ID").count()

Unnamed: 0_level_0,Query accession,Target accession,Sequence identity,Length,Mismatches,Gap openings,Query start,Query end,Target start,Target end,E-value,Bit score,Gene name,Read number
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SRR9641788,113,113,113,113,113,113,113,113,113,113,113,113,113,113
SRR9641789,160,160,160,160,160,160,160,160,160,160,160,160,160,160
SRR9641790,29,29,29,29,29,29,29,29,29,29,29,29,29,29
SRR9641791,106,106,106,106,106,106,106,106,106,106,106,106,106,106
SRR9641792,82,82,82,82,82,82,82,82,82,82,82,82,82,82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR9668746,227,227,227,227,227,227,227,227,227,227,227,227,227,227
SRR9668747,452,452,452,452,452,452,452,452,452,452,452,452,452,452
SRR9668748,273,273,273,273,273,273,273,273,273,273,273,273,273,273
SRR9668749,356,356,356,356,356,356,356,356,356,356,356,356,356,356
