In [1]:
from .config import expand_iupac
from operator import mul
from functools import reduce
import pandas as pd
from collections import Counter

# Size of each entry in expand_iupac, keyed the same way
# (presumably IUPAC code -> number of bases it can stand for; see .config).
possibilities = {code: len(expansion) for code, expansion in expand_iupac.items()}

# Paths of the per-run summary CSVs, one for each of ../output0 .. ../output2.
# The position in this list is later stored as Ambiguities_Allowed.
fn = ["../output{}/summory.csv".format(run) for run in range(3)]

# Characters tallied by amb_count: the lower-case codes plus the gap character.
_AMBIGUOUS_CODES = frozenset("yrwskmdvhbn-")


def amb_count(seq):
    """Return how many characters of ``seq`` are ambiguity codes or gaps.

    Matching is case-insensitive, so upper-case codes such as ``N`` are
    counted as well. This agrees with ``amb_combinations``, which upper-cases
    every nucleotide before looking it up.

    Args:
        seq: Iterable of single-character strings, typically a primer
            sequence string.

    Returns:
        int: Number of positions whose lower-cased character is one of
        ``yrwskmdvhbn-``; 0 for an empty sequence.
    """
    return sum(1 for nuc in seq if nuc.lower() in _AMBIGUOUS_CODES)

def amb_combinations(seq):
    """Return the product of the per-position possibility counts of ``seq``.

    Each character is upper-cased and looked up in the module-level
    ``possibilities`` mapping; the looked-up values are multiplied together.

    Args:
        seq: Iterable of single-character strings, typically a primer
            sequence string.

    Returns:
        int: The product of ``possibilities[nuc.upper()]`` over ``seq``.
        An empty sequence yields 1 (the empty product) instead of the
        ``TypeError`` that ``reduce`` raises without an initial value.

    Raises:
        KeyError: If an upper-cased character is not a key of
            ``possibilities``.
    """
    return reduce(mul, (possibilities[nuc.upper()] for nuc in seq), 1)

In [2]:
# Read every per-run summary and tag its rows with the run it came from.
dfs = []
for allowed, path in enumerate(fn):
    run_df = pd.read_csv(path)
    run_df["Ambiguities_Allowed"] = allowed

    exon_names = run_df["Exon_Name"]
    # The last character of Exon_Name is parsed as the primer version ...
    run_df["Primer_Version"] = exon_names.map(lambda name: int(name[-1:]))
    # ... and the final three characters are stripped from the name itself.
    run_df["Exon_Name"] = exon_names.map(lambda name: name[:-3])

    dfs.append(run_df.rename(columns={"species": "Species_List"}))

# Stack the runs into one frame (row labels from the individual files are kept).
df = pd.concat(dfs)
# Species_List arrives as text: split it on whitespace and trim quote, comma,
# bracket and space characters from both ends of every token.
# NOTE(review): a species name containing a space would be split into several
# entries - confirm the names are space-free.
species_lists = df["Species_List"].map(
    lambda text: [token.strip(" ',[]") for token in text.split()]
)
df["Species_List"] = species_lists
df["Species_Count"] = species_lists.map(len)

# Drop the two columns that are not carried forward, order the rows, and keep
# only the first row for each (exon, left primer, right primer) combination.
# Given the sort order, "first" means the lowest Ambiguities_Allowed and then
# the lowest Primer_Version.
df = (
    df.drop(["Gene_Product", "Species"], axis=1)
    .sort_values(by=["Exon_Name", "Ambiguities_Allowed", "Primer_Version"])
    .drop_duplicates(
        subset=["Exon_Name", "PRIMER_LEFT_0_SEQUENCE", "PRIMER_RIGHT_0_SEQUENCE"],
        keep="first",
    )
)

# Give the exported columns their final names.
df = df.rename(columns={
    'Exon_Name': 'CDS_Ortholog',
    'PRIMER_LEFT_0_SEQUENCE': 'PRIMER_LEFT_SEQUENCE',
    'PRIMER_RIGHT_0_SEQUENCE': 'PRIMER_RIGHT_SEQUENCE',
    'PRIMER_LEFT_0_TM': 'PRIMER_LEFT_TM',
    'PRIMER_RIGHT_0_TM': 'PRIMER_RIGHT_TM',
})

# Scan each primer exactly once; the total and the maximum are derived from
# the per-primer values instead of re-running the helpers row by row.
left_amb = df["PRIMER_LEFT_SEQUENCE"].apply(amb_count)
right_amb = df["PRIMER_RIGHT_SEQUENCE"].apply(amb_count)

# Counting the concatenated primers equals adding the two separate counts.
df['Total_Ambiguities'] = left_amb + right_amb
df['Left_Ambiguities'] = left_amb
df['Right_Ambiguities'] = right_amb
df['Max_Ambiguities'] = df[['Left_Ambiguities', 'Right_Ambiguities']].max(axis=1)

left_comb = df["PRIMER_LEFT_SEQUENCE"].apply(amb_combinations)
right_comb = df["PRIMER_RIGHT_SEQUENCE"].apply(amb_combinations)

# NOTE(review): the total is the SUM of the two primers' combination counts
# (not their product), as in the original - confirm this is the intended metric.
df['Total_Combinations'] = left_comb + right_comb
df['Left_Combinations'] = left_comb
df['Right_Combinations'] = right_comb

# Key every row by (ortholog, left primer, right primer).
# A DataFrame cannot be assigned to ``df.index`` directly: recent pandas
# versions raise "Index data must be 1-dimensional". Build a MultiIndex from
# the three columns instead. The levels are deliberately left unnamed so they
# cannot clash with the identically named columns (e.g. on reset_index()).
df.index = pd.MultiIndex.from_arrays([
    df[col].values
    for col in ("CDS_Ortholog", "PRIMER_LEFT_SEQUENCE", "PRIMER_RIGHT_SEQUENCE")
])

# Columns kept in the combined table, in their final left-to-right order.
column_order = [
    # identification
    'CDS_Ortholog', 'Primer_Version', 'Ambiguities_Allowed', 'Species_Count',
    # ambiguity counts
    'Total_Ambiguities', 'Left_Ambiguities', 'Right_Ambiguities', 'Max_Ambiguities',
    # combination counts
    'Total_Combinations', 'Left_Combinations', 'Right_Combinations',
    # score and target
    'PI_Score', 'Target_Sequence_Length',
    # primers
    'PRIMER_LEFT_SEQUENCE', 'PRIMER_RIGHT_SEQUENCE',
    'PRIMER_LEFT_TM', 'PRIMER_RIGHT_TM',
    # species
    'Species_List',
]
df = df[column_order]

In [3]:
# Write the combined table to a CSV in the working directory; the row index
# is not exported.
filename = "combined_summory.csv"
df.to_csv(path_or_buf=filename, index=False)

In [4]:
# NOTE(review): bokeh.charts was later moved out of Bokeh into the separate
# bkcharts package, so this import only works on an old Bokeh release -
# confirm the pinned version before re-running.
from bokeh.charts import Scatter, Histogram, output_notebook, output_file, show
# Configure Bokeh to display figures inline in the notebook.
output_notebook()

In [5]:
# Hover fields for the scatter plot. Every entry pairs a label with the
# "@column" reference of the same name, so the pairs are generated from the
# list of column names.
tooltip_columns = [
    'CDS_Ortholog',
    'Ambiguities_Allowed',
    'Primer_Version',
    'PI_Score',
    'Target_Sequence_Length',
    'Total_Combinations',
    'PRIMER_LEFT_SEQUENCE',
    'PRIMER_LEFT_TM',
    'PRIMER_RIGHT_SEQUENCE',
    'PRIMER_RIGHT_TM',
    'Species_List',
]
tooltips = [(column, '@' + column) for column in tooltip_columns]

# PI score against target length, one point per row, colored by the larger
# of the two per-primer ambiguity counts.
p = Scatter(
    df,
    x='Target_Sequence_Length',
    y='PI_Score',
    color="Max_Ambiguities",
    tooltips=tooltips,
    title="Score vs Insert_Length Colored by Max_Ambiguities",
)

# Send the figure to a standalone HTML file and display it.
output_file("scatter.html")
show(p)

In [6]:
def _ambiguity_histogram(values, title, bins):
    """Build a histogram of ``df[values]`` colored by Max_Ambiguities."""
    return Histogram(
        df,
        values=values,
        title=title,
        color="Max_Ambiguities",
        bins=bins,
        legend="top_right",
    )


# All three charts are built first, then written out and shown one by one.
h1 = _ambiguity_histogram(
    "Target_Sequence_Length",
    "Insert_Length Distribution Colored by Max_Ambiguities",
    30,
)
h2 = _ambiguity_histogram(
    'PI_Score',
    "Score Distribution Colored by Max_Ambiguities",
    30,
)
h3 = _ambiguity_histogram(
    'Species_Count',
    "Species_Count Distribution Colored by Max_Ambiguities",
    13,
)

for html_path, chart in (
    ("len_hist.html", h1),
    ("score_hist.html", h2),
    ("sp_count_hist.html", h3),
):
    output_file(html_path)
    show(chart)

INFO:bokeh.core.state:Session output file 'len_hist.html' already exists, will be overwritten.


INFO:bokeh.core.state:Session output file 'score_hist.html' already exists, will be overwritten.


INFO:bokeh.core.state:Session output file 'sp_count_hist.html' already exists, will be overwritten.
