In [1]:
import numpy as np
import pandas as pd
import collections as collect
import os
from IPython.display import clear_output

In [2]:
# Number of genomes drawn per host/set/passage combination; handed to
# controller() and ultimately used as the sample size in stateSampler().
popSize = 50_000

In [3]:
def controller(popSize,Fitnesses,Freqs,Statuses,outfile,statList=["B","N","D","L","All"]):
    """Run the sampling pipeline once per status and write one CSV per status.

    Parameters
    ----------
    popSize   : number of genomes to sample, forwarded to Counter().
    Fitnesses : path of the fitness table CSV (missing values become 0.0).
    Freqs     : path of the frequency table CSV (missing values become 0.0).
    Statuses  : path of the status table CSV (read as-is).
    outfile   : file-name suffix; each result is written to Stat+outfile.
    statList  : statuses to process; the default list is never modified here.

    Returns None. Side effects: writes one CSV per entry of statList and
    prints a completion message.
    """
    # The input tables do not depend on Stat, so parse them once instead of
    # re-reading all three files from disk for every status.
    fitnessTable=pd.read_csv(Fitnesses).fillna(0.0) #read in Fitness Table
    freqTable=pd.read_csv(Freqs).fillna(0.0)        #read in Freq Table
    statusTable=pd.read_csv(Statuses)               #read in Status Table
    for Stat in statList:
        # mask() overwrites entries of the frame it receives, so every status
        # gets its own copy of the unmasked fitness table.
        WT=mask(fitnessTable.copy(),statusTable,Stat)
        genomeCounts=Counter(WT,freqTable,popSize)#Run function to sample and count genomes
        PD=convert(genomeCounts)
        PD.to_csv(Stat+outfile)
    #clear_output()
    print("Complete. Outfile "+outfile+" written.")

In [10]:
def mask(WT,ST,Stat):
    """Neutralise every fitness entry whose status is not Stat.

    Parameters
    ----------
    WT   : fitness DataFrame; columns from the fifth onward are masked, the
           first four are left untouched. Modified in place.
    ST   : status DataFrame providing, for each masked column of WT, a column
           of the same name holding the status labels.
    Stat : status to keep. The special value "All" keeps everything.

    Returns the same WT object that was passed in.
    """
    if Stat=="All":
        return(WT)
    for col in WT.columns[4:]:
        print(col)
        # Assign through one .loc call on the frame itself. The chained form
        # WT[col].loc[...] = 1.0 may write to a temporary Series (and always
        # does under pandas Copy-on-Write), leaving WT unmasked.
        # 1.0 is used because downstream fitnesses are multiplied together.
        WT.loc[ST[col]!=Stat,col]=1.0#mask all mutations not of type Stat in genome
    return(WT)

In [11]:
def Counter(WT,FT,popSize):
    """Sample genomes for every host/set/passage combination and score them.

    For each combination of the unique values in WT.host, WT.set and
    WT.passage, runSampling() is called and the entries of every returned
    genome are multiplied together.

    Returns a nested dict: host -> set -> passage -> list of genome products.
    Progress is printed and the cell output is cleared after each combination.
    """
    fitnessByGroup={}
    for hostName in WT.host.unique():
        perSet={}
        fitnessByGroup[hostName]=perSet
        for setName in WT.set.unique():
            perPassage={}
            perSet[setName]=perPassage
            for passageName in WT.passage.unique():
                # progress report for the combination about to be sampled
                print(passageName)
                print(hostName)
                print(setName)
                label=hostName+"_"+setName+"_"+passageName
                print(label)
                sampledGenomes=runSampling(hostName,setName,passageName,popSize,WT,FT)
                print("Tabulating...")
                products=[]
                for sampledGenome in sampledGenomes:
                    # genome fitness = product of its sampled fitness values
                    products.append(sampledGenome.prod())
                perPassage[passageName]=products
                print("\t...done.")
                clear_output()
    print(fitnessByGroup)
    return(fitnessByGroup)

In [12]:
def runSampling(H,S,passage,popSize,WT,FT):
    """Build the sampling matrices for one host/set/passage and sample genomes.

    Rows of FT and WT whose set, host and passage equal S, H and passage are
    selected; everything from the fifth column onward becomes the probability
    matrix (from FT) and the value matrix (from WT) given to stateSampler().

    Returns the sampler's result as an array, transposed so that each row is
    one sampled genome.
    """
    colnames=WT.columns

    def _matrixFor(table):
        # keep only the rows of this combination, drop the four leading columns
        wanted=(table.set==S)&(table.host==H)&(table.passage==passage)
        return np.array(table.loc[wanted,:].iloc[:,4:])

    pArray=_matrixFor(FT)
    wArray=_matrixFor(WT)
    sampled=stateSampler(pArray,wArray,popSize,colnames)
    return(np.array(sampled).T)

In [13]:
def stateSampler(pArray,wArray,N,colnames,rng=None):
    """Sample N values per row, weighted by that row's probabilities.

    Parameters
    ----------
    pArray   : 2-D array; row i holds the selection probabilities for row i
               of wArray.
    wArray   : 2-D array; row i holds the values to choose from.
    N        : number of draws per row.
    colnames : accepted for interface compatibility; not used.
    rng      : optional object with a numpy-style choice(a=, p=, size=) method
               (e.g. numpy.random.default_rng(seed)) for reproducible runs.
               Defaults to numpy's global random state.

    Returns a list with one array of N sampled values per row of pArray.
    """
    # Report the size actually used (N), not the notebook-level popSize global.
    print("Sampling "+str(N)+" genomes...")
    sampler=np.random if rng is None else rng
    choice=[sampler.choice(a=wArray[i], p=pArray[i], size=N) for i in range(len(pArray))] #position
    print("\t...done.\n")
    return choice

In [14]:
def convert(genomeCounts):#converts to panda DF from Count dicts
    """Flatten the nested host -> set -> passage dict into one DataFrame.

    Each list of values becomes a block of rows (values in column 0) tagged
    with a "name" column holding host_set_passage. Blocks keep their own
    0..n-1 index, so index values repeat across blocks.

    Returns the combined DataFrame, or an empty DataFrame when genomeCounts
    has no entries. Clears the cell output and prints progress messages.
    """
    clear_output()
    print("Converting output...")
    frames=[]
    for H,bySet in genomeCounts.items():
        for S,byPassage in bySet.items():
            for passage,values in byPassage.items():
                PDentries=pd.DataFrame(values)
                PDentries["name"]=H+"_"+S+"_"+passage
                frames.append(PDentries)
    # DataFrame.append no longer exists in pandas 2.x, and appending inside
    # the loop recopied the growing frame each time; concatenate once instead.
    # pd.concat rejects an empty list, hence the guard.
    PD=pd.concat(frames) if frames else pd.DataFrame()
    print("...done.")
    return(PD)

In [16]:
# Input tables (fitness, frequency, status) read by controller().
infileFitness,infileFreq,infileStatus=(
    "./Data_Tables/fitnessTable_Sampling.csv",
    "./Data_Tables/freqTable_Sampling.csv",
    "./Data_Tables/statusTable_Sampling.csv",
)

# One output file per status is written, named <status>_FitnessSamples.csv.
controller(
    popSize=popSize,
    Fitnesses=infileFitness,
    Freqs=infileFreq,
    Statuses=infileStatus,
    outfile="_FitnessSamples.csv",
)

Converting output...
...done.
Complete. Outfile _FitnessSamples.csv written.
