# Merging and Parsing Mutation Call Files

## Part 1: Filtering calls and inputting files

In [10]:
import os
import pandas as pd

for file in os.listdir("/Users/johnnavarro/AS_variants_list"):
    if file.endswith("_withreads.txt"):
        with open(file) as fin:
            outfile = file.split(".")[0] + "filtered.txt"
            with open(outfile, "w+") as fout:
                for line in fin:
                    if float(line.split("\t")[5]) > 0.03:
                        fout.write(line)

In [11]:
#open files and specify column names


columnNames = ["Sample", "Chromosome", "Position", "Ref Allele", "Alt Allele", "Allele Frequency", "Number Reads", "Total Reads"]
fileNames = []
sampleNames = []
dfList = []
for file in os.listdir("/Users/johnnavarro/AS_variants_list"):
    if file.endswith("_withreadsfiltered.txt"):
        fileNames.append(file)
for f in sorted(fileNames):
    specificNames = []
    #these names do not have the name of strains
    for val in columnNames[0:4]:
        specificNames.append(val)
    #these names have the name of the strain 
    for val in columnNames[4:]:
        val = val + "_" + f.split("_")[0]
        specificNames.append(val)
    #append each dataframe to a list
    dfList.append(pd.read_csv(f,sep="\t",header=None,names=specificNames).drop([specificNames[-1],specificNames[-2]],axis=1))
display(dfList[0].head(20))


Unnamed: 0,Sample,Chromosome,Position,Ref Allele,Alt Allele_AS218,Allele Frequency_AS218
0,AS218,NZ_CP020397.1,51,A,G,1.0
1,AS218,NZ_CP020397.1,126,G,A,1.0
2,AS218,NZ_CP020397.1,495,A,T,1.0
3,AS218,NZ_CP020397.1,516,C,T,1.0
4,AS218,NZ_CP020397.1,651,C,G,1.0
5,AS218,NZ_CP020397.1,735,T,C,1.0
6,AS218,NZ_CP020397.1,1173,T,C,1.0
7,AS218,NZ_CP020397.1,1191,T,C,0.995086
8,AS218,NZ_CP020397.1,1744,G,A,1.0
9,AS218,NZ_CP020397.1,2184,A,G,1.0


## Part 2: Merging and parsing

In [12]:
#starting dataframe prior to merge
df2 = dfList[0].drop(["Sample"],axis=1)
sortednames = sorted(fileNames)
#merge each dataframe to form combined dataframe
for df in dfList[1:]:
    df2 = pd.merge(df2,df.drop("Sample",axis=1),on=["Chromosome","Position","Ref Allele"], how="outer")
df3 = df2.loc[:,df2.columns.str.startswith("Allele Frequency")]
display(df2)





Unnamed: 0,Chromosome,Position,Ref Allele,Alt Allele_AS218,Allele Frequency_AS218,Alt Allele_AS219,Allele Frequency_AS219,Alt Allele_AS222,Allele Frequency_AS222,Alt Allele_AS223,...,Alt Allele_AS232,Allele Frequency_AS232,Alt Allele_AS233,Allele Frequency_AS233,Alt Allele_AS236,Allele Frequency_AS236,Alt Allele_AS237,Allele Frequency_AS237,Alt Allele_AS240,Allele Frequency_AS240
0,NZ_CP020397.1,51,A,G,1.0,G,1.0,G,1.00000,G,...,G,1.0,G,1.0,G,1.0,G,1.0,G,1.000000
1,NZ_CP020397.1,126,G,A,1.0,A,1.0,A,1.00000,A,...,A,1.0,A,1.0,A,1.0,A,1.0,A,1.000000
2,NZ_CP020397.1,495,A,T,1.0,T,1.0,T,1.00000,T,...,T,1.0,T,1.0,T,1.0,T,1.0,T,1.000000
3,NZ_CP020397.1,516,C,T,1.0,T,1.0,T,0.99802,T,...,T,1.0,T,1.0,T,1.0,T,1.0,T,0.997468
4,NZ_CP020397.1,651,C,G,1.0,G,1.0,G,1.00000,G,...,G,1.0,G,1.0,G,1.0,G,1.0,G,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54871,NZ_CP020398.1,2337519,C,,,,,,,,...,,,,,,,,,G,0.042056
54872,NZ_CP020399.1,424723,C,,,,,,,,...,,,,,,,,,T,1.000000
54873,NZ_CP020399.1,433533,G,,,,,,,,...,,,,,,,,,T,0.033457
54874,NZ_CP020399.1,485999,C,,,,,,,,...,,,,,,,,,T,0.030303


In [13]:
#removes positions that have all frequencies > 0.95
df4 = df2[df2.loc[:,df2.columns.str.startswith("Allele Frequency")].min(axis=1) < 0.95]
#keep positions that fluctuate in frequency in different strains
df5 = df4[df4.loc[:,df4.columns.str.startswith("Allele Frequency")].min(axis=1) < 0.50]
df6 = df5[df5.loc[:,df5.columns.str.startswith("Allele Frequency")].max(axis=1) > 0.50]
#sort them by position
dfFinal = df6.set_index("Position")
#output to txt file 
dfFinal.to_csv("as_merged2.txt",sep="\t")
display(dfFinal)


Unnamed: 0_level_0,Chromosome,Ref Allele,Alt Allele_AS218,Allele Frequency_AS218,Alt Allele_AS219,Allele Frequency_AS219,Alt Allele_AS222,Allele Frequency_AS222,Alt Allele_AS223,Allele Frequency_AS223,...,Alt Allele_AS232,Allele Frequency_AS232,Alt Allele_AS233,Allele Frequency_AS233,Alt Allele_AS236,Allele Frequency_AS236,Alt Allele_AS237,Allele Frequency_AS237,Alt Allele_AS240,Allele Frequency_AS240
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31544,NZ_CP020397.1,T,C,1.000000,C,0.913690,,,,,...,,,,,,,,,,
49109,NZ_CP020397.1,A,T,0.118644,T,0.095406,T,0.134804,T,0.103672,...,T,0.113565,T,0.127036,T,0.104089,T,0.106452,T,1.0
49136,NZ_CP020397.1,C,T,0.135714,T,0.145390,T,0.167476,T,0.139640,...,T,0.168168,T,0.146789,T,0.155102,T,0.130699,T,1.0
49140,NZ_CP020397.1,T,C,0.135514,C,0.144876,C,0.171569,C,0.138702,...,C,0.164671,C,0.148148,C,0.145161,C,0.131661,C,1.0
49152,NZ_CP020397.1,G,C,0.119048,C,0.140152,C,0.154242,C,0.130233,...,C,0.159021,C,0.134868,C,0.138462,C,0.121311,C,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608054,NZ_CP020397.1,G,,,,,,,,,...,,,,,,,,,,
1194017,NZ_CP020398.1,G,,,,,,,,,...,,,,,,,,,,
909756,NZ_CP020397.1,A,,,,,,,,,...,G,0.333333,,,G,0.500000,,,,
1067799,NZ_CP020398.1,C,,,,,,,,,...,,,,,,,,,T,1.0
