In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read in the variant analysis results for the isolate sequencing

In [3]:

# Folder holding the filtered variant tables (one CSV per sample) of the isolate sequencing.
# The path is relative to the notebook's working directory.
data_path = Path("../../code/isolate_sequencing/results/filtered_variants")


In [10]:
# List the variant tables in the folder. Only '*.filtered.csv' is matched:
# this notebook later writes m4_genes.csv into the same folder, and a plain
# '*.csv' glob would load that gene list as if it were a variant table when
# the notebook is re-run.
csv_files = list(data_path.glob('*.filtered.csv'))
# Read each CSV file into a dictionary of DataFrames, keyed by the file stem
# (e.g. 'SucB_M6_iso.filtered')
dataframes = {file.stem: pd.read_csv(file) for file in csv_files}


In [20]:

# Short strain label used in this notebook -> key of `dataframes`
# (the stem of the corresponding variant CSV).
name_to_file = dict([
    ('sucB-M6', 'SucB_M6_iso.filtered'),
    ('aceE', 'AceE_anc.filtered'),
    ('sucB', 'SucB_anc.filtered'),
    ('aceE-M4', 'AceE_M4_iso.filtered'),
    ('sucB-M7', 'SucB_M7_iso.filtered'),
    ('aceE-M3', 'AceE_M3_iso.filtered'),
    ('all_samples', 'all_samples.filtered'),
    ('aceE-M2', 'AceE_M2_iso.filtered'),
    ('sucB-M5', 'SucB_M5_iso.filtered'),
])

# Labels of the evolved isolates, grouped by the ancestor they are compared with
aceE_evolved = [
    'aceE-M2',
    'aceE-M3',
    'aceE-M4',
]
sucB_evolved = [
    'sucB-M5',
    'sucB-M6',
    'sucB-M7',
]

## Remove the variants (SNPs and indels) that are also present in the ancestors

In [27]:
def drop_ancestral_variants(ancestor_positions, isolates):
    """Drop from each evolved isolate the variants already seen in its ancestor.

    Parameters
    ----------
    ancestor_positions : array-like
        Values of the ancestor's `pos` column.
    isolates : list of str
        Keys of `name_to_file` naming the evolved isolates to filter.

    A row is removed when its `pos` value occurs in `ancestor_positions`;
    only the position is compared, not the alternative allele. Each removal
    is printed. The filtered table replaces the isolate's entry in
    `dataframes`, so running the cell again removes nothing further.
    """
    for isolate in isolates:
        key = name_to_file[isolate]
        isolate_df = dataframes[key]
        # Vectorised membership test instead of a row-by-row iterrows() scan
        is_ancestral = isolate_df['pos'].isin(ancestor_positions)
        for i, pos in isolate_df.loc[is_ancestral, 'pos'].items():
            print(f"Removing {isolate} row {i} with pos {pos}")
        # Original index labels are kept, as with DataFrame.drop
        dataframes[key] = isolate_df.loc[~is_ancestral]


# aceE
aceE_df = dataframes[name_to_file['aceE']]
aceE_locations = aceE_df['pos'].unique()
drop_ancestral_variants(aceE_locations, aceE_evolved)

# sucB
sucB_df = dataframes[name_to_file['sucB']]
sucB_locations = sucB_df['pos'].unique()
drop_ancestral_variants(sucB_locations, sucB_evolved)

    

Removing sucB-M5 row 0 with pos 447145
Removing sucB-M5 row 2 with pos 2719426
Removing sucB-M6 row 0 with pos 447145
Removing sucB-M6 row 1 with pos 2719426
Removing sucB-M7 row 0 with pos 447145
Removing sucB-M7 row 3 with pos 2719426


In [65]:
# Export the unique gene names listed in the aceE-M4 isolate table
m4_key = name_to_file['aceE-M4']
ace_M4_df = dataframes[m4_key]
print(ace_M4_df.shape)

m4_genes = ace_M4_df['gene'].unique()
m4_genes_series = pd.Series(m4_genes)
m4_genes_series.to_csv(data_path / 'm4_genes.csv', index=False)

(267, 15)


In [70]:
# Show the variant table currently stored for the sucB-M6 isolate
sucB_M6_key = name_to_file['sucB-M6']
dataframes[sucB_M6_key]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
2,CP009273,3081560,6450.74,186,1.0,"Substitution(type_='SNV', value='A')",186,C,snp,1,,,,CP009273.3081560.SucB_M6_iso,SucB_M6_iso
3,CP009273,3782776,6329.78,179,1.0,"Substitution(type_='SNV', value='A')",179,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M6_iso,SucB_M6_iso
4,CP009273,4209467,7773.93,220,1.0,"Substitution(type_='SNV', value='A')",220,C,snp,1,missense_variant,aceK,isocitrate dehydrogenase kinase/phosphatase,CP009273.4209467.SucB_M6_iso,SucB_M6_iso
5,CP009273,4538904,7974.41,235,0.995745,"Substitution(type_='SNV', value='T')",234,G,snp,1,missense_variant,fimH,minor component of type 1 fimbriae,CP009273.4538904.SucB_M6_iso,SucB_M6_iso


In [69]:
# Show the variant table currently stored for the sucB-M7 isolate
sucB_M7_key = name_to_file['sucB-M7']
dataframes[sucB_M7_key]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
1,CP009273,752345,6971.83,198,1.0,"Substitution(type_='SNV', value='T')",198,C,snp,1,missense_variant,sdhA,"succinate dehydrogenase, flavoprotein subunit",CP009273.752345.SucB_M7_iso,SucB_M7_iso
2,CP009273,2234612,3502.85,112,1.0,"Substitution(type_='INDEL', value='AG')",112,ATACGTTGATG,del,9,disruptive_inframe_deletion,galS,galactose- and fucose-inducible galactose regu...,CP009273.2234612.SucB_M7_iso,SucB_M7_iso
4,CP009273,3782776,8405.95,238,1.0,"Substitution(type_='SNV', value='A')",238,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M7_iso,SucB_M7_iso
5,CP009273,4209467,7904.26,224,1.0,"Substitution(type_='SNV', value='A')",224,C,snp,1,missense_variant,aceK,isocitrate dehydrogenase kinase/phosphatase,CP009273.4209467.SucB_M7_iso,SucB_M7_iso


# Read in the variants from the meta sequencing

In [71]:
# Folder holding the filtered variant tables of the meta sequencing
# (path relative to the notebook's working directory).
meta_folder = Path('../../code/meta_sequencing/results/fixed_filtered_variants')

In [76]:
# List all CSV files in the folder
# List the variant tables in the folder. Only '*.filtered.csv' is matched:
# this notebook later writes sucB_genes.csv and aceE_genes.csv into the same
# folder, and a plain '*.csv' glob would load those gene lists as if they
# were variant tables when the notebook is re-run.
meta_csv_files = list(meta_folder.glob('*.filtered.csv'))
# Read each CSV file into a dictionary of DataFrames, keyed by the file stem
# (e.g. 'SucB_M6.filtered')
meta_dataframes = {file.stem: pd.read_csv(file) for file in meta_csv_files}


In [78]:
# List the keys (file stems) under which the meta-sequencing tables were loaded
meta_dataframes.keys()

dict_keys(['SucB_M6.filtered', 'AceE_M4_D44.filtered', 'SucB_M7.filtered', 'AceE_M3_D44.filtered', 'SucB_M5.filtered', 'all_samples.filtered', 'AceE_M2_D44.filtered'])

In [80]:
# Table loaded from the file with stem 'all_samples.filtered'
# (presumably the variants of all samples combined -- confirm against the pipeline)
all_meta_df = meta_dataframes['all_samples.filtered']

In [90]:
# Boolean mask: True where the sample name contains 'AceE'
all_meta_df.loc[:, 'sample'].str.contains('AceE')

0      False
1      False
2      False
3      False
4      False
       ...  
145     True
146     True
147     True
148     True
149     True
Name: sample, Length: 150, dtype: bool

In [91]:
# Split the table into the AceE and SucB rows, based on the sample name
sample_names = all_meta_df['sample']
is_aceE_sample = sample_names.str.contains('AceE')
is_sucB_sample = sample_names.str.contains('SucB')

aceE_all = all_meta_df[is_aceE_sample]
sucB_all = all_meta_df[is_sucB_sample]

In [98]:
# Rows whose gene already occurred in an earlier row; rows without a gene
# annotation are excluded
gene_column = all_meta_df['gene']
is_repeat_gene = gene_column.duplicated()
has_gene = gene_column.notna()
all_meta_df.loc[is_repeat_gene & has_gene]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
48,CP009273,1471105,62035.6,1818,0.981848,"Substitution(type_='SNV', value='C')",1785,A,snp,1,missense_variant,ydbD,PF10971 family putative periplasmic methylglyo...,CP009273.1471105.SucB_M6,SucB_M6
132,CP009273,2234612,55390.8,1904,0.955357,"Substitution(type_='INDEL', value='AG')",1819,ATACGTTGATG,del,9,disruptive_inframe_deletion,galS,galactose- and fucose-inducible galactose regu...,CP009273.2234612.SucB_M7,SucB_M7
134,CP009273,3782776,92280.5,2659,0.987965,"Substitution(type_='SNV', value='A')",2627,C,snp,1,synonymous_variant,waaH,LPS(HepIII)-glucuronic acid glycosyltransferase,CP009273.3782776.SucB_M7,SucB_M7
135,CP009273,4209467,102664.0,3016,0.981101,"Substitution(type_='SNV', value='A')",2959,C,snp,1,missense_variant,aceK,isocitrate dehydrogenase kinase/phosphatase,CP009273.4209467.SucB_M7,SucB_M7
136,CP009273,1436272,53943.5,1737,0.937248,"Substitution(type_='SNV', value='A')",1628,G,snp,1,stop_gained,ldhA,"fermentative D-lactate dehydrogenase, NAD-depe...",CP009273.1436272.AceE_M2_D44,AceE_M2_D44
139,CP009273,2234994,88507.2,2777,0.948506,"Substitution(type_='SNV', value='A')",2634,G,snp,1,stop_gained,galS,galactose- and fucose-inducible galactose regu...,CP009273.2234994.AceE_M2_D44,AceE_M2_D44
141,CP009273,1664690,73818.5,2092,0.999522,"Substitution(type_='SNV', value='C')",2091,A,snp,1,synonymous_variant,ynfM,putative arabinose efflux transporter,CP009273.1664690.AceE_M4_D44,AceE_M4_D44
146,CP009273,1664690,89593.5,2547,0.999607,"Substitution(type_='SNV', value='C')",2546,A,snp,1,synonymous_variant,ynfM,putative arabinose efflux transporter,CP009273.1664690.AceE_M3_D44,AceE_M3_D44
148,CP009273,3945422,87250.0,3046,0.902167,"Substitution(type_='SNV', value='G')",2748,A,snp,1,synonymous_variant,ilvG,,CP009273.3945422.AceE_M3_D44,AceE_M3_D44


In [99]:
# Get a list of unique duplicated genes, i.e. genes that occur in more than
# one row of the table. (A stray no-op `sucB_all` expression that was fused
# onto this comment line has been removed.)
# Rows without a gene annotation (NaN) count as duplicates of each other,
# so NaN can appear in the result.
gene_column = all_meta_df['gene']
unique_duplicated_genes = gene_column[gene_column.duplicated()].unique()

# Display the unique duplicated genes
print(unique_duplicated_genes)

[nan 'ydbD' 'galS' 'waaH' 'aceK' 'ldhA' 'ynfM' 'ilvG']


In [108]:
# Write the unique gene names of each lineage to a CSV file in the
# meta-sequencing folder (sucB first, then aceE)
for lineage_df, out_name in ((sucB_all, 'sucB_genes.csv'), (aceE_all, 'aceE_genes.csv')):
    lineage_genes = pd.Series(lineage_df['gene'].unique())
    lineage_genes.to_csv(meta_folder / out_name, index=False)

In [106]:
# Show every variant annotated with the gene ydbD
is_ydbD = all_meta_df['gene'].eq('ydbD')
all_meta_df.loc[is_ydbD]

Unnamed: 0,chrom,pos,qual,depth,freq,alt,alt_count,ref,type,len,eff,gene,product,linegroup,sample
47,CP009273,1470490,57737.3,1705,0.980059,"Substitution(type_='SNV', value='A')",1671,C,snp,1,missense_variant,ydbD,PF10971 family putative periplasmic methylglyo...,CP009273.1470490.SucB_M6,SucB_M6
48,CP009273,1471105,62035.6,1818,0.981848,"Substitution(type_='SNV', value='C')",1785,A,snp,1,missense_variant,ydbD,PF10971 family putative periplasmic methylglyo...,CP009273.1471105.SucB_M6,SucB_M6
