In [4]:
import pandas as pd
import numpy as np
import plotly.express as px

### Add data to counts

In [24]:
# Load the existing combined count table (tab-separated) and show its dimensions.
counts = pd.read_table("../../resources/final_counts.txt")
counts.shape

(16387, 181)

In [25]:
# Load the Bouake raw counts and rename the sample columns.
bouake_counts = pd.read_csv("../../../rna-seq-bouake/results/quant/rawcounts.tsv", sep="\t")
# The "Anogam_" prefix becomes "Bouake_" (no leading space: a stray blank at the
# start of a column name would propagate into colData and the derived condition
# labels, which must match "Kisumu"/"Ngousso" exactly). The two-letter codes are
# then expanded; order matters only in that each pattern is applied in sequence.
bouake_counts.columns = (
    bouake_counts.columns
    .str.replace("Anogam_", "Bouake_")
    .str.replace("MC", "col_unexp")
    .str.replace("MS", "col_exp")
    .str.replace("SC", "gamb_unexp")
    .str.replace("SS", "gamb_exp")
)
# Drop every column whose name contains "_exp" ("_unexp" columns are kept,
# because "_unexp" does not contain the substring "_exp").
bouake_counts = bouake_counts.loc[:, ~bouake_counts.columns.str.contains("_exp", regex=False)]
bouake_counts.head(3)

Unnamed: 0,GeneID,Bouake_Ki11,Bouake_Ki2,Bouake_Ki5,Bouake_Ki7,Bouake_col_unexp6,Bouake_col_unexp7,Bouake_col_unexp8,Bouake_col_unexp9,Bouake_Ng1,Bouake_Ng11,Bouake_Ng4,Bouake_Ng7,Bouake_gamb_unexp10,Bouake_gamb_unexp6,Bouake_gamb_unexp7,Bouake_gamb_unexp9
0,AGAP000002,5686,5634,7127,5494,5470,3016,6043,4496,5441,5123,4811,3718,4628,6639,3164,5128
1,AGAP000005,10900,10540,13624,10039,8295,3185,8557,7962,10638,9715,8562,6951,5700,8411,3102,7615
2,AGAP000007,4195,4428,4815,3950,3660,1844,3639,4189,3450,3764,3390,2729,3204,4084,2350,4416


In [26]:
# Load the Busia raw counts, strip the "G24-"/"G28-" prefixes from the column
# names, and drop every column whose name mentions "Parental".
busia_counts = (
    pd.read_csv("../../../rna-seq-busia/results/counts/rawcounts.tsv", sep="\t")
    .rename(columns=lambda name: name.replace("G24-", "").replace("G28-", ""))
    .loc[:, lambda frame: ~frame.columns.str.contains("Parental")]
)
busia_counts.head(3)

Unnamed: 0,GeneID,BusiaSurvivors1,BusiaSurvivors2,BusiaSurvivors3,BusiaSurvivors4,BusiaSurvivors5,BusiaSurvivors6,Kisumu1,Kisumu2,Kisumu3,Kisumu4
0,AGAP000002,5454,5744,4947,5720,6411,5964,4001,3905,3712,4173
1,AGAP000005,9109,10237,8409,9858,12172,10570,5423,5431,4938,5914
2,AGAP000007,4221,4312,3682,4035,4149,3966,3693,3443,3418,3374


In [27]:
# Load the Aboisso/Tiassale raw counts.
abo_counts = pd.read_csv("../../../rna-seq-abo-tiass/results/quant/rawcounts.tsv", sep="\t")
genes = abo_counts['GeneID']
# remove aboisso as contaminated: keep GeneID plus the columns from position 13 onwards
# NOTE(review): presumably positions 1-12 are the Aboisso samples — confirm against the file header.
abo_counts = pd.concat([genes, abo_counts.iloc[:, 13:]], axis=1)

# Reuse the Bouake "Ng" samples for this batch, under the name "Ngousso".
ng_counts = bouake_counts.filter(like="Ng")
ng_counts.columns = ng_counts.columns.str.replace("Bouake_Ng", "Ngousso")

# Attach them by GeneID rather than by row position: a positional concat silently
# pairs counts with the wrong gene if the two tables ever differ in row order or length.
abo_counts = abo_counts.merge(
    pd.concat([bouake_counts['GeneID'], ng_counts], axis=1),
    on="GeneID",
    how="inner",
)
abo_counts.head(2)

Unnamed: 0,GeneID,ContTia1,ContTia2,ContTia4,DeltTia1,DeltTia2,DeltTia5,MalaTia1,MalaTia2,MalaTia4,PiriTia2,PiriTia3,PiriTia4,Ngousso1,Ngousso11,Ngousso4,Ngousso7
0,AGAP000002,482,827,785,819,776,981,767,907,946,481,575,813,5441,5123,4811,3718
1,AGAP000005,1130,1688,1672,1594,1717,1812,1671,1856,1825,842,1135,1296,10638,9715,8562,6951


#### Let's merge these counts

In [28]:
# Append the three new batches to the combined table. No `on=` is given, so each
# merge is an inner join on whatever column names the two frames share
# (presumably only GeneID — verify if the column count looks wrong).
counts = (
    counts
    .merge(bouake_counts)
    .merge(abo_counts)
    .merge(busia_counts)
)
counts.shape

(16387, 223)

#### Now we must update col_data

In [29]:
# Load the existing sample sheet and keep only the columns used downstream.
metadata = pd.read_csv("../../config/ALLcoldata.txt", sep="\t")
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of the frame returned by read_csv (SettingWithCopyWarning).
metadata = metadata[['colData', 'condition', 'resistance', 'species', 'batch']].copy()
# Expand the abbreviated labels "NG" -> "Ngousso" and "Kis" -> "Kisumu".
# The negative lookahead leaves labels that already read "Kisumu" untouched;
# a plain substring replace would turn them into "Kisumuumu".
metadata.loc[:, 'condition'] = (
    metadata.loc[:, 'condition']
    .str.replace("NG", "Ngousso", regex=False)
    .str.replace(r"Kis(?!umu)", "Kisumu", regex=True)
)

In [30]:
# Sample sheet for the Bouake batch: one row per count column (GeneID skipped).
# The resistance and batch vectors are positional and rely on the column order
# of bouake_counts: 4 Ki, 4 col_unexp, 4 Ng, 4 gamb_unexp.
# NOTE(review): every sample, including the col_unexp ones, is labelled
# "gambiae" — confirm that is intended.
bouake_df = pd.DataFrame({'colData': bouake_counts.columns[1:]}).assign(
    # condition = sample name without its trailing replicate digits, with the
    # Ki/Ng prefixes spelled out as Kisumu/Ngousso
    condition=lambda samples: (
        samples['colData']
        .str.rstrip('0123456789')
        .str.replace("Bouake_Ki", "Kisumu")
        .str.replace("Bouake_Ng", "Ngousso")
    ),
    resistance=np.repeat(["susceptible", "resistant", "susceptible", "resistant"], 4),
    species=np.full(16, "gambiae"),
    batch=np.repeat([10, 11, 10], [4, 8, 4]),
)

In [31]:
# Sample sheet for the Busia batch: one row per count column (GeneID skipped).
# The resistance vector is positional: the first 6 columns are marked resistant,
# the last 4 susceptible.
busia_df = pd.DataFrame({'colData': busia_counts.columns[1:]}).assign(
    # condition = sample name without its trailing replicate digits
    condition=lambda samples: samples['colData'].str.rstrip('0123456789'),
    resistance=np.repeat(["resistant", "susceptible"], [6, 4]),
    species=np.full(10, "gambiae"),
    batch=12,
)

In [32]:
# Sample sheet for the Tiassale batch: one row per count column (GeneID skipped).
# The resistance vector is positional: the first 12 columns are marked resistant,
# the last 4 (the Ngousso columns appended to abo_counts) susceptible.
abo_df = pd.DataFrame({'colData': abo_counts.columns[1:]}).assign(
    # condition = sample name without its trailing replicate digits
    condition=lambda samples: samples['colData'].str.rstrip('0123456789'),
    resistance=np.repeat(["resistant", "susceptible"], [12, 4]),
    species=np.full(16, "gambiae"),
    batch=13,
)

In [33]:
# Append the three new sample sheets to the existing one. ignore_index=True
# renumbers the rows 0..n-1; without it each frame keeps its own 0-based labels,
# leaving duplicate index values that make label-based lookups ambiguous.
metadata = pd.concat([metadata, bouake_df, abo_df, busia_df], ignore_index=True)

### Write to file

In [38]:
# NOTE: comparison_metadata.tsv is written by the "Create metadata" section
# further down this notebook, so that section has to have been run at least once.
comp_df = pd.read_csv("../../config/comparison_metadata.tsv", sep="\t")

# Long format: one row per (comparison, side), where `resistance` says which
# side of the comparison the `condition` came from.
comps_df_merge = pd.melt(
    comp_df,
    id_vars=['comparison', 'species', 'country'],
    value_vars=['resistant', 'susceptible'],
    var_name='resistance',
    value_name='condition',
)

# Country to record for each susceptible-side label.
replace_di = {
    'NG': 'Cameroon',
    'Ngousso': 'Cameroon',
    'Moz': 'Mozambique',
    'BanSu': 'Burkina Faso',
    'Kisumu': 'Kenya',
    'Ki': 'Kenya',
    'Kis': 'Kenya',
    "Fang": "Angola",
    "Dongola": "Sudan",
}

# Resistant rows keep the comparison's own country. Susceptible rows get the
# country looked up from their condition label; labels missing from replace_di
# are left as the label itself.
comps_res = comps_df_merge[comps_df_merge['resistance'] == 'resistant']
comps_sus = comps_df_merge[comps_df_merge['resistance'] == 'susceptible']
comps_sus = comps_sus.assign(country=comps_sus['condition'].replace(replace_di))

In [39]:
# Stack the resistant-side and susceptible-side rows back into one table.
comps_merge_df = pd.concat((comps_res, comps_sus), axis=0)

In [40]:
# Attach a country to every sample via its condition (left join keeps samples
# whose condition has no country).
# NOTE(review): if one condition maps to more than one country in
# comps_merge_df, the join duplicates that condition's samples — worth checking.
metadata = pd.merge(
    metadata,
    comps_merge_df.loc[:, ['condition', 'country']].drop_duplicates(),
    how='left',
)

In [31]:
# Let pandas display up to 500 rows before truncating output.
pd.options.display.max_rows = 500

In [42]:
# Write the merged count matrix (gzip-compressed) and the sample metadata as
# tab-separated files without the row index. index=False is the documented
# boolean form and matches how comparison_metadata.tsv is written elsewhere in
# this notebook.
counts.to_csv("../../results/final_raw_counts.tsv.gz", sep="\t", index=False, compression='gzip')
metadata.to_csv("../../config/sample_metadata.tsv", sep="\t", index=False)

### Create metadata

In [36]:
def load_species(df, meta):
    """Look up the species of each comparison's resistant condition.

    Parameters
    ----------
    df : DataFrame with a ``resistant`` column of condition labels.
    meta : DataFrame with ``condition`` and ``species`` columns.

    Returns
    -------
    numpy.ndarray (object dtype) with exactly one entry per row of ``df``, in
    row order. A condition that is absent from ``meta`` yields ``np.nan``.

    Raises
    ------
    ValueError
        If a condition is associated with more than one species in ``meta``.
    """
    # Build the condition -> unique species lookup once. Matching is done on the
    # raw label, so conditions containing quotes (e.g. "Cote d'Ivoire") work.
    species_by_condition = meta.groupby("condition")["species"].unique().to_dict()

    spp = []
    for c in df["resistant"]:
        found = species_by_condition.get(c, ())
        if len(found) > 1:
            # Appending several species for one row would shift every later
            # entry onto the wrong comparison, so fail loudly instead.
            raise ValueError(
                f"condition {c!r} maps to multiple species: {list(found)}"
            )
        # Always emit exactly one value per row so the result lines up with df.
        spp.append(found[0] if len(found) == 1 else np.nan)
    return np.array(spp, dtype=object)

In [32]:
# Reload the sample sheet written above and reduce it to the distinct
# (condition, species) pairs.
metadata = pd.read_table("../../config/sample_metadata.tsv")
meta = metadata.loc[:, ['condition', 'species']].drop_duplicates()

In [33]:
# Load the fold-change table, dropping its first column and its last two columns.
fc_data = pd.read_csv("../../results/fc_data.tsv", sep="\t")
fc_data = fc_data.iloc[:, 1:-2]

# One row per comparison: the remaining column names with the
# "_log2FoldChange" suffix removed.
comp_df = pd.DataFrame({'comparison': fc_data.columns.str.replace("_log2FoldChange", "")})

# A comparison is named "<resistant>_v_<susceptible>": the first piece is the
# resistant condition, the last piece the susceptible one.
comp_df = comp_df.assign(
    resistant=lambda comps: comps['comparison'].str.split("_v_").str.get(0),
    susceptible=lambda comps: comps['comparison'].str.split("_v_").str.get(-1),
)
# Species is looked up from the resistant condition of each comparison.
comp_df.loc[:, 'species'] = load_species(comp_df, meta)

In [40]:
# Drop comparisons whose resistant condition contains "_exp" or "Parental".
# .copy() makes the result an independent frame, so adding the `country` column
# afterwards does not trigger SettingWithCopyWarning.
comp_df = comp_df[~comp_df['resistant'].str.contains('_exp|Parental')].copy()

In [41]:
# Country of each comparison, listed by position: the 32 entries must match the
# rows of comp_df in order.
comp_df.loc[:, 'country'] = (
    ['Burkina Faso'] * 6
    + ['Cameroon', 'Chad', 'Niger', 'Nigeria']
    + ["Cote d'Ivoire"] * 6
    + ['Cameroon', 'Mozambique', 'Ghana', 'Malawi', 'Uganda']
    + ['Ethiopia'] * 4
    + ["Cote d'Ivoire"] * 6
    + ["Uganda"]
)

In [43]:
# Save the comparison table as a tab-separated file without the row index.
comp_df.to_csv(path_or_buf="../../config/comparison_metadata.tsv", index=False, sep="\t")

### Get latitude and longitude of locations

### full sample metadata