In [29]:
import pandas as pd
import numpy as np
import plotly.express as px

### Add data to counts

In [69]:
# Load the sample sheet once, then carve out the per-species subsets used by the cells below.
sample_metadata = pd.read_csv("../../config/sample_metadata.tsv", sep="\t")
species_col = sample_metadata["species"]
gamb_colu_meta = sample_metadata[species_col.isin(['coluzzii', 'gambiae'])]
gamb_colu_arab_meta = sample_metadata[species_col.isin(['coluzzii', 'gambiae', 'arabiensis'])]
gamb_colu_arab_fun_meta = sample_metadata[species_col.isin(['coluzzii', 'gambiae', 'arabiensis', 'funestus'])]
fun_meta = sample_metadata[species_col.isin(['funestus'])]

In [70]:
def _read_mean_counts(path, first_sample_col):
    """Read a tab-separated count table and average the rows that share a GeneID.

    The 'Geneid' column is renamed to 'GeneID' and moved into the index; only the
    columns from position `first_sample_col` onwards (counted after GeneID has
    left the columns) are kept. Every kept column is averaged per GeneID.
    """
    table = pd.read_csv(path, sep="\t").rename(columns={'Geneid': 'GeneID'})
    samples = table.set_index("GeneID").iloc[:, first_sample_col:]
    per_column_mean = {column: 'mean' for column in samples.columns}
    return samples.groupby(level="GeneID").agg(per_column_mean)


# gamb colu
gamb_colu_df = _read_mean_counts("../../resources/gam_and_col_counts.txt", 6)
cols = gamb_colu_meta['colData'].to_list()
# Every count column must be a sample listed in the matching metadata subset.
assert gamb_colu_df.columns.isin(cols).all()

# gamb colu arab
gamb_colu_arab_df = _read_mean_counts("../../resources/gambiae_complex_counts.txt", 7)
cols = gamb_colu_arab_meta['colData'].to_list()
assert gamb_colu_arab_df.columns.isin(cols).all()

# gamb colu arab fun
gamb_colu_arab_fun_df = _read_mean_counts("../../resources/final_counts.txt", 8)
cols = gamb_colu_arab_fun_meta['colData'].to_list()
assert gamb_colu_arab_fun_df.columns.isin(cols).all()

# funestus
# Maps the prefix taken from each funestus sample column to the label used for it in this notebook.
names = dict(
    CMR='Cameroon_fun',
    FANG='Fang',
    FMZ='Fumoz',
    GHA='Ghana_fun',
    MAL='Malawi_fun',
    UGA='Uganda_fun',
)

def rename_duplicates(old):
    """Yield each item of `old` followed by its running occurrence number.

    The first time a value appears it is suffixed with 1, the second time
    with 2, and so on, so repeated labels become unique.
    """
    occurrences = {}
    for label in old:
        occurrences[label] = occurrences.get(label, 0) + 1
        yield "%s%d" % (label, occurrences[label])
            
funestus_raw = pd.read_csv("../../resources/funestus.counts.tsv", sep="\t")
# Key for `names`: the first four characters of every column after the first, with any "-" removed.
fun_names = funestus_raw.columns[1:].str[:4].str.replace("-", "").to_list()
fun_df = funestus_raw.rename(columns={'Geneid':'GeneID'}).set_index("GeneID")
# Replace the sample columns with numbered population labels (e.g. repeated labels get 1, 2, ...).
fun_df.columns = list(rename_duplicates(names[prefix] for prefix in fun_names))

### Add Bouake and Busia

In [71]:
bouake_counts = pd.read_csv("../../../rna-seq-bouake/results/counts/rawcounts.tsv", sep="\t")

# Column renames are applied one after another, in this order.
bouake_renames = [
    ("coluzziiCont", "Bouake_colu_unexp"),
    ("gambiaeCont", "Bouake_gamb_unexp"),
    ("Kisumu", "Bouake_Kisumu"),
    ("Ngousso", "Bouake_Ngousso"),
]
bouake_labels = bouake_counts.columns
for old_label, new_label in bouake_renames:
    bouake_labels = bouake_labels.str.replace(old_label, new_label)
bouake_counts.columns = bouake_labels

# Drop every column whose name contains "PM".
bouake_counts = bouake_counts.loc[:, ~bouake_counts.columns.str.contains("PM", regex=False)]
bouake_counts.head(3)

Unnamed: 0,GeneID,Bouake_Kisumu1,Bouake_Kisumu2,Bouake_Kisumu3,Bouake_Kisumu4,Bouake_colu_unexp1,Bouake_colu_unexp2,Bouake_colu_unexp3,Bouake_colu_unexp4,Bouake_Ngousso1,Bouake_Ngousso2,Bouake_Ngousso3,Bouake_Ngousso4,Bouake_gamb_unexp1,Bouake_gamb_unexp2,Bouake_gamb_unexp3,Bouake_gamb_unexp4
0,AGAP000002,5686,5634,7127,5494,5470,3016,6043,4496,5441,5123,4811,3718,4628,6639,3164,5128
1,AGAP000005,10900,10540,13624,10039,8295,3185,8557,7962,10638,9715,8562,6951,5700,8411,3102,7615
2,AGAP000007,4195,4428,4815,3950,3660,1844,3639,4189,3450,3764,3390,2729,3204,4084,2350,4416


In [72]:
busia_counts = pd.read_csv("../../../rna-seq-busia/results/counts/rawcounts.tsv", sep="\t")

# Strip the "G24-" / "G28-" markers from the column names.
busia_labels = busia_counts.columns
for prefix in ("G24-", "G28-"):
    busia_labels = busia_labels.str.replace(prefix, "")
busia_counts.columns = busia_labels

# Drop every column whose name contains "Parental".
busia_counts = busia_counts.loc[:, ~busia_counts.columns.str.contains("Parental", regex=False)]
busia_counts.head(3)

Unnamed: 0,GeneID,BusiaSurvivors1,BusiaSurvivors2,BusiaSurvivors3,BusiaSurvivors4,BusiaSurvivors5,BusiaSurvivors6,Kisumu1,Kisumu2,Kisumu3,Kisumu4
0,AGAP000002,5454,5744,4947,5720,6411,5964,4001,3905,3712,4173
1,AGAP000005,9109,10237,8409,9858,12172,10570,5423,5431,4938,5914
2,AGAP000007,4221,4312,3682,4035,4149,3966,3693,3443,3418,3374


#### Let's merge these counts

In [73]:
def _add_bouake_busia(counts_by_gene):
    """Join the Bouake and then the Busia counts onto a GeneID-indexed count table.

    Both merges use pandas' defaults: an inner join on whatever column labels
    the two frames have in common (no `on=` is given).
    """
    combined = counts_by_gene.reset_index()
    for extra_counts in (bouake_counts, busia_counts):
        combined = combined.merge(extra_counts)
    return combined.set_index("GeneID")


gamb_colu_df = _add_bouake_busia(gamb_colu_df)
print(gamb_colu_df.shape)

gamb_colu_arab_df = _add_bouake_busia(gamb_colu_arab_df)
print(gamb_colu_arab_df.shape)

gamb_colu_arab_fun_df = _add_bouake_busia(gamb_colu_arab_fun_df)
print(gamb_colu_arab_fun_df.shape)

(10270, 124)
(8651, 152)
(8599, 198)


In [75]:
### reordering columns
# Each count table gets its sample columns in the row order of its metadata subset;
# a sample listed in the metadata but missing from the table raises a KeyError.
gamb_colu_order = gamb_colu_meta['colData'].to_list()
gamb_colu_df = gamb_colu_df[gamb_colu_order]

gamb_colu_arab_order = gamb_colu_arab_meta['colData'].to_list()
gamb_colu_arab_df = gamb_colu_arab_df[gamb_colu_arab_order]

gamb_colu_arab_fun_order = gamb_colu_arab_fun_meta['colData'].to_list()
gamb_colu_arab_fun_df = gamb_colu_arab_fun_df[gamb_colu_arab_fun_order]

fun_order = fun_meta['colData'].to_list()
fun_df = fun_df[fun_order]

### Write counts to file

In [79]:
# One entry per count table: (frame, raw-count path, log2-count path).
# Order: gamb colu, gamb colu arab, gamb colu arab funestus, funestus.
count_outputs = [
    (gamb_colu_df,
     "../../results/gamb_colu.counts.tsv",
     "../../results/gamb_colu.log2counts.tsv"),
    (gamb_colu_arab_df,
     "../../results/gamb_colu_arab.counts.tsv",
     "../../results/gamb_colu_arab.log2counts.tsv"),
    (gamb_colu_arab_fun_df,
     "../../results/gamb_colu_arab_fun.counts.tsv",
     "../../results/gamb_colu_arab_fun.log2counts.tsv"),
    (fun_df,
     "../../results/fun.counts.tsv",
     "../../results/fun.log2counts.tsv"),
]

for count_frame, raw_path, log2_path in count_outputs:
    count_frame.to_csv(raw_path, sep="\t")
    # log2(count + 1), rounded to one decimal place.
    np.log2(count_frame + 1).round(1).to_csv(log2_path, sep="\t")

#### Now we must update col_data

In [29]:
# Load the full sample sheet and keep only the columns used in the rest of the notebook.
metadata = pd.read_csv("../../config/ALLcoldata.txt", sep="\t")
metadata = metadata[['colData', 'condition', 'resistance', 'species', 'batch']]

# Expand the abbreviated strain names inside `condition`.
# "Kis" is only expanded when it is not already followed by "umu"; a plain substring
# replace would turn an existing "Kisumu" into "Kisumuumu".
# `assign` builds a new frame instead of writing into the column-subset taken above.
metadata = metadata.assign(
    condition=(
        metadata['condition']
        .str.replace("NG", "Ngousso", regex=False)
        .str.replace(r"Kis(?!umu)", "Kisumu", regex=True)
    )
)

In [30]:
# One metadata row per Bouake sample column (everything after the first column of bouake_counts).
bouake_df = pd.DataFrame(bouake_counts.iloc[:, 1:].columns, columns=['colData'])

# condition = sample name without its trailing replicate digits, with the two reference
# strains collapsed to "Kisumu" / "Ngousso". The optional "(?:sumu)" / "(?:ousso)" groups
# consume the rest of the strain name when the column is spelled out in full
# ("Bouake_Kisumu", "Bouake_Ngousso"); replacing only "Bouake_Ki" / "Bouake_Ng" would
# leave "Kisumusumu" / "Ngoussoousso".
bouake_df['condition'] = (
    bouake_df['colData']
    .str.rstrip('0123456789')
    .str.replace(r"Bouake_Ki(?:sumu)?", "Kisumu", regex=True)
    .str.replace(r"Bouake_Ng(?:ousso)?", "Ngousso", regex=True)
)

# The labels below are positional: four consecutive columns per group, 16 columns in total.
bouake_df['resistance'] = np.repeat(["susceptible", "resistant", "susceptible", "resistant"], 4)
# NOTE(review): every Bouake sample, including the "colu" columns, is labelled "gambiae" — confirm this is intended.
bouake_df['species'] = np.repeat("gambiae", 16)
bouake_df['batch'] = np.repeat([10, 11, 10], [4, 8, 4])

In [31]:
# One metadata row per Busia sample column (everything after the first column of busia_counts).
busia_samples = busia_counts.iloc[:, 1:].columns
busia_df = pd.DataFrame(busia_samples, columns=['colData'])
# condition = sample name without its trailing replicate digits.
busia_df['condition'] = busia_df['colData'].str.rstrip('0123456789')
# Positional labels: the first six columns are resistant, the last four susceptible.
busia_df['resistance'] = ["resistant"] * 6 + ["susceptible"] * 4
busia_df['species'] = ["gambiae"] * 10
busia_df['batch'] = 12

In [33]:
# Append the Bouake and Busia rows to the main sample sheet. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating the row labels of each input frame.
metadata = pd.concat([metadata, bouake_df, busia_df], ignore_index=True)

### Write to file

In [38]:
# NOTE: comparison_metadata.tsv is the file written further down in this notebook.
comp_df = pd.read_csv("../../config/comparison_metadata.tsv", sep="\t")

# Long format: one row per (comparison, resistance label) holding that side's condition name.
comps_df_merge = comp_df.melt(
    id_vars=['comparison', 'species', 'country'],
    value_vars=['resistant', 'susceptible'],
    var_name='resistance',
    value_name='condition',
)

# Country to use for a susceptible condition, keyed by the condition name.
replace_di = {
    'NG': 'Cameroon',
    'Ngousso': 'Cameroon',
    'Moz': 'Mozambique',
    'BanSu': 'Burkina Faso',
    'Kisumu': 'Kenya',
    'Ki': 'Kenya',
    'Kis': 'Kenya',
    "Fang": "Angola",
    "Dongola": "Sudan",
}

resistance_label = comps_df_merge['resistance']
comps_res = comps_df_merge[resistance_label == 'resistant']
comps_sus = comps_df_merge[resistance_label == 'susceptible']
# For susceptible rows the country is derived from the condition name itself;
# names without an entry in replace_di are left as they are.
comps_sus = (
    comps_sus
    .assign(country=comps_sus['condition'])
    .replace({"country": replace_di})
)

In [39]:
# Stack the resistant rows on top of the susceptible rows (row labels of both inputs are kept).
comps_merge_df = pd.concat([comps_res, comps_sus], axis=0)

In [40]:
# Attach a country to every sample via the distinct (condition, country) pairs.
# Left join: samples without a matching pair are kept and get a missing country.
condition_country = comps_merge_df[['condition', 'country']].drop_duplicates()
metadata = metadata.merge(condition_country, how='left')

In [31]:
# Let pandas display up to 500 rows before truncating.
pd.options.display.max_rows = 500

In [42]:
# NOTE(review): `counts` is not assigned in any cell visible in this notebook — confirm where
# it comes from; on a fresh kernel this line would raise NameError.
counts.to_csv("../../results/final_raw_counts.tsv", sep="\t", index=None)
# Overwrites the sample sheet that other cells of this notebook load with pd.read_csv.
metadata.to_csv("../../config/sample_metadata.tsv", sep="\t", index=None)

### Create metadata

In [36]:
def load_species(df, meta):
    """Look up the species of every resistant condition listed in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'resistant' column of condition names.
    meta : pandas.DataFrame
        Must have 'condition' and 'species' columns.

    Returns
    -------
    numpy.ndarray
        The distinct species of each condition, concatenated in the order of
        `df['resistant']`. A condition contributes one entry per distinct
        species it has in `meta` (none if it is absent from `meta`), so the
        result lines up row-for-row with `df` only when every condition maps
        to exactly one species. An empty array is returned for an empty `df`.
    """
    # A boolean mask instead of a formatted query string: condition names that contain
    # quotes cannot break the lookup. Collecting first and concatenating once also
    # avoids re-copying the growing array on every iteration.
    per_condition = [
        meta.loc[meta['condition'] == condition, 'species'].unique()
        for condition in df['resistant']
    ]
    if not per_condition:
        return np.array([])
    return np.concatenate(per_condition)

In [32]:
# Reload the sample sheet and reduce it to the distinct (condition, species) pairs.
metadata = pd.read_csv("../../config/sample_metadata.tsv", sep="\t")
meta = metadata.loc[:, ['condition', 'species']].drop_duplicates()

In [33]:
# Fold-change table: keep everything except the first column and the last two.
fc_data = pd.read_csv("../../results/fc_data.tsv", sep="\t").iloc[:, 1:-2]

# One row per remaining column, named after the column minus its "_log2FoldChange" suffix.
comparison_names = fc_data.columns.str.replace("_log2FoldChange", "")
comp_df = pd.DataFrame(comparison_names, columns=['comparison'])

# Split each name on "_v_": the first piece is the resistant side, the last the susceptible side.
sides = comp_df['comparison'].str.split("_v_")
comp_df['resistant'] = sides.str.get(0)
comp_df['susceptible'] = sides.str.get(-1)
comp_df['species'] = load_species(comp_df, meta)

In [40]:
# Keep only comparisons whose resistant label contains neither "_exp" nor "Parental".
resistant_label = comp_df['resistant']
keep_row = ~resistant_label.str.contains('_exp') & ~resistant_label.str.contains('Parental')
comp_df = comp_df[keep_row]

In [41]:
# Countries are assigned by row position (32 entries), so this list has to match the
# current row order of comp_df exactly.
comp_df.loc[:, 'country'] = (
    ['Burkina Faso'] * 6
    + ['Cameroon', 'Chad', 'Niger', 'Nigeria']
    + ["Cote d'Ivoire"] * 6
    + ['Cameroon', 'Mozambique', 'Ghana', 'Malawi', 'Uganda']
    + ['Ethiopia'] * 4
    + ["Cote d'Ivoire"] * 6
    + ["Uganda"]
)

In [43]:
# Write the comparison table as a tab-separated file without the index column.
comp_df.to_csv(
    "../../config/comparison_metadata.tsv",
    sep="\t",
    index=False,
)