Steps to process samples through Freyja, then edit the aggregated results and plot them

In [None]:
#Freyja processing step 1 - one per sample
#
# Runs `freyja demix` once for every entry of the list file given as $1.
# Each entry is a path prefix: "<entry>_trimmed_union.vcf" is passed as the
# variants file and "depth/<prefix>.depth" as the depth file, where <prefix>
# is derived from the entry as described in the loop below.

#freyja update - run this to update the Usher tree for freyja
filename=$1
wd=$(pwd)

if [ -z "$filename" ] || [ ! -r "$filename" ]; then
    echo "usage: $0 <file listing one sample path prefix per line>" >&2
    exit 1
fi

# NOTE(review): wd and refs are not referenced anywhere else in this cell.
refs="/gpfs/gsfs12/users/Irp-jiang/share/covid_data/WWBE/info/"

# freyja writes into this directory; create it so the first sample does not fail.
mkdir -p freyja_output

# Read the list line by line instead of word-splitting $(cat ...), so entries are
# not glob-expanded. The list is opened on fd 3 so that commands inside the loop
# cannot consume it through stdin.
while read -r line <&3 || [ -n "$line" ]; do
    # skip blank lines
    [ -z "$line" ] && continue
    # The sample prefix is the part before the first "_" of the 12th "/"-separated
    # field of the entry; this depends on the directory depth of the listed paths.
    prefix=$(echo "$line" | cut -d "/" -f 12 | cut -d "_" -f 1)
    freyja demix "${line}_trimmed_union.vcf" "depth/${prefix}.depth" --output "freyja_output/${prefix}_freyja"
done 3< "$filename"

In [None]:
# Freyja processing step 2 - aggregating the samples
# Combines the per-sample outputs written to freyja_output/ by step 1 into one file.
# NOTE(review): "[aggregated-filename.tsv]" looks like a placeholder - replace it
# (including the brackets) with the real output file name before running.

freyja aggregate freyja_output/ --output [aggregated-filename.tsv]

Python script that edits the aggregated TSV file produced by Freyja and plots the abundance of the major VOCs by location

In [None]:
from operator import index
import matplotlib.pyplot as plt
import pandas as pd
import re
import copy
import matplotlib.dates as mdates
import seaborn as sns
import datetime as dt
from matplotlib.dates import DateFormatter
import numpy as np

# Apply matplotlib's 'tableau-colorblind10' style sheet to the figures created below.
plt.style.use('tableau-colorblind10')

# Characters removed from the raw 'lineages' / 'abundances' strings before splitting.
_LIST_MARKUP = str.maketrans("", "", "'][)(\n")

# Exact lineage names counted as Delta/Omicron recombinants.
_DELTA_OMICRON_RECOMBINANTS = frozenset(["XS", "XD", "XF"])

# Exact lineage names counted as Omicron recombinants.
# The original list was missing a comma between "XP" and "XQ", which fused them
# into the single string "XPXQ" so neither lineage was ever matched. "XS" was
# also listed here but is always caught by the Delta/Omicron check first.
_OMICRON_RECOMBINANTS = frozenset([
    "XE", "XG", "XH", "XJ", "XK", "XL", "XM", "XN", "XP",
    "XQ", "XT", "XU", "XV", "XW", "XY", "XZ", "XAA", "XAB", "XAC", "XAD",
    "XAE", "XAF", "XAG", "XAH"])

# Output keys of each per-sample dictionary, in the order they are inserted.
_LINEAGE_GROUPS = (
    "Alpha",
    "Delta",
    "Others",
    "Recombinants (Delta/Omicron)",
    "Recombinants (Omicron)",
    "B.1.1.529 (Omicron)",
    "BA.1 or BA.1.* (Omicron)",
    "BA.2 or BA.2.* (Omicron)",
    "BA.3 or BA.3.* (Omicron)",
    "BA.4 or BA.4.* (Omicron)",
    "BA.5 or BA.5.* (Omicron)",
)


def _classify_lineage(lin):
    """Return the key of ``_LINEAGE_GROUPS`` that the lineage name ``lin`` is counted under."""
    if lin in _DELTA_OMICRON_RECOMBINANTS or lin.startswith("miscDelta"):
        return "Recombinants (Delta/Omicron)"
    # The original tested the prefix "misBA", which cannot match a name that
    # begins with "miscBA" (compare "miscDelta" above); both spellings are accepted.
    # NOTE(review): confirm the "miscBA" prefix against the Freyja lineage names in use.
    if lin.startswith(("miscBA", "misBA")) or lin in _OMICRON_RECOMBINANTS:
        return "Recombinants (Omicron)"
    if lin.startswith("AY.") or lin == "B.1.617.2":
        return "Delta"
    if lin == "B.1.1.7" or lin.startswith("Q."):
        return "Alpha"
    if lin == "B.1.1.529":
        return "B.1.1.529 (Omicron)"
    if lin == "BA.1" or lin.startswith(("BA.1.", "BC.", "BD.")):
        return "BA.1 or BA.1.* (Omicron)"
    if lin.startswith(("BA.2", "BJ.", "BK.", "BG.", "BH.")):
        return "BA.2 or BA.2.* (Omicron)"
    if lin.startswith("BA.3"):
        return "BA.3 or BA.3.* (Omicron)"
    if lin.startswith("BA.4"):
        return "BA.4 or BA.4.* (Omicron)"
    if lin.startswith(("BA.5", "BF.", "BE.")):
        return "BA.5 or BA.5.* (Omicron)"
    return "Others"


def prepLineageDict(agg_d0):
    """Summarise each sample's lineage abundances into fixed variant groups.

    Parameters
    ----------
    agg_d0 : pandas.DataFrame
        Must contain string columns 'lineages' and 'abundances'. Quotes,
        square brackets, parentheses and newlines are stripped from both, and
        the remainder is split on whitespace into parallel lists of lineage
        names and abundance values.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with rows whose 'lineages'
        string is empty removed, the index reset (the old index becomes a
        column), 'lineages' and 'abundances' replaced by lists of tokens, and
        a new column 'linDict' holding one dict per sample that maps each
        group label to the summed abundance of the lineages in that group.

    Raises
    ------
    ValueError
        If an abundance token cannot be converted with ``float``.
    """
    # Work on a copy: the original assigned into the caller's frame before copying.
    agg_d0 = agg_d0.copy()

    agg_d0['lineages'] = agg_d0['lineages'].map(
        lambda x: x.translate(_LIST_MARKUP).strip())
    # drop samples for which no lineage is listed
    agg_d0 = agg_d0[agg_d0['lineages'].map(len) > 0].copy()

    # str.split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty tokens reach float() below.
    agg_d0['lineages'] = agg_d0['lineages'].map(lambda x: x.split())
    agg_d0['abundances'] = agg_d0['abundances'].map(
        lambda x: x.translate(_LIST_MARKUP).split())
    agg_d0 = agg_d0.reset_index()

    lin_list = []
    for lins, abunds in zip(agg_d0['lineages'], agg_d0['abundances']):
        lin_dict = dict.fromkeys(_LINEAGE_GROUPS, 0)
        for lin, abund in zip(lins, abunds):
            abund = float(abund)
            lin_dict[_classify_lineage(lin)] += abund
        lin_list.append(lin_dict)
    agg_d0['linDict'] = lin_list
    return agg_d0

def prepSummaryDict(agg_d0):
    """Convert the 'summarized' string column into per-sample dictionaries.

    Parameters
    ----------
    agg_d0 : pandas.DataFrame
        Must contain a string column 'summarized'. Quotes, square brackets,
        parentheses and newlines are stripped and the remainder is split on
        ', ' into alternating name / value tokens.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which rows that yield
        fewer than two tokens are dropped and 'summarized' holds a dict that
        maps each name to its value converted with ``float``.

    Raises
    ------
    ValueError
        If a value token cannot be converted with ``float``.
    """
    # Work on a copy: the original assigned into the caller's frame before
    # copying, so a second call on the same frame failed on list.replace.
    agg_d0 = agg_d0.copy()

    markup = str.maketrans("", "", "'][)(\n")
    agg_d0['summarized'] = agg_d0['summarized'].map(
        lambda x: x.translate(markup).split(', '))

    # drop any samples with NO lineages identified from analysis
    agg_d0 = agg_d0[agg_d0['summarized'].map(len) > 1].copy()

    def _to_float_dict(tokens):
        # even positions are names, odd positions are their values
        paired = dict(zip(tokens[0::2], tokens[1::2]))
        return {k: float(v) for k, v in paired.items()}

    agg_d0['summarized'] = agg_d0['summarized'].map(_to_float_dict)
    return agg_d0

def makePlot_time(agg_df, lineages, interval,
                  windowSize, colors0, group_name):
    """Draw and save a stacked bar chart of weekly variant abundances.

    Parameters
    ----------
    agg_df : pandas.DataFrame
        Sample table with a 'Sampler_Date' column plus the columns needed by
        ``prepLineageDict`` (when ``lineages`` is true) or ``prepSummaryDict``.
    lineages : bool
        True to plot the 'linDict' groups, False to plot 'summarized'.
    interval : str
        pandas frequency string; abundances are first averaged per interval.
    windowSize : int
        Not used by this function; kept so existing calls keep working.
    colors0 : sequence
        Bar colours by column position. Columns without an entry (including
        the case of an empty sequence) are coloured from the tab20 colormap.
    group_name : object
        Used (via ``str``) as the plot title and as the output file suffix.

    The figure is written as PNG and PDF into 'figures_lineages_2022-08-22/'
    and shown. Nothing is plotted when no sample is left after preparation.
    """
    import os  # function-scope import: only needed to create the output directory

    cmap = plt.cm.tab20
    if lineages:
        queryType = 'linDict'
        agg_df = prepLineageDict(agg_df)
    else:
        queryType = 'summarized'
        agg_df = prepSummaryDict(agg_df)

    # Collect one record per sample and build the frame once; DataFrame.append
    # (used originally) no longer exists in pandas >= 2.0.
    records = []
    dates = []
    for dat, date in zip(agg_df[queryType], agg_df['Sampler_Date']):
        if isinstance(dat, list):
            dat = dat[0]
        records.append(dat)
        dates.append(date)
    if not records:
        print("No samples to plot for " + str(group_name))
        return

    df_abundances = pd.DataFrame(records, index=pd.to_datetime(dates)).fillna(0)
    df_abundances = df_abundances.groupby(pd.Grouper(freq=interval)).mean()

    # Shift every interval back to the Monday of its week and add up the
    # interval means that fall into the same week.
    week_start = df_abundances.index - pd.to_timedelta(df_abundances.index.weekday, unit='D')
    df_abundances.index = week_start
    df_abundances.index.name = "WeekDate"
    df_abundances = df_abundances.groupby(level=0).sum().sort_index()

    plt.style.use("tableau-colorblind10")
    fig, ax = plt.subplots(figsize=(6, 3.5))
    # The original only drew bars when colors0 was empty, so supplied colours
    # produced an empty plot; bars are now always drawn.
    for i in range(0, df_abundances.shape[1]):
        label = df_abundances.columns[i]
        color = colors0[i] if i < len(colors0) else cmap(i / 20.)
        ax.bar(df_abundances.index, df_abundances.iloc[:, i],
               width=7, bottom=df_abundances.iloc[:, 0:i].sum(axis=1), edgecolor="white",
               label=label, color=color)
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 7})
    ax.set_ylabel('Variant Prevalence')
    date_form = DateFormatter("%b-%Y")
    ax.xaxis.set_major_formatter(date_form)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_minor_locator(mdates.DayLocator(interval=5))
    # The original used the undefined local `name` here and only worked
    # through a global of that name set by the calling loop.
    ax.set_title(str(group_name), fontsize=7)
    ax.tick_params(labelsize=7)
    ax.set_xlabel('')
    ax.set_ylim([0, 1])
    ax.set_xlim([dt.date(2021, 7, 15), dt.date(2022, 6, 15)])  # date range of samples from project
    plt.setp(ax.get_xticklabels(), rotation=90)
    fig.tight_layout()

    out_dir = "figures_lineages_2022-08-22"
    os.makedirs(out_dir, exist_ok=True)  # savefig does not create missing directories
    plt.savefig(out_dir + "/time_plot_days" + str(group_name) + ".png")
    plt.savefig(out_dir + "/time_plot_days" + str(group_name) + ".pdf")
    plt.show()
    plt.close(fig)


if __name__ == '__main__':
    # Load the table, write the per-sample lineage summary to disk, then
    # draw one time plot per value of the 'Location' column.
    agg_df = pd.read_csv("all_ww_2022-08-22.tsv", skipinitialspace=True, sep='\t') #freyja result + metadata info
    agg_df["Sample_name"] = agg_df["Sample_name"].astype("str")
    agg_df["Sampler_Date"] = pd.to_datetime(agg_df["Sampler_Date"])
    agg_df = agg_df.set_index("Sample_name")

    agg_df_edited = prepLineageDict(agg_df)
    agg_df_edited.to_csv("all_ww_2022-08-22_edited_lineages.tsv", index=False, sep="\t")

    # Plot settings are the same for every location, so they are set once.
    lineages = True
    # NOTE(review): the original call omitted the `interval` argument that
    # makePlot_time requires; "D" (daily) is chosen to match the
    # "time_plot_days" output file names - confirm the intended frequency.
    interval = "D"
    windowSize = 14
    color0 = []

    # Group by a scalar key (not a one-element list) so that `name` is the
    # location value itself rather than a 1-tuple on recent pandas versions.
    for name, group in agg_df.groupby("Location"):
        # pass a copy so the plotting helpers never touch agg_df itself
        makePlot_time(group.copy(), lineages, interval, windowSize, color0, str(name))