Processing of GISAID data - downloaded sequences in fasta and metadata in tsv

In [None]:
#!/bin/bash
# Prepare a GISAID download (sequences.fasta + metadata.tsv) for the pipeline.
#
# Usage: <script> DATASET STEP
#   DATASET  directory name under covid_data/GISAID/ holding the download; the
#            same name is used for the output directory under
#            GISAID_processing_WWBE/
#   STEP     1 = rename the sequences with metadata column 3 and write
#                redo.list for records whose lookup key is duplicated
#            2 = merge redo_sequences.fasta back in and split the result
#
# Between the two steps, the sequences named in redo.list must be downloaded
# by hand and saved as redo_sequences.fasta in the 01.processing directory.

dataset=${1:?"usage: $0 DATASET STEP"}
step=${2:?"usage: $0 DATASET STEP"}

fasta="/gpfs/gsfs12/users/Irp-jiang/share/covid_data/GISAID/${dataset}/sequences.fasta"
metadata="/gpfs/gsfs12/users/Irp-jiang/share/covid_data/GISAID/${dataset}/metadata.tsv"
work_dir="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE/${dataset}/01.processing"

if [[ "$step" == "1" ]]; then
    # -p creates both directory levels and tolerates a rerun.
    mkdir -p "$work_dir" || exit 1
    # Every file below is written relative to the cwd, so a failed cd must stop here.
    cd "$work_dir" || exit 1
    module load seqtk seqkit

    # FASTA -> one tab-separated row per record, sorted on the header.
    seqkit fx2tab -j 56 "$fasta" > sequences.tsv
    sort -k1,1 sequences.tsv > sequences_sort.tsv

    # Lookup table: metadata columns 1|4|16 as key, column 3 as value,
    # header row dropped, sorted on the key.
    awk '{print $1"|"$4"|"$16"\t"$3}' FS="\t" "$metadata" \
        | sed '1d' \
        | sort -k1,1 -S 80% --parallel 56 > name.tsv

    # Keys that occur more than once, and the column-3 values they map to.
    cut -f1 name.tsv | uniq -d > name.d
    grep -F -w -f name.d name.tsv | cut -f 2 > redo.list

    # Pair the two sorted tables line by line, drop rows listed in redo.list,
    # keep (column-3 value, sequence) and convert back to FASTA.
    paste name.tsv sequences_sort.tsv \
        | grep -F -w -v -f redo.list \
        | cut -f 2,4 \
        | seqkit -j 56 tab2fx > sequences.rename.fa
fi

if [[ "$step" == "2" ]]; then
    # Download sequences from the redo.list and save as redo_sequences.fasta before running step 2
    # The rm commands below act on the cwd, so never continue after a failed cd.
    cd "$work_dir" || exit 1
    module load seqtk seqkit

    # Reduce each "a|b|c" header of the re-downloaded records to its middle field.
    seqkit replace -j 56 -p "^(.+)\|(.+)\|(.+)$" -r "\${2}" redo_sequences.fasta > sequences.add.fa

    # Do not delete the inputs further down unless the merge succeeded.
    cat sequences.rename.fa sequences.add.fa > sequences.acc.fa || exit 1

    # Names present in only one of {sequences.add.fa, redo.list}.
    seqkit fx2tab sequences.add.fa -n | cat - redo.list | sort | uniq -u > remove.list

    rm sequences.rename.fa sequences.add.fa
    rm -rf sequences.tsv name.tsv name.d sequences_sort.tsv
    seqkit split sequences.acc.fa -s 50000 -O split
fi

In [None]:
#!/bin/bash
# Set up the folders and swarm files needed to run Nextclade and Pangolin on
# every split FASTA of one processed GISAID version, then submit both swarms.
#
# Usage: <script> VERSION
#   VERSION  dataset directory (date) under GISAID_processing_WWBE/
#
# run_nextclade.sh and run_pangolin.sh are copied from the directory this
# script is launched from.

#variables
wd=$(pwd)
version=${1:?"usage: $0 VERSION"}
path="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE/"
run_dir="${path}${version}"
split_dir="${run_dir}/01.processing/split"

# Refuse to create stage directories for a version that was never processed.
if [[ ! -d "$split_dir" ]]; then
    printf 'error: %s not found\n' "$split_dir" >&2
    exit 1
fi

# stage_swarm STAGE_DIR RUNNER
# Create ${run_dir}/STAGE_DIR, copy RUNNER into it, write one
# "bash RUNNER <split-file-base> <version>" line per split file to run.swarm,
# and submit the swarm from inside that directory.
stage_swarm() {
    local stage_dir="${run_dir}/$1"
    local runner=$2
    local file base

    mkdir -p "$stage_dir" || exit 1
    cd "$stage_dir" || exit 1

    #copy the updated runner script to be used
    cp "${wd}/${runner}" "$stage_dir" || exit 1

    # One job per split file. The base name is the file name without its
    # directory and without the .fa extension (the runner re-appends ".fa").
    # run.swarm is rewritten, not appended to, so a rerun does not queue
    # every job twice.
    for file in "$split_dir"/*; do
        base=${file##*/}
        base=${base%.fa}
        printf 'bash %s %s %s\n' "$runner" "$base" "$version"
    done > run.swarm

    swarm -f run.swarm -t 32 -g 80
}

#02.Nextclade
stage_swarm 02.nextclade run_nextclade.sh

#03.Pangolin
stage_swarm 03.pangolin run_pangolin.sh

In [None]:
# run_pangolin.sh
# Run Pangolin (singularity image pangolin_4.1.2-pdata-1.13.sif) on one split
# FASTA file.
#
# Usage: bash run_pangolin.sh NUM VERSION
#   NUM      split file name without the .fa extension
#   VERSION  dataset directory under GISAID_processing_WWBE/
#
# The report name passed to pangolin is NUM.lineage_report.csv.
num=${1:?"usage: bash run_pangolin.sh NUM VERSION"}
version=${2:?"usage: bash run_pangolin.sh NUM VERSION"}
base_dir="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE"
module load singularity
singularity run "${base_dir}/auxiliar_files/pangolin/pangolin_4.1.2-pdata-1.13.sif" \
    pangolin --outfile "${num}.lineage_report.csv" -t 32 \
    "${base_dir}/${version}/01.processing/split/${num}.fa"
# Pause kept from the original script; its purpose is not documented.
sleep 10


# run_nextclade.sh
# Run Nextclade (singularity image nextclade_2.4.0.sif) on one split FASTA
# file, writing TSV and CSV results into ./output/ of the launch directory.
#
# Usage: bash run_nextclade.sh NUM VERSION
#   NUM      split file name without the .fa extension
#   VERSION  dataset directory under GISAID_processing_WWBE/

num=${1:?"usage: bash run_nextclade.sh NUM VERSION"}
version=${2:?"usage: bash run_nextclade.sh NUM VERSION"}
wd=$(pwd)
base_dir="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE"
module load singularity
singularity run "${base_dir}/auxiliar_files/nextclade/nextclade_2.4.0.sif" nextclade run \
   --output-all="${wd}/output/" \
   --in-order \
   --input-dataset="${base_dir}/auxiliar_files/nextclade/data/sars-cov-2/" \
   --output-selection=tsv,csv \
   --output-basename="${num}.nextclade" \
   "${base_dir}/${version}/01.processing/split/${num}.fa"
# Pause kept from the original script; its purpose is not documented.
sleep 10


In [None]:
# Merge the per-split Nextclade and Pangolin results of one version, keep the
# sequences that pass both QC filters, and build per-lineage variant tables.
#
# Usage: <script> VERSION FREQUENCY
#   VERSION    dataset directory (date) under GISAID_processing_WWBE/
#   FREQUENCY  lower bound applied to column 5 of the *_final.tsv tables
#              written by get_prevalence.py; its first two characters are
#              used in the output names (gtr<NN>)
#
# get_prevalence.py is copied from the directory this script is launched from.
module load datamash

#variables
wd=$(pwd)
version=${1:?"usage: $0 VERSION FREQUENCY"}
frequency=${2:?"usage: $0 VERSION FREQUENCY"}
path="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE/"
run_dir="${path}${version}"
nextclade_dir="${run_dir}/02.nextclade"
pangolin_dir="${run_dir}/03.pangolin"
refine_dir="${run_dir}/04.refine"
snv_dir="${run_dir}/05.sort_SNVs"
assign_dir="${run_dir}/06.lineage_assingment"
tag="gtr${frequency:0:2}"

# going into the version (date)
cd "$run_dir" || exit 1

#organize log files
mkdir -p "${nextclade_dir}/log"
mv "${nextclade_dir}"/swarm* "${nextclade_dir}/log/"

mkdir -p "${pangolin_dir}/log"
mv "${pangolin_dir}"/swarm* "${pangolin_dir}/log/"

#merge files from Nextclade and Pangolin output

# merge and select Nextclade
# The merged table lives in the directory the *.tsv glob reads from; remove a
# copy left by an earlier run so it is not fed back in as its own input.
rm -f -- "${nextclade_dir}/output/nextclade.tsv"
awk 'FNR==1 && NR!=1{next;}{print}' "${nextclade_dir}"/output/*.tsv > "${nextclade_dir}/output/nextclade.tsv"

#get the samples that passed filters
dos2unix "${nextclade_dir}/output/nextclade.tsv"

#This was valid before the Nextclade v.1.10.2
#awk 'NR==1||$4=="good" && $28=="good" && $32=="good" && $37=="good" && $41=="good" && $48=="good" && $52=="good"' FS=$'\t' $path$version/02.nextclade/output/nextclade.tsv > $path$version/02.nextclade/nextclade.selected.tsv

#new columns after update to nextclade v.1.11.0
# Keeps the header row plus every row whose seven listed columns all read "good".
awk 'NR==1||$5=="good" && $38=="good" && $42=="good" && $47=="good" && $51=="good" && $58=="good" && $62=="good"' FS=$'\t' "${nextclade_dir}/output/nextclade.tsv" > "${nextclade_dir}/nextclade.selected.tsv"

#Concatenate the pangolin results (same rerun precaution as for Nextclade):
rm -f -- "${pangolin_dir}/lineage_report.csv"
awk '(NR == 1) || (FNR > 1)' "${pangolin_dir}"/*.csv > "${pangolin_dir}/lineage_report.csv"

#get the info from samples that passed filter - OLD PANGOLIN - before May 2022 updates
#awk '$12=="passed_qc"{print $1"\t"$2}' FS="," $path$version/03.pangolin/lineage_report.csv |sort -k1,1 > $path$version/03.pangolin/acc_pangolin.tsv

#get the info from samples that passed filter
awk '$14=="pass"{print $1"\t"$2}' FS="," "${pangolin_dir}/lineage_report.csv" \
    | sort -k1,1 > "${pangolin_dir}/acc_pangolin.tsv"

#04.refine
# Join Pangolin (column 1, column 2) with the selected Nextclade rows on
# column 1, then keep one row per column-1 value.
mkdir -p "$refine_dir"

awk 'NR>1' "${nextclade_dir}/nextclade.selected.tsv" \
    | sort -k1,1 \
    | join -t $'\t' -1 1 -2 1 "${pangolin_dir}/acc_pangolin.tsv" - > "${refine_dir}/full_table.tsv"
sort -k1,1 -u "${refine_dir}/full_table.tsv" > "${refine_dir}/full_table_nodup.tsv"

#05.sort_SNVs

mkdir -p "$snv_dir"
# Files below are written relative to this directory.
cd "$snv_dir" || exit 1
cp "${wd}/get_prevalence.py" "$snv_dir"

printf 'sequence\tlineage\tsubstitutions\tdeletions\tinsertions\tprivateNucMutations.reversionSubstitutions\tprivateNucMutations.labeledSubstitutions\tprivateNucMutations.unlabeledSubstitutions\n' > "${snv_dir}/full_table_info.tsv"
cut -f 1,2,17,18,19,20,21,22 "${refine_dir}/full_table_nodup.tsv" >> "${snv_dir}/full_table_info.tsv"

# explode_column COL NAME
# Turn the comma-separated list in column COL of full_table_info.tsv into one
# "sequence<TAB>lineage<TAB>item" row per item (full_table_info_NAME.tsv), then
# drop rows whose item is empty (full_table_info_NAME_nonNAN.tsv).
explode_column() {
    local col=$1
    local name=$2
    awk -v col="$col" 'BEGIN{FS=OFS="\t"} {print $1, $2 "," $col}' "${snv_dir}/full_table_info.tsv" \
        | awk '{for(i=2;i<=NF;i++){print $1"\t"$i}}' FS="," OFS="\t" \
        > "${snv_dir}/full_table_info_${name}.tsv"
    awk -F '\t' '$3 != ""' "${snv_dir}/full_table_info_${name}.tsv" \
        > "${snv_dir}/full_table_info_${name}_nonNAN.tsv"
}

#get snps info
explode_column 3 nuc
#get deletion info
explode_column 4 del
#get insertion info
explode_column 5 ins

#merge snps, deletions and insertions
cat "${snv_dir}/full_table_info_nuc_nonNAN.tsv" "${snv_dir}/full_table_info_del_nonNAN.tsv" "${snv_dir}/full_table_info_ins_nonNAN.tsv" > "${snv_dir}/full_table_info_all.tsv"

#merge deletions and insertion to use SNVS from USHER tree
cat "${snv_dir}/full_table_info_del_nonNAN.tsv" "${snv_dir}/full_table_info_ins_nonNAN.tsv" > "${snv_dir}/full_table_info_ins_del.tsv"

# NOTE(review): the tables fed to datamash still contain their header rows and
# only --header-out is given, so those rows appear to be counted as data.
# Confirm against get_prevalence.py before changing this.

#get count of variants per lineage
for kind in all ins_del; do
    datamash --header-out --sort groupby 3,2 count 1 \
        < "${snv_dir}/full_table_info_${kind}.tsv" \
        > "${snv_dir}/full_table_info_${kind}_permut.tsv"
done

#get count of total sequences per lineage
datamash --header-out --sort groupby 2 countunique 1 \
    < "${snv_dir}/full_table_info.tsv" \
    > "${snv_dir}/full_table_info_countseqs.tsv"

#get lineage prevalence using above files
for kind in all ins_del; do
    python "${snv_dir}/get_prevalence.py" full_table_info_countseqs.tsv "full_table_info_${kind}_permut.tsv" "full_table_info_${kind}_final.tsv"
done

#filter data based on frequency within the lineage (column 5 between FREQUENCY and 100.0)
for kind in all ins_del; do
    awk -v start="${frequency}" -v end="100.0" -F"\t" '$5>=start && $5<=end' \
        "${snv_dir}/full_table_info_${kind}_final.tsv" \
        > "${snv_dir}/full_table_info_${kind}_final_${tag}.tsv"
done

#put lineage SNPS per line
for kind in all ins_del; do
    # grouped by column 1: count and list of column 2
    datamash --sort groupby 1 count 2 collapse 2 \
        < "${snv_dir}/full_table_info_${kind}_final_${tag}.tsv" \
        > "${snv_dir}/full_table_info_${kind}_final_${tag}_perlin.tsv"
    # grouped by column 2: count and list of column 1
    datamash --sort groupby 2 count 1 collapse 1 \
        < "${snv_dir}/full_table_info_${kind}_final_${tag}.tsv" \
        > "${snv_dir}/full_table_info_${kind}_final_${tag}_listsnvs.tsv"
done

#set-up for lineage assignment
mkdir -p "$assign_dir"
cd "$assign_dir" || exit 1
cp "${snv_dir}/full_table_info_ins_del_final_${tag}_listsnvs.tsv" .

In [None]:
# Collect the inputs for lineage assignment in 06.lineage_assingment and run
# get_derivedsnvs.py on them.
#
# Usage: <script> VERSION USHER_DATE [FREQUENCY]
#   VERSION     dataset directory (date) under GISAID_processing_WWBE/
#   USHER_DATE  directory name under Usher_processing/ and suffix of the
#               lineagePaths_edited_*.pkl files
#   FREQUENCY   optional, default 75; its first two characters select the
#               gtr<NN> table written by the previous step
#
# get_derivedsnvs.py is taken from the directory this script is launched from.
wd=$(pwd)
version=${1:?"usage: $0 VERSION USHER_DATE [FREQUENCY]"}
usher_date=${2:?"usage: $0 VERSION USHER_DATE [FREQUENCY]"}
frequency=${3:-75}
path="/gpfs/gsfs12/users/Irp-jiang/share/rafa_data/GISAID_processing_WWBE/"
pango_repo="/data/salgadofontenr2/pango-designation"
usher_dir="${path}Usher_processing/${usher_date}"
assign_dir="${path}${version}/06.lineage_assingment"
listsnvs="full_table_info_ins_del_final_gtr${frequency:0:2}_listsnvs.tsv"

#update lineages
# Subshell: the pull only runs if the cd succeeded, and the caller's working
# directory is left untouched either way.
( cd "$pango_repo" && git pull )

cp "${pango_repo}/lineage_notes.txt" "$assign_dir"
cp "${pango_repo}/pango_designation/alias_key.json" "$assign_dir"
cp "${path}${version}/05.sort_SNVs/${listsnvs}" "$assign_dir"
cp "${usher_dir}/lineagePaths_edited_${usher_date}.pkl" "$assign_dir"
cp "${usher_dir}/lineagePaths_edited_clades_${usher_date}.pkl" "$assign_dir"

# conda activate cov-dist

cd "$assign_dir" || exit 1
# First column of lineage_notes.txt, minus the lines that start with "*".
cut -f 1 lineage_notes.txt | sed '/^\*/d' > lineage_list.txt
python "${wd}/get_derivedsnvs.py" "$version" "$listsnvs" "lineagePaths_edited_${usher_date}.pkl"

Python script to obtain derived SNVs per lineage

In [None]:
#working function but no nested dictionaries
#!/usr/bin/env python
from numpy.lib.utils import info
import pandas as pd
from ete3 import Tree
import pickle
import sys
import json

# Command-line arguments, in order:
#   1. version: date of the current version (used in the output file names)
#   2. ins_del: file coming from GISAID with sequence insertions and deletions,
#      e.g. "full_table_info_ins_del_final_gtr75_listsnvs.tsv"
#   3. snvs:    UShER-derived SNVs per lineage, "lineagePaths_edited_<date>.pkl"
version, ins_del, snvs = sys.argv[1], sys.argv[2], sys.argv[3]


# Load the alias info from the current directory.
with open('alias_key.json') as f:
    d = json.load(f)

# m maps every alias whose value is not a list to that value; an alias with
# an empty value maps to itself. List-valued aliases are left out.
m = {}
for key, value in d.items():
    if isinstance(value, list):
        continue
    m[key] = value or key

def get_pair(i):
    """Return the (parent, child) edges implied by a dotted lineage name.

    Every dotted prefix of ``i`` is linked to the next longer one, e.g.
    "B.1.1" yields ("B.1", "B.1.1") and ("B", "B.1"). The one-component
    prefix is translated through the module-level alias map ``m``, so with
    m["C"] == "B.1.1.1" the name "C.1" yields ("B.1.1.1", "C.1").

    An alias missing from ``m`` (the loader skips list-valued entries of
    alias_key.json) used to raise KeyError. It is now kept under its own
    name and attached to "root", like the "A" and "B" seeds of the
    parent/child table, so its sub-lineages stay connected to the tree.

    A name without a dot yields an empty set.
    """
    parts = i.split(".")
    pair = set()
    while len(parts) > 1:
        child = ".".join(parts)
        parts.pop()
        prefix = ".".join(parts)
        if len(parts) > 1:
            parent = prefix
        elif prefix in m:
            parent = m[prefix]
        else:
            pair.add(("root", prefix))
            parent = prefix
        pair.add((parent, child))
    return pair

# Build the parent/child table from the lineage names in lineage_list.txt
# (one name per line). "A" and "B" are seeded directly under "root".
with open("lineage_list.txt", 'r') as handle:
    data = [line.strip() for line in handle]
t={("root","A"),("root","B")}
for i in data:
    t.update(get_pair(i))


tree = Tree.from_parent_child_table(t)

#merge snvs from Usher and ins/deletion derived from GISAID
file_ins_del = pd.read_csv(ins_del, sep="\t", header=None, names=["Lineage","root_id", "SNVs"])
file_ins_del.set_index("Lineage", inplace=True)
# NOTE: unpickling can execute arbitrary code; only load .pkl files produced
# by this pipeline.
file_snvs = pd.read_pickle(snvs)
file_snvs.set_index("Lineage", inplace=True)
# For every lineage present in both tables, append the comma-separated
# insertion/deletion entries to that lineage's SNV list. An index membership
# test replaces the former scan of the whole ins/del index per lineage.
for i in file_snvs.index:
    if i in file_ins_del.index:
        list_to_add = file_ins_del.loc[i, "SNVs"].split(",")
        file_snvs.at[i, "SNVs"] = file_snvs.loc[i, "SNVs"] + list_to_add

#function for store info in a dataframe
def store_info(file_snvs,tree):
    """Assemble the per-lineage result table.

    file_snvs is a DataFrame indexed by lineage with a list-valued "SNVs"
    column; tree is passed through to get_relationship. Each index label is
    processed once (repeated labels are skipped). Returns a DataFrame with
    the columns lineage, parent, child, snvs and derived_snvs, one row per
    distinct lineage in first-seen order.
    """
    seen = []
    parents = []
    kids = []
    shared_snvs = []
    derived = []
    for lineage, row in file_snvs.iterrows():
        unique_snvs = list(set(row["SNVs"]))
        if lineage in seen:
            continue
        children, parent, sisters = get_relationship(lineage, tree)
        seen.append(lineage)
        parents.append(parent)
        kids.append(children)
        # for every SNV of this lineage, all lineages that carry it
        shared_snvs.append(snvs_lineage_all(unique_snvs, file_snvs))
        # SNVs absent from the parent, each with the sisters that share it
        defining = get_defining_snvs(lineage, parent, file_snvs)
        derived.append(snvs_lineage_sisters(defining, sisters, file_snvs, lineage))

    data = pd.DataFrame()
    data["lineage"] = seen
    data['parent'] = parents
    data['child'] = kids
    data["snvs"] = shared_snvs
    data["derived_snvs"] = derived
    return data

def check_missing_parent(file_snvs, tree):
    """Add a row for every parent lineage that is missing from file_snvs.

    file_snvs is a DataFrame indexed by lineage with a list-valued "SNVs"
    column. For each lineage whose parent (per get_relationship) is not in
    the index, and is not "root", "B" or "", a row is added for that parent
    with number = 0 and SNVs = the SNVs carried by every one of its children
    present in file_snvs, in the order of the first such child.

    Returns a new DataFrame; the input is not modified. When no parent is
    missing, a copy of the input is returned (the former code raised in that
    case because it called set_index on an empty frame).
    """
    known = set(file_snvs.index)
    to_add = []
    for lineage in file_snvs.index:
        child, parent, sister = get_relationship(lineage, tree)
        if parent in known or parent in ("root", "B", ""):
            continue
        if parent not in to_add:  # first-seen order, no repeats
            to_add.append(parent)
    if not to_add:
        return file_snvs.copy()

    rows = []
    for lin in to_add:
        child, parent, sister = get_relationship(lin, tree)
        child_snvs = [file_snvs.loc[ch, "SNVs"] for ch in child if ch in known]
        if child_snvs:
            # Membership is per child, so an SNV listed twice for one child
            # is neither dropped nor counted as shared by two children.
            common = set(child_snvs[0]).intersection(*child_snvs[1:])
            shared = [snv for snv in dict.fromkeys(child_snvs[0]) if snv in common]
        else:
            shared = []
        rows.append({"Lineage": lin, "number": 0, "SNVs": shared})

    temp_df = pd.DataFrame(rows).set_index("Lineage")
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([file_snvs, temp_df])


#function to get relationship info
def get_relationship(lineage,tree):
    """Look up a lineage's children, parent and sisters by node name.

    Walks every node yielded by tree.traverse() and, for each node whose
    name equals ``lineage``, collects the names of its children and sisters
    and records the name of its parent.

    Returns (child, parent, sisters): two lists of names and a parent name.
    An unknown lineage gives ([], "", []). A node without a parent (the
    tree root) keeps parent == "" instead of raising AttributeError.
    """
    child = []
    sisters = []
    parent = ""
    for node in tree.traverse():
        if node.name != lineage:
            continue
        if node.up is not None:
            parent = node.up.name
        child.extend(kid.name for kid in node.get_children())
        sisters.extend(sis.name for sis in node.get_sisters())
    return child, parent, sisters

#function to get derived SNVs
def get_defining_snvs(lineage, parent, file_snvs):
    """Return the SNVs of ``lineage`` that its ``parent`` does not carry.

    file_snvs is a DataFrame indexed by lineage with a list-valued "SNVs"
    column. If a label occurs more than once, its last row is used. A parent
    that is not in the table counts as carrying no SNVs.

    The result has no repeats and keeps the order in which the SNVs first
    appear in the lineage's list (the former list(set(...)) order varied
    between runs). A lineage missing from the table yields [] instead of
    raising UnboundLocalError.
    """
    def _last_snvs(label):
        # Boolean mask instead of .loc[label] so repeated labels are handled.
        matches = file_snvs.loc[file_snvs.index == label, "SNVs"]
        return list(matches.iloc[-1]) if len(matches) else []

    parent_snvs = set(_last_snvs(parent))
    return [snv for snv in dict.fromkeys(_last_snvs(lineage)) if snv not in parent_snvs]

#function to get lineages associated with derived snvs
def snvs_lineage_sisters(list_snvs, sisters, file_snvs, sample):
    """Map each SNV to the lineages among ``sample`` and its ``sisters`` that carry it.

    file_snvs is a DataFrame indexed by lineage with a list-valued "SNVs"
    column. Returns one single-key dict per entry of list_snvs, in the same
    order: {snv: [lineage, ...]}, with lineages in table order.
    """
    return [
        {
            snv: [
                name
                for name, row in file_snvs.iterrows()
                if snv in row["SNVs"] and (name in sisters or name == sample)
            ]
        }
        for snv in list_snvs
    ]

def snvs_lineage_all(list_snvs, file_snvs):
    """Map each SNV to every lineage in the table that carries it.

    file_snvs is a DataFrame indexed by lineage with a list-valued "SNVs"
    column. Returns one single-key dict per entry of list_snvs, in the same
    order: {snv: [lineage, ...]}, with lineages in table order.
    """
    return [
        {snv: [name for name, row in file_snvs.iterrows() if snv in row["SNVs"]]}
        for snv in list_snvs
    ]



# Driver: complete the SNV table with missing parents, build the per-lineage
# relationship table, and write it as a pickle (loaded directly by the
# lineage assignment step) and as a TSV.
checked_file = check_missing_parent(file_snvs, tree)
result = store_info(checked_file, tree)

pickle_name = "child_parent_info_"+version+"_all.pkl"
tsv_name = "child_parent_info_"+version+"_all.tsv"
result.to_pickle(pickle_name)
#result.to_json("child_parent_snv_info_delta.json")
result.to_csv(tsv_name, sep="\t", index=False)
