# Running instructions

## Change working directory and activate the virtual environment

In [None]:
# Move into the fhub_gene directory; later commands use relative paths
# (aligned.fna, tree.nwk, variants.vcf) that resolve against it.
cd ~/workspace/genome_data/annotations/e_coli/gene_sequences/fhub_gene
# Activate the .venv_gwas virtual environment in the current shell
# (presumably the one providing pyseer and its dependencies -- TODO confirm).
source ~/workspace/alfred-data-analysis/.venv_gwas/bin/activate

## Obtain variants from the multiple sequence alignment files

In [None]:
# Extract the variable sites from the alignment aligned.fna:
# -v asks for VCF output, -o names the output file (variants.vcf).
snp-sites -v -o variants.vcf aligned.fna

## Patristic distances

### Extract distance from phylogeny

In [None]:
# Derive the pairwise distance matrix from the tree in tree.nwk and save it as
# phylogeny_dists.tsv; the fixed-effects pyseer run reads it via --distances.
python ~/workspace/pyseer/scripts/phylogeny_distance.py \
  tree.nwk \
  > phylogeny_dists.tsv

### Perform GWAS

In [None]:
# Fixed-effects GWAS of the VCF variants against 30-day mortality, using the
# phylogeny-derived distances for population-structure correction.
#   --phenotype-column  named explicitly, as in the LMM and mash runs; without
#                       it pyseer falls back to the last column of the file
#   --lineage           also report lineage effects
#   --max-dimensions    number of MDS dimensions kept from the distance matrix
#   --min-af/--max-af   allele-frequency window for tested variants
# The phenotype path uses ~ like the other pyseer invocations in this notebook.
pyseer \
  --phenotypes ~/workspace/ehr_data/data/full_cohort/tube_id_mortality.pheno \
  --phenotype-column death_30_day \
  --vcf variants.vcf \
  --distances phylogeny_dists.tsv \
  --lineage \
  --max-dimensions 6 \
  --min-af 0.06 \
  --max-af 0.94 \
  > mortality_SNPs.txt

### Format output

In [None]:
# Convert mortality_SNPs.txt into a tab-separated plot file (the layout used in
# the pyseer tutorial for Manhattan-style viewers such as Phandango):
#   - header line, its spaces turned into tabs by the final tr
#   - per variant: position = 2nd "_"-delimited field of the line,
#     p-value = 4th tab-delimited column (lrt-pvalue in pyseer's output layout)
#   - awk emits a constant chromosome "1", a "." SNP id, the position,
#     -log10(p) twice, and a constant "0"
# sed '1d' drops the pyseer header row before the columns are cut.
cat \
  <(echo "#CHR SNP BP minLOG10(P) log10(p) r^2") \
  <(paste \
      <(sed '1d' mortality_SNPs.txt | cut -d "_" -f 2) \
      <(sed '1d' mortality_SNPs.txt | cut -f 4) \
    | awk '{p = -log($2)/log(10); print "1",".",$1,p,p,"0"}') \
  | tr ' ' '\t' \
  > mortality_snps.plot

## Distance from root

### Extract distance from phylogeny

In [None]:
# Same script as for the patristic distances, but with --lmm: the matrix written
# to phylogeny_K.tsv is the one handed to pyseer --similarity in the LMM run.
python ~/workspace/pyseer/scripts/phylogeny_distance.py \
  --lmm \
  tree.nwk \
  > phylogeny_K.tsv

### Perform GWAS

In [None]:
# Linear mixed model GWAS of the VCF variants against the death_30_day column,
# with phylogeny_K.tsv as the similarity matrix. The tested variant patterns
# are saved to mortality_SNP_patterns.txt so the multiple-testing threshold can
# be derived from them afterwards.
python ~/workspace/pyseer/pyseer-runner.py \
  --lmm \
  --phenotypes ~/workspace/ehr_data/data/full_cohort/tube_id_mortality.pheno \
  --vcf variants.vcf \
  --similarity phylogeny_K.tsv \
  --phenotype-column death_30_day \
  --output-patterns mortality_SNP_patterns.txt \
  > mortality_SNPs_lmm.txt

### Analyse output

#### Count the number of patterns to control for multiple testing

In [None]:
# Count the patterns written by the LMM run (--output-patterns) and print the
# corresponding significance threshold; the recorded result is shown below.
python ~/workspace/pyseer/scripts/count_patterns.py mortality_SNP_patterns.txt

Output:

Patterns:	98

Threshold:	5.10E-04

In [6]:
library(data.table)

# Read the pyseer LMM association table. data.table = FALSE makes fread()
# return a plain data.frame, which the base-R subsetting further down expects.
lmm_results_path <- "~/workspace/genome_data/annotations/e_coli/gene_sequences/fhub_gene/mortality_SNPs_lmm.txt"
gono_gwas <- fread(lmm_results_path, data.table = FALSE)

# Preview the first rows to check the columns were parsed as expected.
head(gono_gwas)

Unnamed: 0_level_0,variant,af,filter-pvalue,lrt-pvalue,beta,beta-std-err,variant_h2,notes
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,1_592_T_C,0.841,0.962,0.565,-0.0238,0.0413,0.0199,
2,1_595_C_A,0.0107,0.0227,0.0281,0.232,0.105,0.0757,
3,1_603_C_T,0.0261,4.81e-05,8.16e-05,0.283,0.0715,0.135,
4,1_610_C_T,0.0249,0.902,0.828,-0.0166,0.0764,0.00751,
5,1_618_A_G,0.0273,9.38e-05,0.000161,0.265,0.0699,0.13,
6,1_619_C_T,0.805,0.787,0.893,-0.00574,0.0425,0.00466,


In [4]:
# Rank variants from smallest to largest LRT p-value.
pvalue_rank <- order(gono_gwas[["lrt-pvalue"]])
gono_gwas <- gono_gwas[pvalue_rank, ]

# Drop rows whose notes column carries the "bad-chisq" flag.
has_bad_chisq <- grepl("bad-chisq", gono_gwas[["notes"]])
gono_gwas <- gono_gwas[!has_bad_chisq, ]

# Bonferroni threshold: 0.05 divided by the 98 patterns reported from running
# count_patterns in pyseer.
n_patterns <- 98
sig_threshold <- 0.05 / n_patterns
sig_threshold

In [5]:
# Select the variants whose LRT p-value is below the multiple-testing threshold.
# which() keeps only the positions where the comparison is TRUE, so a missing
# p-value (NA) can neither turn the count into NA nor add all-NA rows to the
# table, as a bare logical subset would.
sig_rows <- which(gono_gwas[["lrt-pvalue"]] < sig_threshold)

# Number of significant variants.
length(sig_rows)

# The significant variants themselves, in their current (p-value) order.
sig_hits <- gono_gwas[sig_rows, ]
sig_hits

Unnamed: 0_level_0,variant,af,filter-pvalue,lrt-pvalue,beta,beta-std-err,variant_h2,notes
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
27,1_844_C_T,0.0238,1.03e-05,1.66e-05,0.322,0.0743,0.148,
3,1_603_C_T,0.0261,4.81e-05,8.16e-05,0.283,0.0715,0.135,
5,1_618_A_G,0.0273,9.38e-05,0.000161,0.265,0.0699,0.13,
29,1_856_T_C,0.0665,0.00019,0.000186,0.176,0.0468,0.128,
30,1_858_A_G,0.0689,0.000344,0.000338,0.167,0.0463,0.123,
31,1_861_T_C,0.0689,0.000344,0.000338,0.167,0.0463,0.123,
32,1_864_G_C,0.0689,0.000344,0.000338,0.167,0.0463,0.123,
101,1_1902_A_G,0.0143,0.000328,0.00051,0.322,0.0922,0.12,


## Pairwise distance matrix produced using mash

### Create the mash sketches

In [None]:
# Sketch every file matching *fasta in the ECOLI directory into a single
# mash_sketch.msh: -s sets the sketch size (10000 hashes per genome) and -o the
# output prefix.
mash sketch \
  -s 10000 \
  -o mash_sketch \
  /home/vmadmin/workspace/genome_data/fasta/ECOLI/*fasta

### Calculate distances between all pairs of samples

In [None]:
# Compare the sketch file against itself to get all pairwise distances, then
# pipe the result through square_mash and store its output as mash.tsv
# (the matrix later passed to pyseer --distances).
mash dist mash_sketch.msh mash_sketch.msh \
  | square_mash \
  > mash.tsv

In [None]:
# Run pyseer's scree-plot script on the mash distance matrix
# (presumably used to pick the --max-dimensions value for the GWAS below -- TODO confirm).
python ~/workspace/pyseer/scree_plot.py mash.tsv

In [None]:
# Remove every "_short" occurrence from mash.tsv, editing the file in place
# (presumably so the sample names match the IDs in the phenotype file -- TODO confirm).
sed -i 's/_short//g' mash.tsv

### Perform GWAS

In [None]:
# Fixed-effects GWAS of the VCF variants against the death_30_day column, this
# time using the mash distance matrix for population-structure correction
# (6 MDS dimensions, allele frequencies restricted to 0.085-0.915). The tested
# patterns are written to mortality_SNP_patterns_mash.txt.
python ~/workspace/pyseer/pyseer-runner.py \
  --phenotypes ~/workspace/ehr_data/data/full_cohort/tube_id_mortality.pheno \
  --vcf variants.vcf \
  --distances mash.tsv \
  --phenotype-column death_30_day \
  --output-patterns mortality_SNP_patterns_mash.txt \
  --max-dimensions 6 \
  --min-af 0.085 \
  --max-af 0.915 \
  > mortality_SNPs_mash.txt