In [2]:
import pandas as pd
import numpy as np
import os

This notebook matches P. falciparum gene IDs to their orthologous P. reichenowi or P. praefalciparum gene IDs, based on ortholog groups pulled from PlasmoDB/OrthoMCL (in the files referenced below). The resulting ID lists are then split into appropriately sized .txt files for input into PlasmoDB. The search strategy (described below) allows you to pull CDS FASTA files for each gene. The resulting files can be renamed and used as input for divergence.py, which aligns orthologs, keeps the best alignment, and tallies NS, S, and FFD sites and the sequence changes corresponding to each alignment.

In [None]:
# Set the working directory to the folder that holds the ortholog CSV read in the next cell
# (the file is opened by relative name). The empty string is a placeholder: fill in the
# path before running this cell.
os.chdir('') # input path to data directory 

In [5]:
# Ortholog table downloaded from the PlasmoDB ortholog search of Pf3D7 against either the
# P. praefalciparum G01 or the P. reichenowi reference.
# For the P. praefalciparum comparison, set ortholog_csv = 'Pfal_Ppraefal_orthos.csv' instead.
ortholog_csv = 'Pfal_Preichenowi_orthos.csv' # for comparison with P. reichenowi
orthos = pd.read_csv(ortholog_csv)
orthos

Unnamed: 0,Gene ID,source_id,Organism,Ortholog count,Paralog count,Ortholog Group,Genomic Location (Gene)
0,PF3D7_0100100,PF3D7_0100100.1,Plasmodium falciparum 3D7,2211,65,OG6_104345,"Pf3D7_01_v3:29,510..37,126(+)"
1,PF3D7_0100200,PF3D7_0100200.1,Plasmodium falciparum 3D7,4601,156,OG6_100719,"Pf3D7_01_v3:38,982..40,207(-)"
2,PF3D7_0100300,PF3D7_0100300.1,Plasmodium falciparum 3D7,2211,65,OG6_104345,"Pf3D7_01_v3:42,367..46,507(-)"
3,PF3D7_0100400,PF3D7_0100400.1,Plasmodium falciparum 3D7,4601,156,OG6_100719,"Pf3D7_01_v3:50,363..51,636(+)"
4,PF3D7_0100600,PF3D7_0100600.1,Plasmodium falciparum 3D7,4601,156,OG6_100719,"Pf3D7_01_v3:53,778..55,006(-)"
...,...,...,...,...,...,...,...
11083,PRCDC_API04700,PRCDC_API04700.1,Plasmodium reichenowi CDC,18,0,OG6_532771,"PrCDC_API_v3:22,413..22,568(-)"
11084,PRCDC_API04800,PRCDC_API04800.1,Plasmodium reichenowi CDC,45,0,OG6_104113,"PrCDC_API_v3:22,575..23,987(-)"
11085,PRCDC_MIT01400,PRCDC_MIT01400.1,Plasmodium reichenowi CDC,54,0,OG6_104028,"PrCDC_MIT_v3:732..1,481(-)"
11086,PRCDC_MIT02100,PRCDC_MIT02100.1,Plasmodium reichenowi CDC,48,0,OG6_102770,"PrCDC_MIT_v3:1,943..3,469(+)"


In [None]:
# input path to gene lists of interest--e.g. genes detected in the assay with initial breadth estimates 
# (these estimates are overwritten / corrected later once DE filter is included, but for now, they should include
# all genes in the assay to pipe into dN/dS analysis)  
os.chdir('') # insert path to gene set of interest 

# One label per gene-set file: gene_sets_assay/1.txt ... gene_sets_assay/6.txt.
# NOTE(review): an earlier labelling used ['ookinete', 'gametocyte', 'sporozoite', 'schizont',
# 'trophozoite', 'ring'] for these six sets -- confirm the mapping before relying on it.
stage_labels = [str(i) for i in range(1,7)]
stages = ['gene_sets_assay/' + label for label in stage_labels]

stage_genes = []   # raw ID arrays, one per gene-set file
stage_frames = []  # long-format (label, gene) tables, one per gene-set file
for label, stage in zip(stage_labels, stages):
  # read in stage-specific gene set
  fname = stage + '.txt'
  # ndmin=1 keeps a file containing a single gene as a length-1 array rather than a 0-d array,
  # which could not be turned into rows below
  gene_set = np.loadtxt(fname, dtype = str, ndmin = 1)
  stage_genes.append(gene_set)
  stage_frames.append(pd.DataFrame({'variable': label, 'value': gene_set}))

# Long table with one row per (set label, gene ID): 'variable' holds the set label and
# 'value' the gene ID. Building it directly avoids padding unequal-length sets with
# missing values and filtering them out again afterwards.
stagedf = pd.concat(stage_frames, ignore_index=True)

In [None]:
# Annotate every assay gene with its row from the ortholog table. The default inner join
# keeps only genes whose ID ('value') is present in the 'Gene ID' column of orthos.
stagedf = pd.merge(stagedf, orthos, left_on='value', right_on='Gene ID')

In [None]:
# Keep only the comparison species' rows of the ortholog table, reduced to the ortholog
# group and the transcript-level ID.
# For the P. praefalciparum comparison, match 'Plasmodium praefalciparum strain G01' instead.
is_comparison_species = orthos['Organism'] == 'Plasmodium reichenowi CDC'
ortho_prae = orthos.loc[is_comparison_species, ['Ortholog Group', 'source_id']]

# Pair each P. falciparum gene with every comparison-species gene in the same ortholog group.
# Both tables carry a 'source_id' column, so pandas suffixes them as source_id_x / source_id_y.
stagedf = stagedf.merge(ortho_prae, on='Ortholog Group')
stagedf.columns

In [17]:
# Drop the annotation columns that are no longer needed, then give the remaining ID
# columns readable names.
# For the P. praefalciparum comparison, rename "source_id_y" to "Ppraefal_gene" instead.
unneeded_columns = ["Ortholog count", "Paralog count", "Organism", "Genomic Location (Gene)", "Ortholog Group", "value"]
stagedf = (
    stagedf
    .drop(columns=unneeded_columns)
    .rename(columns={"source_id_x":"Pfal_gene", "source_id_y":"Preich_gene", "variable":"stage"})
)

In [None]:
# Remove fully duplicated rows from the ortholog pairing.
stagedf = stagedf.drop_duplicates()
# Note: the ID variable is called "stage" even though it may be e.g. breadth.
stagedf

In [None]:
# Number of distinct gene IDs on each side of the ortholog mapping.
# Both counts are requested in a single expression because a notebook cell only displays
# its last expression; two separate nunique() calls would silently discard the first count.
stagedf[['Pfal_gene', 'Preich_gene']].nunique()

In [None]:
# Save the simplified ortholog pairing without the row index.
# For the P. praefalciparum comparison, write to "breadth_orthologs_praefal_simplified.csv" instead.
stagedf.to_csv(path_or_buf="breadth_orthologs_reichenowi_simplified.csv", index=False)

In [None]:
# extract gene names for each
!cut -d',' -f3 breadth_orthologs_reichenowi_simplified.csv | sort | uniq > breadth_pfal_r_ortho.txt
!cut -d',' -f4 breadth_orthologs_reichenowi_simplified.csv | sort | uniq > breadth_preich_ortho.txt

In [None]:
!wc -l breadth_preich_ortho.txt
!sort breadth_preich_ortho.txt | uniq | wc -l

The ortholog ID lists can be entered into PlasmoDB searches of the 3D7 and P. praefalciparum G01 (or P. reichenowi CDC, as configured in this notebook) reference genomes, respectively. Use the 'Identify Genes based on List of IDs' option after clicking the dataset. 
Results --> Download --> FASTA --> Spliced Genomic Region --> CDS

PlasmoDB: Searches --> Genes --> Annotation, curation and Identifiers --> List of IDs --> upload the .txt files and download the output as documented above. 
The breadth gene sets are too long for a single PlasmoDB download, so they are split into 3 files (below). 
Note that for searches above 2000 genes in length, gene lists may need to be split into multiple entries.

In [None]:
# split into smaller files for PlasmoDB search (limited gene list size allowed)
# preich 
!head -2000 breadth_preich_ortho.txt > breadth_preich_ortho_1.txt
!head -4000 breadth_preich_ortho.txt | tail -2000 > breadth_preich_ortho_2.txt
!tail -1401 breadth_preich_ortho.txt > breadth_preich_ortho_3.txt

# pfal w/ preich matches
!head -2000 breadth_pfal_r_ortho.txt > breadth_pfal_r_ortho_1.txt
!head -4000 breadth_pfal_r_ortho.txt | tail -2000 > breadth_pfal_r_ortho_2.txt
!tail -1004 breadth_pfal_r_ortho.txt > breadth_pfal_r_ortho_3.txt