In [1]:
import pandas as pd

In [6]:
## download genome and gff
%%bash
g="GCF_000591075.1"
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/$g/download?include_annotation_type=GENOME_FASTA&include_annotation_type=GENOME_GFF&hydrated=FULLY_HYDRATED&filename=$g.zip" -H "Accept: application/zip"
unzip -o $g.zip
rm $g.zip

curl: Saved to filename 'GCF_000591075.1.zip'
Archive:  GCF_000591075.1.zip
  inflating: README.md               
  inflating: ncbi_dataset/data/assembly_data_report.jsonl  
  inflating: ncbi_dataset/data/GCF_000591075.1/GCF_000591075.1_Eaff_2.0_genomic.fna  
  inflating: ncbi_dataset/data/GCF_000591075.1/genomic.gff  
  inflating: ncbi_dataset/data/dataset_catalog.json  
  inflating: md5sum.txt              


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  124M    0  124M    0     0  15.7M      0 --:--:--  0:00:07 --:--:-- 8595k


In [7]:
## index genome with STAR
%%bash
singularity exec /cvmfs/singularity.galaxyproject.org/s/t/star:2.7.9a--h9ee0642_0 STAR \
  --runThreadN 64 \
  --runMode genomeGenerate \
  --genomeDir ncbi_dataset/data/GCF_000591075.1/STAR \
  --genomeFastaFiles ncbi_dataset/data/GCF_000591075.1/GCF_000591075.1_Eaff_2.0_genomic.fna \
  --sjdbGTFfile ncbi_dataset/data/GCF_000591075.1/genomic.gff \
  --sjdbOverhang 100

	/usr/local/bin/STAR --runThreadN 64 --runMode genomeGenerate --genomeDir ncbi_dataset/data/GCF_000591075.1/STAR --genomeFastaFiles ncbi_dataset/data/GCF_000591075.1/GCF_000591075.1_Eaff_2.0_genomic.fna --sjdbGTFfile ncbi_dataset/data/GCF_000591075.1/genomic.gff --sjdbOverhang 100
	STAR version: 2.7.9a   compiled: 2021-05-04T09:43:56-0400 vega:/home/dobin/data/STAR/STARcode/STAR.master/source
Sep 13 17:24:41 ..... started STAR run
Sep 13 17:24:41 ... starting to generate Genome files
Sep 13 17:24:47 ..... processing annotations GTF
Sep 13 17:24:54 ... starting to sort Suffix Array. This may take a long time...
Sep 13 17:24:58 ... sorting Suffix Array chunks and saving them to disk...
Sep 13 17:25:30 ... loading chunks from disk, packing SA...
Sep 13 17:25:44 ... finished generating suffix array
Sep 13 17:25:44 ... generating Suffix Array index
Sep 13 17:26:43 ... completed Suffix Array index
Sep 13 17:26:43 ..... inserting junctions into the genome indices
Sep 13 17:27:34 ... writing G



In [1]:
## run rnaseq pipeline on each accession
import pandas as pd
sra=pd.read_csv("SraRunTable_Eurytemora.csv",index_col=0)
for x in list(sra.index):
    !sbatch rna_pipe_paired.sh "$x" "GCF_000591075.1"

In [54]:
import os
dfcounts=pd.DataFrame()
for x in os.listdir('RNA_seq_results'):
    df=pd.read_csv(f"RNA_seq_results/{x}/aligned.tsv",sep="\t",index_col=0)
    df.columns=[x]
    dfcounts=pd.concat([dfcounts,df],axis=1)
  


In [64]:
## compile counts and metadata for submission to deseq2 
no_bacteria=list(sra[sra.treatment.str.contains('not')].index)
F10=list(sra[sra.treatment.str.contains('F10')].index)
ordalii=list(sra[sra.treatment.str.contains('ordalii')].index)
alls=no_bacteria+F10+ordalii

dfmeta=pd.DataFrame(index=alls)
for index, row in dfmeta.iterrows():
    if index in F10:
        dfmeta.loc[index, 'treat']='F10'
    elif index in no_bacteria:
        dfmeta.loc[index, 'treat']='not'
    elif index in ordalii:
        dfmeta.loc[index, 'treat']='ord'
dfcounts.loc[:,alls].to_csv('combined_eurytemora_counts.tsv',sep='\t')
dfmeta.to_csv('eurytemora_meta.tsv',sep='\t')

In [6]:
import pandas as pd
##load deseq2 results of non-symbiotic vs non-treated and display 
deseq=pd.read_csv("ord_v_not.csv",index_col=0)
df=pd.read_csv('SI_table2.tsv',sep="\t")
deseq.loc[set(df[(df.species.str.contains('Eurytemora'))&(df.cluster.isin([14,23,60]))].gene),:]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
LOC111698135,716.382597,0.11242,0.22667,0.495964,0.61992,1.0
LOC111713211,1654.995113,0.080725,0.209839,0.384697,0.700462,1.0
LOC111712908,331.005634,-0.004296,0.297097,-0.01446,0.988463,1.0
LOC111705930,1218.196533,0.06821,0.169409,0.402636,0.687216,1.0


In [7]:
import pandas as pd
##load deseq2 results of symbiotic vs non-treated and display 
deseq=pd.read_csv("F10_v_not.csv",index_col=0)
df=pd.read_csv('SI_table2.tsv',sep="\t")
deseq.loc[set(df[(df.species.str.contains('Eurytemora'))&(df.cluster.isin([14,23,60]))].gene),:]

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
LOC111698135,716.382597,0.266272,0.226599,1.175076,0.239964,0.999924
LOC111713211,1654.995113,0.02082,0.209903,0.09919,0.920987,0.999924
LOC111712908,331.005634,0.660978,0.296188,2.231613,0.025641,0.999924
LOC111705930,1218.196533,0.435901,0.169197,2.576294,0.009987,0.685189
