In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as scp

# Getting the protein sequences to run DeepSig with

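Downloaded the translated amino-acid sequence FASTA for the protein-coding transcripts of GENCODE mouse release M28 from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M28/gencode.vM28.pc_translations.fa.gz on 3/8/2022. The code below reads the uncompressed `gencode.vM28.pc_translations.fa`.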

In [2]:
def _read_gene_set(list_path):
    """Return the set of whitespace-stripped lines of the text file at `list_path`."""
    with open(list_path) as handle:
        return {line.strip() for line in handle}


# One gene list per dataset (presumably one gene name per line -- the files are not shown here).
mop_genes = _read_gene_set('../../preprocessing/BICCN_preprocessing/mop_genes.txt')
vz_brainmap_genes = _read_gene_set('../../preprocessing/vz_Brainmap_preprocessing/genes.txt')
seq_genes = _read_gene_set('../../preprocessing/SeqFishplus_preprocessing/seq_fish_plus_genes.txt')
czb_genes = _read_gene_set('../../preprocessing/CZB_kidneyliver_preprocessing/all_genes.txt')
vz_liver_genes = _read_gene_set('../../preprocessing/vz_Livershowcase_preprocessing/all_genes.txt')

# Every gene that appears in at least one of the five lists.
genes = mop_genes | vz_brainmap_genes | seq_genes | czb_genes | vz_liver_genes
print(len(genes))

10612


In [3]:
import pysam

path = '/oak/stanford/groups/horence/rob/isoform_localizations/deepsig/gencode.vM28.pc_translations.fa'

# gene name -> longest translated sequence found for that gene
seqs = {}

with pysam.FastaFile(path) as fasta:
    for header in fasta.references:
        # The gene name is read from the second-to-last '|'-separated field of the record name.
        symbol = header.split('|')[-2]
        if symbol in genes:
            protein = fasta.fetch(reference=header)

            # Use the longest isoform for the signal peptide prediction;
            # a later isoform replaces the stored one only if it is strictly longer.
            longest_so_far = seqs.get(symbol)
            if longest_so_far is None or len(protein) > len(longest_so_far):
                seqs[symbol] = protein

# Number of requested genes for which a sequence was found.
len(seqs)

10194

In [4]:
# MOp gene list: size of the list (printed), then how many of those genes have a
# sequence in `seqs` (displayed). Nearly all of them were found.
print(len(mop_genes))
len(seqs.keys() & mop_genes)

252


238

In [5]:
# vz Brainmap gene list: size of the list (printed), then how many of those genes have a
# sequence in `seqs` (displayed). Most of them were found.
print(len(vz_brainmap_genes))
len(seqs.keys() & vz_brainmap_genes)

649


478

In [6]:
# SeqFISH+ gene list: size of the list (printed), then how many of those genes have a
# sequence in `seqs` (displayed). Most of them were found.
print(len(seq_genes))
len(seqs.keys() & seq_genes)

10000


9772

In [7]:
# CZB kidney/liver gene list: size of the list (printed), then how many of those genes
# have a sequence in `seqs` (displayed). Most of them were found.
print(len(czb_genes))
len(seqs.keys() & czb_genes)

307


292

In [8]:
# vz liver gene list: size of the list (printed), then how many of those genes have a
# sequence in `seqs` (displayed). Most of them were found.
print(len(vz_liver_genes))
len(seqs.keys() & vz_liver_genes)

346


341

In [9]:
# Write the selected sequences as FASTA: a '>' + gene name header line,
# followed by the whole sequence on a single line.
with open('/oak/stanford/groups/horence/rob/isoform_localizations/deepsig/sub_prots.fa','w') as out_fasta:
    out_fasta.writelines(
        '>{}\n{}\n'.format(symbol, protein) for symbol, protein in seqs.items()
    )

# Notes on running DeepSig

Working in this directory: /oak/stanford/groups/horence/rob/isoform_localizations/deepsig

There is a virtualenv with DeepSig installed; activate it with `source .venv/bin/activate`.

(The Python module also needs to be loaded: `ml python/3.9.0`.)

Then run `deepsig -h` to get the help menu.

The following environment variable needs to be set (run from the working directory above):

`/oak/stanford/groups/horence/rob/isoform_localizations/deepsig$ export DEEPSIG_ROOT=$(pwd)`

I ran DeepSig with: `deepsig -f sub_prots.fa -k euk -o results.txt`

Then I copied the output (`results.txt`) to `SRRS/inputs/deepsig_results.csv`.