In [3]:
import pandas as pd
import numpy as np

In [4]:
from Bio.Seq import Seq
from Bio import SeqIO
#from Bio.Alphabet import IUPAC
import sys
# import pyensembl
import os
#https://towardsdatascience.com/a-simple-guide-to-command-line-arguments-with-argparse-6824c30ab1c3


In [5]:
# SK: Dictionary of protein names to sequences
proteins = {}
for record in SeqIO.parse("../raw_files/gencode.v36.pc_translations.fa", "fasta"):
    name = record.id.split("|")[1].split(".")[0]
    proteins[name] = str(record.seq)
    

# SK: Dictionary of protein names to CDS dna transcript

dna_transcripts = {}
for record in SeqIO.parse("../raw_files/gencode.v36.pc_transcripts.fa", "fasta"):
	name = record.id.split("|")[0].split(".")[0]
	record_c = record.id.split("|")
	for i in record_c:
		if "CDS" in i:
			coords = i.replace("CDS:","")
	start = int(coords.split("-")[0])
	end = int(coords.split("-")[1])
	dna_seq = str(record.seq)[start-1:end]
	dna_transcripts[name] = dna_seq

# Plan:
1. Use cds coordinates, create df with genomic coordinates as one column and corresponding nt as second column
2. Use position in variant file to resplace the value in the nt column
3. Translate the nt column

In [44]:
ENST = "ENST00000262238"

In [60]:
# Loading in the variants for that ENST
variants = pd.read_csv("../outputs/mutations/domains_expanded_iWES_v2_variants_snv_classified/" + ENST + ".bed", sep = "\t", header = None)
no_syn_variants = variants[variants[22] == "No-Syn"]

# The strand the gene is located on
strand = no_syn_variants[13].iloc[1]

# WT nt and AA seq using all dna transcripts
wt_nt_seq = dna_transcripts[ENST]
wt_AA_seq = str(Seq(wt_nt_seq).translate())

# Adjusting for strand
if strand == "-":
    wt_nt_seq = str(Seq(wt_nt_seq).complement())

# Splitting nucleotides -> df
nt_df = pd.DataFrame({"nt" : [*wt_nt_seq]})

# Reading in the cds coordinates
cds_bed = pd.read_csv("../outputs/mutations/cds_bed_format/" + ENST, sep = "\t", header = None)
cds_bed = cds_bed[[1, 2]]
cds_bed[1] += 1

# Adjusting cds coordinates for strand
if strand == "-":
    cds_bed = cds_bed.sort_values(by = 1, ascending = False)
else:
    cds_bed = cds_bed.sort_values(by = 1, ascending = True)

# Expanding coordinates per position, in order
range_col = []
for start, end in zip(cds_bed[1], cds_bed[2]):
    if strand == "-":
        range_col += list(range(end, start - 1, -1))
    else:
        range_col += list(range(start, end + 1))
nt_df["gen_pos"] = range_col
nt_df = nt_df.set_index("gen_pos")

# Genomic variant name
no_syn_variants["name"] = "g." + no_syn_variants[2].astype(str) + no_syn_variants[17] + ">" + no_syn_variants[18]

known_ADs = pd.read_csv("../../output/known_ADs_considering_isoforms_and_canonical.csv")
wt_ad_aa_seq = known_ADs[known_ADs["uniprotID"] == uniprotID]["ProteinRegionSeq"]


# Replacing wt with variants, saving as df
names = []
TF_seqs = []
for i in no_syn_variants.index:
    var_pos = no_syn_variants[2].loc[i]
    wt_nt = no_syn_variants[17].loc[i]
    var_nt = no_syn_variants[18].loc[i]
    g_name = no_syn_variants["name"].loc[i]

    nt_df_var_copy = nt_df.copy(deep = True)

    if nt_df_var_copy.at[var_pos, "nt"] == wt_nt:
        nt_df_var_copy.at[var_pos, "nt"] = var_nt
    else:
        print("mismatch!")

    new_nt_seq = "".join(nt_df_var_copy["nt"])
    if strand == "-":
        new_nt_seq = Seq(new_nt_seq).complement()
    new_AA_seq = str(Seq(new_nt_seq).translate())

    for i in range(len(new_AA_seq)):
        if new_AA_seq[i] != wt_AA_seq[i]:
            prot_change_descrip = "p." + str(i + 1)+ new_AA_seq[i] + "REP" + wt_AA_seq[i]
    if not (new_AA_seq in wt_ad_aa_seq):
        names.append(g_name + "(" + prot_change_descrip + ")")
        TF_seqs.append(new_AA_seq)
    
variant_TF_seqs_df = pd.DataFrame({"name": names,
             "TF_seq": TF_seqs})


# Reading in uniprot ID - ENST code mappings
uniprotID_ENST_mapping = pd.read_csv("../../data/SFARI_TFs_with_ENST.csv")
uniprotID_ENST_mapping = uniprotID_ENST_mapping[["uniprotID", "ENST"]]
uniprotID_ENST_mapping["ENST"] = uniprotID_ENST_mapping["ENST"].str.split(".").str[0]
uniprotID_ENST_mapping_dict= dict(zip(uniprotID_ENST_mapping["ENST"], uniprotID_ENST_mapping["uniprotID"]))
uniprotID_ENST_mapping_dict['ENST00000434704'] = 'O60479'
uniprotID = uniprotID_ENST_mapping_dict[ENST]

# Saving ADs from full sequence
AD_count = 1
for i in known_ADs[known_ADs["uniprotID"] == uniprotID].index:
    start = known_ADs[known_ADs["uniprotID"] == uniprotID]["Start"].loc[i]
    end = known_ADs[known_ADs["uniprotID"] == uniprotID]["End"].loc[i]
    gene  = known_ADs[known_ADs["uniprotID"] == uniprotID]["Gene"].loc[i]
    variant_TF_seqs_df["AD_" + str(AD_count)] = variant_TF_seqs_df["TF_seq"].str[start - 1:end]
    
    # Drop rows with match to wt
        
    AD_count += 1

for column in list(variant_TF_seqs_df)[2:]:
    ofile = open("../outputs/AD_variant_fasta/" + gene + "_" + uniprotID + "_" + column, "w")
    for i in variant_TF_seqs_df.index:
        ofile.write(">" + variant_TF_seqs_df["name"].loc[i] + "\n" + variant_TF_seqs_df[column].loc[i] + "\n")
    ofile.close()

variant_TF_seqs_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_syn_variants["name"] = "g." + no_syn_variants[2].astype(str) + no_syn_variants[17] + ">" + no_syn_variants[18]


Unnamed: 0,name,TF_seq,AD_1,AD_2
0,g.100239245A>C(p.1LREPM),LASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,LASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
1,g.100239246T>A(p.1KREPM),KASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,KASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
2,g.100239246T>G(p.1RREPM),RASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RASGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
3,g.100239251T>G(p.3AREPS),MAAGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,MAAGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
4,g.100239251T>A(p.3TREPS),MATGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,MATGDTLYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
5,g.100239264T>A(p.7HREPL),MASGDTHYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,MASGDTHYIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
6,g.100239266T>A(p.8NREPY),MASGDTLNIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,MASGDTLNIATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
7,g.100239270T>A(p.9NREPI),MASGDTLYNATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,MASGDTLYNATDGSEMPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
8,g.100239292G>A(p.16IREPM),MASGDTLYIATDGSEIPAEIVELHEIEVETIPVETIETTVVGEEEE...,MASGDTLYIATDGSEIPAEIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS
9,g.100239301G>C(p.19DREPE),MASGDTLYIATDGSEMPADIVELHEIEVETIPVETIETTVVGEEEE...,MASGDTLYIATDGSEMPADIVELHEIEVETIPVETIETTVVGEEEE...,RTHVRIHTGDRPYVCPFDGCNKKFAQS


In [46]:
uniprotID

'P25490'

In [47]:
exon_coords = set(np.arange(37020692, 37020802, 1)) | set(np.arange(37014997, 37015052, 1))

In [48]:
nt_df[[_ in exon_coords for _ in nt_df.index]]

Unnamed: 0_level_0,nt
gen_pos,Unnamed: 1_level_1


In [42]:
Seq("".join(nt_df[[_ in exon_coords for _ in nt_df.index]]["nt"])).translate()

Seq('LYLLT*SNPPKNTYLPVRAYIRRSPITLNE*FHSPGRPSRPHTVTVARIAVKVVV')

In [43]:
wt_ad_aa_seq

'GRDLASTTLPGYPPHVPPAGQGSYSAPTLTGMVPGSEFSGSPYSHPQYSSYNDSW'