In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import subprocess

Choosing to track the smallest TF with both an AD and a DBD.

1. Loading in the AA coordinates.

In [95]:
# Amino-acid coordinate tables (whole TF, activation domains, DNA-binding
# domains). The first CSV column is used as the row index in each case.
TF_coords = pd.read_csv(
    "../../data/SFARI_TFs_with_knownADs_coords.csv",
    index_col=0,
)
AD_coords = pd.read_csv(
    "../../data/SFARI_ADs_AA_coords_redone.csv",
    index_col=0,
)
DBD_coords = pd.read_csv(
    "../../data/SFARI_TFs_with_known_ADs_DBD_coords.csv",
    index_col=0,
)

In [96]:
# Shortest proteins first (ascending "End"), to pick the smallest candidate TF.
TF_coords.sort_values("End", ascending=True)

Unnamed: 0,uniprotID,Start,End
36,Q9BXK1,1,252
49,P35548,1,267
62,O95096,1,273
37,O75840,1,302
69,P78337,1,314
64,P32242,1,354
21,Q06889,1,387
65,Q02548,1,391
104,P25490,1,414
66,P26367,1,422


In [97]:
# Whole-protein coordinates for the chosen TF (uniprotID P35548).
relev_TF_coords = TF_coords.loc[TF_coords["uniprotID"] == "P35548"]
relev_TF_coords

Unnamed: 0,uniprotID,Start,End
49,P35548,1,267


In [98]:
# Activation-domain (AD) coordinates for the chosen TF (uniprotID P35548).
relev_AD_coords = AD_coords.loc[AD_coords["uniprotID"] == "P35548"]
relev_AD_coords

Unnamed: 0,uniprotID,Start,End
23,P35548,234,267


In [99]:
# DNA-binding-domain (DBD) coordinates for the chosen TF (uniprotID P35548).
relev_DBD_coords = DBD_coords.loc[DBD_coords["uniprotID"] == "P35548"]
relev_DBD_coords

Unnamed: 0,uniprotID,Start,End
18,P35548,142,201


2. Using Ensembl to obtain the full TF/domain coordinates.

In [100]:
# Tab-separated BioMart export with one row per coding exon of the transcript.
# NOTE(review): this path is relative to "../data" while the coordinate CSVs
# are read from "../../data" - confirm both locations are intended.
biomart_output = pd.read_csv(
    "../data/P35548_mart_export.txt",
    sep="\t",
)
biomart_output

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Exon region start (bp),Exon region end (bp),CDS start,CDS end,Genomic coding start,Genomic coding end
0,ENSG00000120149,ENSG00000120149.9,ENST00000239243,ENST00000239243.7,174724582,174725038,1,379,174724660,174725038
1,ENSG00000120149,ENSG00000120149.9,ENST00000239243,ENST00000239243.7,174729159,174730896,380,804,174729159,174729583


In [101]:
# Ensembl CDS coordinates are 1-based and inclusive, so the CDS length in
# nucleotides is end - start + 1 (not end - start). Dividing by 3 gives the
# number of codons; this is one more than the residue count when the CDS
# includes the stop codon (presumably the case here: 268 codons vs. 267 residues).
(max(biomart_output["CDS end"]) - min(biomart_output["CDS start"]) + 1) / 3

268.0

In [102]:
# Re-display the protein-level coordinates to compare its "End" with the
# codon count derived from the CDS above.
relev_TF_coords

Unnamed: 0,uniprotID,Start,End
49,P35548,1,267


In [103]:
# Nucleotides needed to encode the 267 residues ("End" in relev_TF_coords),
# i.e. the coding length without any stop codon.
267 * 3

801

3. Comparing this output to the full TF cds_bed_format output

In [104]:
# BED file format:
# 0-based, half-open [start, end)
# ex.) "HelloWorld" from 1 to 5 is "ello"

# Ensembl file format:
# 1-based, closed [start, end]
# ex.) "HelloWorld" from 2 to 5 is "ello"

In [105]:
# File shows:

# 5	174724659	174725038	P35548	ENST00000239243	+
# 5	174729158	174729580	P35548	ENST00000239243	+

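# End coordinates: exon 1 matches; exon 2 ends 3 bp earlier in the BED file
# (174729580 vs. 174729583), consistent with the BED file leaving out the
# stop codon (379 + 422 = 801 = 267 * 3): good
# Start coordinates - 1 (0-based BED vs. 1-based Ensembl): good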

4. Translate one of the variants to its position in the sequence

In [106]:
# Checked a few with https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&DATA=CCDS4392
# Seemed to be positioned correctly

In [107]:
# Genomic coordinate of the first coding base (the exon-1 "Genomic coding
# start" value in the BioMart table, 1-based).
start = 174724660
from Bio.Seq import Seq

# Reference nucleotides beginning at `start` (70 nt, starting with the ATG).
# Presumably copied from the CCDS record checked above - TODO confirm source.
# The length is not a multiple of three, so the last base is a partial codon.
seq = "ATGGCTTCTCCGTCCAAAGGCAATGACTTGTTTTCGCCCGACGAGGAGGGCCCAGCAGTGGTGGCCGGAC"

def translate(seq):
    """Translate a nucleotide string into a one-letter protein string.

    Parameters
    ----------
    seq : str
        Nucleotide sequence, read in frame starting at its first base.

    Returns
    -------
    str
        The result of ``Seq.translate()`` converted to ``str``.

    Notes
    -----
    Trailing bases that do not complete a codon are dropped before the
    sequence is handed to Biopython. Biopython's translation warns about a
    "Partial codon" when the length is not a multiple of three and states
    that this may become an error in the future, so the trimming is done
    explicitly here.
    """
    in_frame_len = len(seq) - len(seq) % 3
    return str(Seq(seq[:in_frame_len]).translate())

In [108]:
# Translate the unmodified reference fragment as a baseline.
translate(seq)



'MASPSKGNDLFSPDEEGPAVVAG'

In [109]:
import difflib


def new_seq_diff(pos, new_nt):
    """Report the amino-acid differences caused by replacing one nucleotide.

    Uses the notebook-level ``start`` (genomic coordinate of ``seq[0]``),
    ``seq`` (reference nucleotides) and ``translate``.

    Parameters
    ----------
    pos : int
        Genomic coordinate of the base to replace. It is mapped to an index
        into ``seq`` as ``pos - start``, so it must use the same coordinate
        system as ``start``, and ``seq`` must run in ascending genomic order.
    new_nt : str
        Text that replaces the single base at ``pos``.

    Returns
    -------
    list of str
        The ``difflib.ndiff`` entries that are not plain matches. The mutated
        protein is passed as the first ndiff argument, so ``'- X'`` is a
        residue of the mutated protein and ``'+ X'`` is the corresponding
        residue of the reference protein. An empty list means no change.

    Raises
    ------
    ValueError
        If ``pos`` lies outside the region covered by ``seq``. Without this
        check a position before ``start`` yields a negative slice index and a
        position past the end appends ``new_nt``; both give a wrong diff
        without any error.
    """
    offset = pos - start
    if not 0 <= offset < len(seq):
        raise ValueError(
            "pos {} is outside the region covered by seq [{}, {}]".format(
                pos, start, start + len(seq) - 1
            )
        )

    mutated_nt = seq[:offset] + new_nt + seq[offset + 1:]
    mutated_protein = translate(mutated_nt)
    reference_protein = translate(seq)

    # Keep every ndiff entry except the "  X" lines that mark unchanged residues.
    return [
        entry
        for entry in difflib.ndiff(mutated_protein, reference_protein)
        if entry[0] != " "
    ]

In [110]:
# Put C at 174724661 (index 1 of seq): first codon ATG -> ACG.
# In the result, "-" is the residue in the mutated protein and "+" the
# residue in the reference protein.
new_seq_diff(174724661, "C")

['- T', '+ M']

In [111]:
# Put G at 174724686 (index 26 of seq): ninth codon GAC -> GAG.
# In the result, "-" is the residue in the mutated protein and "+" the
# residue in the reference protein.
new_seq_diff(174724686, "G")

['- E', '+ D']

In [112]:
# These match what's expected from the sequence