In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import subprocess

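P78337 is the UniProt ID that was initially suspected of causing the indexing error in the variant analysis notebook (the note at the end of this notebook concludes that the error actually comes from Q9UL68).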

In [2]:
# First, read the TF, AD and DBD coordinate tables from local CSV files.
# The first column of each file is used as the DataFrame index.
TF_coords = pd.read_csv("../../data/SFARI_TFs_with_knownADs_coords.csv", index_col=0)
AD_coords = pd.read_csv("../../data/SFARI_ADs_AA_coords_redone.csv", index_col=0)
DBD_coords = pd.read_csv("../../data/SFARI_TFs_with_known_ADs_DBD_coords.csv", index_col=0)

In [3]:
# Show the TF coordinate table ordered by ascending "End" value.
TF_coords.sort_values("End")

Unnamed: 0,uniprotID,Start,End
36,Q9BXK1,1,252
49,P35548,1,267
62,O95096,1,273
37,O75840,1,302
69,P78337,1,314
64,P32242,1,354
21,Q06889,1,387
65,Q02548,1,391
104,P25490,1,414
66,P26367,1,422


In [4]:
# Keep only the TF coordinate row(s) whose uniprotID is P78337, then display them.
relev_TF_coords = TF_coords.loc[TF_coords["uniprotID"] == "P78337"]
relev_TF_coords

Unnamed: 0,uniprotID,Start,End
69,P78337,1,314


In [5]:
# Keep only the AD coordinate row(s) whose uniprotID is P78337, then display them.
relev_AD_coords = AD_coords.loc[AD_coords["uniprotID"] == "P78337"]
relev_AD_coords

Unnamed: 0,uniprotID,Start,End
40,P78337,234,283


In [6]:
# Keep only the DBD coordinate row(s) whose uniprotID is P78337, then display them.
relev_DBD_coords = DBD_coords.loc[DBD_coords["uniprotID"] == "P78337"]
relev_DBD_coords

Unnamed: 0,uniprotID,Start,End
31,P78337,89,148


In [7]:
# Next, getting genomic coordinates from ENST
# Read the tab-separated BioMart export for P78337 and display it.
# NOTE(review): this file is read from "../data/", whereas the coordinate CSVs
# are read from "../../data/" — confirm that the differing base path is intended.
biomart_output = pd.read_csv("../data/P78337_mart_export.txt", sep="\t")
biomart_output

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Genomic coding start,Genomic coding end,CDS start,CDS end,cDNA coding start,cDNA coding end
0,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135033713.0,135033881.0,1.0,169.0,348.0,516.0
1,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135031276.0,135031508.0,170.0,402.0,517.0,749.0
2,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135028779.0,135029321.0,403.0,945.0,750.0,1292.0
3,ENSG00000069011,ENSG00000069011.16,ENST00000506438,ENST00000506438.5,135031276.0,135031508.0,170.0,402.0,304.0,536.0
4,ENSG00000069011,ENSG00000069011.16,ENST00000506438,ENST00000506438.5,,,,,,
5,ENSG00000069011,ENSG00000069011.16,ENST00000506438,ENST00000506438.5,135033713.0,135033881.0,1.0,169.0,135.0,303.0
6,ENSG00000069011,ENSG00000069011.16,ENST00000506438,ENST00000506438.5,135028779.0,135029321.0,403.0,945.0,537.0,1079.0


In [10]:
# Arbitrarily pick the first transcript listed in the export (ENST00000265340)
# and keep only its rows.
output_rows = biomart_output.loc[biomart_output["Transcript stable ID"] == "ENST00000265340"]
output_rows

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Genomic coding start,Genomic coding end,CDS start,CDS end,cDNA coding start,cDNA coding end
0,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135033713.0,135033881.0,1.0,169.0,348.0,516.0
1,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135031276.0,135031508.0,170.0,402.0,517.0,749.0
2,ENSG00000069011,ENSG00000069011.16,ENST00000265340,ENST00000265340.12,135028779.0,135029321.0,403.0,945.0,750.0,1292.0


In [11]:
# The bed file says

# 5	135033712	135033881	P78337	ENST00000265340	-
# 5	135031275	135031508	P78337	ENST00000265340	-
# 5	135028781	135029321	P78337	ENST00000265340	-

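# Negative strand, so it makes sense that the BED start of the last exon equals
# the last "Genomic coding start" + 3 - 1 (135028779 + 3 - 1 = 135028781):
# + 3 to drop the stop codon and - 1 for the BED file's start-coordinate convention.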

In [None]:
# Looked at a few variants and seems to work

In [12]:
len("ATGGAGGTGGACACCGAGGAGAAGCGGCATCGCACGCGGTCCAAAGGGGTTCGAGTTCCCGTGGAACCAGCCATACAAGAGCTGTTCAGCTGTCCCACCCCTGGCTGTGACGGCAGTGGTCATGTCAGTGGCAAATATGCAAGACACAGAAGTGTATATGGTTGTCCCTTGGCGAAAAAAAGAAAAACACAAGATAAACAGCCCCAGGAACCTGCTCCTAAACGAAAGCCATTTGCCGTGAAAGCAGACAGCTCCTCAGTGGATGAGTGTGACGACAGTGATGGGACTGAGGACATGGATGAGAAGGAGGAGGATGAGGGGGAGGAGTACTCCGAGGACAATGATGAGCCAGGGGATGAGGACGAGGAGGACGAGGAGGGGGACCGGGAGGAGGAGGAGGAGATCGAGGAGGAGGATGAGGACGATGACGAGGATGGAGAAGATGTGGAGGATGAAGAAGAGGAAGAGGAGGAGGAGGAGGAGGAGGAAGAGGAAGAAGAAAACGAAGACCATCAAATGAATTGTCACAATACTCGAATAATGCAAGACACAGAAAAGGATGATAACAATAATGACGAATATGACAATTACGATGAACTGGTGGCCAAGTCATTGTTAAACCTCGGCAAAATCGCTGAGGATGCAGCCTACCGGGCCAGGACTGAGTCAGAAATGAACAGCAATACCTCCAATAGTCTGGAAGACGATAGTGACAAAAACGAAAACCTGGGTCGGAAAAGTGAGTTGAGTTTAGACTTAGACAGTGATGTTGTTAGAGAAACAGTGGACTCCCTTAAACTATTAGCCCAAGGACACGGTGTTGTGCTCTCAGAAAACATGAATGACAGAAATTATGCAGACAGCATGTCGCAGCAAGACAGTAGAAATATGAATTACGTCATGTTGGGGAAGCCCATGAACAACGGACTCATGGAAAAGATGGTGGAGGAGAGCGATGAGGAGGTGTGTCTGAGCAGTCTGGAGTGTTTGAGGAATCAGTGCTTCGACCTGGCCAGGAAGCTCAGTGAGACCAACCCGCAGGAGAGGAATCCGCAGCAGAACATGAACATCCGTCAGCATGTCCGGCCAGAAGAGGACTTCCCCGGAAGGACGCCGGACAGAAACTACTCGGACATGCTGAACCTCATGCGGCTGGAGGAGCAGTTGAGCCCCCGGTCGAGAGTGTTTGCCAGCTGTGCGAAGGAGGATGGGTGTCATGAGCGGGACGACGATACCACCTCTGTGAACTCGGACAGGTCTGAAGAGGTGTTCGACATGACCAAGGGGAACCTGACCCTGCTGGAGAAAGCCATCGCTTTGGAAACGGAAAGAGCAAAGGCCATGAGGGAGAAGATGGCCATGGAAGCTGGGAGGAGGGACAATATGAGGTCATATGAGGACCAGTCTCCGAGACAACTTCCCGGGGAGGACAGAAAGCCTAAATCCAGTGACAGCCATGTCAAAAAGCCATACTATGATCCCTCAAGAACAGAAAAGAAAGAGAGCAAGTGTCCAACCCCCGGGTGTGATGGAACCGGCCACGTAACTGGGCTGTACCCACATCACCGCAGCCTGTCCGGATGCCCGCACAAAGATAGGGTCCCTCCAGAAATCCTTGCCATGCATGAAAGTGTCCTCAAGTGCCCCACTCCGGGCTGCACGGGGCGCGGGCATGTCAACAGCAACAGGAACTCCCACCGAAGCCTCTCCGGATGCCCGATCGCTGCAGCAGAGAAACTGGCCAAGGCACAGGAAAAGCACCAGAGCTGCGACGTGTCCAAGTCCAGCCAGGCCTCGGACCGCGTGCTCAGGCCAATGTGCTTTGTGAAGCAGCTGGAGATTCCTCAGTATGGCTACAGAAACAATGTCCCCACAACTACGCCGCGTTCCAACCTGGCCAAGGAGCTCGAGAAATATTCCAAGACCTCGTTTGAATACAACAGTTACGACAACCATACTTATGGCAAGCGAGCCATAGCTCCCAAGGTGCAAACC
AGGGATATATCCCCCAAAGGATATGATGATGCGAAGCGGTACTGCAAGGACCCCAGCCCCAGCAGCAGCAGCACCAGCAGCTACGCGCCCAGCAGCAGCAGCAACCTGAGCTGCGGCGGGGGCAGCAGCGCCAGCAGCACGTGCAGCAAGAGCAGCTTCGACTACACGCACGACATGGAGGCGGCCCACATGGCGGCCACCGCCATCCTCAACCTGTCCACGCGCTGCCGCGAGATGCCGCAGAACCTGAGCACCAAGCCGCAGGACCTGTGCGCCACGCGGAACCCTGACATGGAGGTGGATGAGAACGGGACCCTGGACCTCAGCATGAACAAGCAGAGGCCGCGGGACAGCTGCTGCCCCATCCTGACCCCTCTGGAGCCCATGTCCCCCCAGCAGCAGGCAGTGATGAACAACCGGTGTTTCCAGCTGGGCGAGGGCGACTGCTGGGACTTGCCCGTAGACTACACCAAAATGAAACCCCGGAGGATAGACGAGGACGAGTCCAAAGACATTACTCCAGAAGACTTGGACCCATTCCAGGAGGCTCTAGAAGAAAGACGGTATCCCGGGGAGGTGACCATCCCAAGTCCCAAACCCAAGTACCCTCAGTGCAAGGAGAGCAAAAAGGACTTAATAACTCTGTCTGGCTGCCCCCTGGCGGACAAAAGCATTCGAAGTATGCTGGCCACCAGCTCCCAAGAACTCAAGTGCCCCACGCCTGGCTGTGATGGTTCTGGACATATCACCGGCAATTATGCTTCTCATCGGAGCCTTTCAGGTTGCCCAAGAGCAAAGAAAAGTGGTATCAGGATAGCACAGAGCAAAGAAGATAAAGAAGATCAAGAACCCATCAGGTGTCCGGTCCCCGGGTGCGACGGCCAGGGCCACATCACTGGGAAGTACGCGTCCCATCGCAGCGCCTCCGGGTGCCCCTTGGCGGCCAAGAGGCAGAAAGACGGGTACCTGAATGGCTCCCAGTTCTCCTGGAAGTCGGTCAAGACGGAAGGCATGTCCTGCCCCACGCCAGGATGCGACGGCTCAGGCCACGTCAGCGGCAGCTTCCTCACACACCGCAGCTTGTCAGGATGCCCGAGAGCCACGTCAGCGATGAAGAAGGCAAAGCTTTCTGGAGAGCAGATGCTGACCATCAAACAGCGGGCCAGCAACGGTATAGAAAATGATGAAGAAATCAAACAGTTAGATGAAGAAATCAAGGAGCTAAATGAATCCAATTCCCAGATGGAAGCCGATATGATTAAACTCAGAACTCAGATTACCACGATGGAGAGCAACCTGAAGACCATCGAAGAGGAGAACAAAGTGATTGAGCAGCAGAACGAGTCTCTCCTCCACGAGCTGGCGAACCTGAGCCAGTCTCTGATCCACAGCCTGGCTAACATCCAGCTGCCGCACATGGATCCAATCAATGAACAAAATTTTGATGCTTACGTGACTACTTTGACGGAAATGTATACAAATCAAGATCGTTATCAGAGTCCAGAAAATAAAGCCCTACTGGAAAATATAAAGCAGGCTGTGAGAGGAATTCAGGTCTGA")

3555

In [1]:
from Bio.Seq import Seq

seq = "ATGGAGGTGGACACCGAGGAGAAGCGGCATCGCACGCGGTCCAAAGGGGTTCGAGTTCCCGTGGAACCAGCCATACAAGAGCTGTTCAGCTGTCCCACCCCTGGCTGTGACGGCAGTGGTCATGTCAGTGGCAAATATGCAAGACACAGAAGTGTATATGGTTGTCCCTTGGCGAAAAAAAGAAAAACACAAGATAAACAGCCCCAGGAACCTGCTCCTAAACGAAAGCCATTTGCCGTGAAAGCAGACAGCTCCTCAGTGGATGAGTGTGACGACAGTGATGGGACTGAGGACATGGATGAGAAGGAGGAGGATGAGGGGGAGGAGTACTCCGAGGACAATGATGAGCCAGGGGATGAGGACGAGGAGGACGAGGAGGGGGACCGGGAGGAGGAGGAGGAGATCGAGGAGGAGGATGAGGACGATGACGAGGATGGAGAAGATGTGGAGGATGAAGAAGAGGAAGAGGAGGAGGAGGAGGAGGAGGAAGAGGAAGAAGAAAACGAAGACCATCAAATGAATTGTCACAATACTCGAATAATGCAAGACACAGAAAAGGATGATAACAATAATGACGAATATGACAATTACGATGAACTGGTGGCCAAGTCATTGTTAAACCTCGGCAAAATCGCTGAGGATGCAGCCTACCGGGCCAGGACTGAGTCAGAAATGAACAGCAATACCTCCAATAGTCTGGAAGACGATAGTGACAAAAACGAAAACCTGGGTCGGAAAAGTGAGTTGAGTTTAGACTTAGACAGTGATGTTGTTAGAGAAACAGTGGACTCCCTTAAACTATTAGCCCAAGGACACGGTGTTGTGCTCTCAGAAAACATGAATGACAGAAATTATGCAGACAGCATGTCGCAGCAAGACAGTAGAAATATGAATTACGTCATGTTGGGGAAGCCCATGAACAACGGACTCATGGAAAAGATGGTGGAGGAGAGCGATGAGGAGGTGTGTCTGAGCAGTCTGGAGTGTTTGAGGAATCAGTGCTTCGACCTGGCCAGGAAGCTCAGTGAGACCAACCCGCAGGAGAGGAATCCGCAGCAGAACATGAACATCCGTCAGCATGTCCGGCCAGAAGAGGACTTCCCCGGAAGGACGCCGGACAGAAACTACTCGGACATGCTGAACCTCATGCGGCTGGAGGAGCAGTTGAGCCCCCGGTCGAGAGTGTTTGCCAGCTGTGCGAAGGAGGATGGGTGTCATGAGCGGGACGACGATACCACCTCTGTGAACTCGGACAGGTCTGAAGAGGTGTTCGACATGACCAAGGGGAACCTGACCCTGCTGGAGAAAGCCATCGCTTTGGAAACGGAAAGAGCAAAGGCCATGAGGGAGAAGATGGCCATGGAAGCTGGGAGGAGGGACAATATGAGGTCATATGAGGACCAGTCTCCGAGACAACTTCCCGGGGAGGACAGAAAGCCTAAATCCAGTGACAGCCATGTCAAAAAGCCATACTATGATCCCTCAAGAACAGAAAAGAAAGAGAGCAAGTGTCCAACCCCCGGGTGTGATGGAACCGGCCACGTAACTGGGCTGTACCCACATCACCGCAGCCTGTCCGGATGCCCGCACAAAGATAGGGTCCCTCCAGAAATCCTTGCCATGCATGAAAGTGTCCTCAAGTGCCCCACTCCGGGCTGCACGGGGCGCGGGCATGTCAACAGCAACAGGAACTCCCACCGAAGCCTCTCCGGATGCCCGATCGCTGCAGCAGAGAAACTGGCCAAGGCACAGGAAAAGCACCAGAGCTGCGACGTGTCCAAGTCCAGCCAGGCCTCGGACCGCGTGCTCAGGCCAATGTGCTTTGTGAAGCAGCTGGAGATTCCTCAGTATGGCTACAGAAACAATGTCCCCACAACTACGCCGCGTTCCAACCTGGCCAAGGAGCTCGAGAAATATTCCAAGACCTCGTTTGAATACAACAGTTACGACAACCATACTTATGGCAAGCGAGCCATAGCTCCCAAGGTGCAAA
CCAGGGATATATCCCCCAAAGGATATGATGATGCGAAGCGGTACTGCAAGGACCCCAGCCCCAGCAGCAGCAGCACCAGCAGCTACGCGCCCAGCAGCAGCAGCAACCTGAGCTGCGGCGGGGGCAGCAGCGCCAGCAGCACGTGCAGCAAGAGCAGCTTCGACTACACGCACGACATGGAGGCGGCCCACATGGCGGCCACCGCCATCCTCAACCTGTCCACGCGCTGCCGCGAGATGCCGCAGAACCTGAGCACCAAGCCGCAGGACCTGTGCGCCACGCGGAACCCTGACATGGAGGTGGATGAGAACGGGACCCTGGACCTCAGCATGAACAAGCAGAGGCCGCGGGACAGCTGCTGCCCCATCCTGACCCCTCTGGAGCCCATGTCCCCCCAGCAGCAGGCAGTGATGAACAACCGGTGTTTCCAGCTGGGCGAGGGCGACTGCTGGGACTTGCCCGTAGACTACACCAAAATGAAACCCCGGAGGATAGACGAGGACGAGTCCAAAGACATTACTCCAGAAGACTTGGACCCATTCCAGGAGGCTCTAGAAGAAAGACGGTATCCCGGGGAGGTGACCATCCCAAGTCCCAAACCCAAGTACCCTCAGTGCAAGGAGAGCAAAAAGGACTTAATAACTCTGTCTGGCTGCCCCCTGGCGGACAAAAGCATTCGAAGTATGCTGGCCACCAGCTCCCAAGAACTCAAGTGCCCCACGCCTGGCTGTGATGGTTCTGGACATATCACCGGCAATTATGCTTCTCATCGGAGCCTTTCAGGTTGCCCAAGAGCAAAGAAAAGTGGTATCAGGATAGCACAGAGCAAAGAAGATAAAGAAGATCAAGAACCCATCAGGTGTCCGGTCCCCGGGTGCGACGGCCAGGGCCACATCACTGGGAAGTACGCGTCCCATCGCAGCGCCTCCGGGTGCCCCTTGGCGGCCAAGAGGCAGAAAGACGGGTACCTGAATGGCTCCCAGTTCTCCTGGAAGTCGGTCAAGACGGAAGGCATGTCCTGCCCCACGCCAGGATGCGACGGCTCAGGCCACGTCAGCGGCAGCTTCCTCACACACCGCAGCTTGTCAGGATGCCCGAGAGCCACGTCAGCGATGAAGAAGGCAAAGCTTTCTGGAGAGCAGATGCTGACCATCAAACAGCGGGCCAGCAACGGTATAGAAAATGATGAAGAAATCAAACAGTTAGATGAAGAAATCAAGGAGCTAAATGAATCCAATTCCCAGATGGAAGCCGATATGATTAAACTCAGAACTCAGATTACCACGATGGAGAGCAACCTGAAGACCATCGAAGAGGAGAACAAAGTGATTGAGCAGCAGAACGAGTCTCTCCTCCACGAGCTGGCGAACCTGAGCCAGTCTCTGATCCACAGCCTGGCTAACATCCAGCTGCCGCACATGGATCCAATCAATGAACAAAATTTTGATGCTTACGTGACTACTTTGACGGAAATGTATACAAATCAAGATCGTTATCAGAGTCCAGAAAATAAAGCCCTACTGGAAAATATAAAGCAGGCTGTGAGAGGAATTCAGGTCTGA"

In [2]:
# Wrap the `seq` string in a Biopython Seq object and display its reverse complement.
Seq(seq).reverse_complement()

Seq('TCAGACCTGAATTCCTCTCACAGCCTGCTTTATATTTTCCAGTAGGGCTTTATT...CAT')

# Actually, the error comes from Q9UL68, not P78337 - I was looking at the wrong printed row of the output