In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import subprocess
from scipy.stats import chisquare
import os

# import scipy.stats as stats

In [2]:
d_domains = "../outputs/mutations/domains_expanded_iWES_v2_variants_snv_classified/"
files = os.listdir(d_domains)
ENST_codes = [f.replace(".bed", "") for f in files]
ENST_codes

['ENST00000356073',
 'ENST00000348066',
 'ENST00000379044',
 'ENST00000246672',
 'ENST00000312233',
 'ENST00000317216',
 'ENST00000056233',
 'ENST00000335670',
 'ENST00000303329',
 'ENST00000373036',
 'ENST00000359486',
 'ENST00000389506',
 'ENST00000315869',
 'ENST00000377022',
 'ENST00000265340',
 'ENST00000377142',
 'ENST00000561208',
 'ENST00000339562',
 'ENST00000262238',
 'ENST00000428368',
 'ENST00000437473',
 'ENST00000373294',
 'ENST00000241001',
 'ENST00000372583',
 'ENST00000395324',
 'ENST00000403491',
 'ENST00000318003',
 'ENST00000250916',
 'ENST00000309446',
 'ENST00000355311',
 'ENST00000592199',
 'ENST00000355995',
 'ENST00000331340',
 'ENST00000434704',
 'ENST00000367265',
 'ENST00000398919',
 'ENST00000380013',
 'ENST00000348332',
 'ENST00000262518',
 'ENST00000264637',
 'ENST00000239243',
 'ENST00000342988',
 'ENST00000358127',
 'ENST00000282549',
 'ENST00000341099']

In [7]:
uniprotID_ENST_mapping = pd.read_csv("../../data/SFARI_TFs_with_ENST.csv")
uniprotID_ENST_mapping = uniprotID_ENST_mapping[["uniprotID", "ENST"]]
uniprotID_ENST_mapping["ENST"] = uniprotID_ENST_mapping["ENST"].str.split(".").str[0]
uniprotID_ENST_mapping_dict= dict(zip(uniprotID_ENST_mapping["ENST"], uniprotID_ENST_mapping["uniprotID"]))
uniprotID_ENST_mapping_dict['ENST00000434704'] = 'O60479'
uniprotID_ENST_mapping_dict

{'ENST00000349014': 'Q9H2P0',
 'ENST00000247087': 'Q5TGY3',
 'ENST00000334344': 'Q68CP9',
 'ENST00000303329': 'Q9HBZ2',
 'ENST00000379044': 'Q96QS3',
 'ENST00000374690': 'P10275',
 'ENST00000368346': 'Q9NR48',
 'ENST00000392783': 'Q9UIF8',
 'ENST00000642384': 'Q9H165',
 'ENST00000377022': 'Q86V15',
 'ENST00000318003': 'Q6P1N0',
 'ENST00000348066': 'O94983',
 'ENST00000361283': 'Q96JM3',
 'ENST00000575354': 'Q96RK0',
 'ENST00000264010': 'P49711',
 'ENST00000292535': 'P39880',
 'ENST00000292538': 'Q13948',
 'ENST00000261726': 'O14529',
 'ENST00000382409': 'O75398',
 'ENST00000234198': 'Q07687',
 'ENST00000355311': 'Q9H4W6',
 'ENST00000317216': 'Q06889',
 'ENST00000297375': 'P19622',
 'ENST00000341099': 'Q92731',
 'ENST00000283268': 'Q8TBJ5',
 'ENST00000313071': 'P55316',
 'ENST00000318789': 'Q9H334',
 'ENST00000350908': 'O15409',
 'ENST00000312233': 'Q8NBF1',
 'ENST00000398919': 'P11308',
 'ENST00000573035': 'P78347',
 'ENST00000012134': 'P31629',
 'ENST00000372583': 'Q5T1R4',
 'ENST0000

In [6]:
# Uses bedtools jaccard function to get nucleotide length of CDS
def return_cds_len(folder, code):
    subprocess.call("bedtools jaccard -a ../outputs/mutations/" + folder + "/sorted/" + code + ".bed \
    -b ../outputs/mutations/" + folder + "/sorted/" + code + ".bed \
    > ../outputs/mutations/" + folder + "/lengths/" + code + ".bed", shell = True) 
    cds_len = pd.read_csv("../outputs/mutations/" + folder + "/lengths/" + code + ".bed", sep = "\t")["intersection"][0]
    return cds_len

def sort_folder(folder):
    directory = "../outputs/mutations/" + folder 
    files = os.listdir(directory)
    i = 1
    for file in files:  
        if os.path.isdir(directory + "/" + file):
            # skip directories
            continue
        # print(i, file) # gene.bed
        name = file.split(".")[0] # gene
        # print("bedtools sort -i " + directory + "/" + file + " > ../outputs/mutations/cds_bed_format_sorted/" + name + ".bed")
        subprocess.call("bedtools sort -i " + directory + "/" + file + " > " + directory + "/sorted/" + name + ".bed", shell = True)
        i+=1

In [27]:
def generate_df(ENST_codes, uniprotID_ENST_mapping_dict, var_freq_threshold):
    tf_lengths, uniprotIDs, AD_lengths, cds_no_syns, AD_no_syns = [],[],[],[],[]
    DBD_lengths, DBD_no_syns = [], []
    lengths = []

    i = 1
    # Building a dataframe
    for ENST_code in ENST_codes:
        print(i, ENST_code)
        i += 1

        # Sort domains
        sort_folder("domains_bed_format")

        # Get uniprotID
        uniprotID = uniprotID_ENST_mapping_dict[ENST_code]
        uniprotIDs.append(uniprotID)

        # Get TF length
        tf_lengths.append(return_cds_len("cds_bed_format", ENST_code))

        # Get sum of domain lengths
        lengths.append(return_cds_len("domains_bed_format", uniprotID))
        
        # Count # rows that are non-syn and below variant threshold in CDS clinvar annotated
        cds_clinvar = pd.read_csv("../outputs/mutations/cds_expanded_iWES_v2_variants_snv_classified/" + ENST_code + ".bed", sep = "\t", header = None)
        cds_no_syn_count = cds_clinvar[(cds_clinvar[22] == "No-Syn") & (cds_clinvar[19] <= var_freq_threshold)]
        cds_no_syns.append(cds_no_syn_count)

        # Count # rows that are non-syn in AD clinvar annotated
        AD_clinvar = pd.read_csv("../outputs/mutations/domains_expanded_iWES_v2_variants_snv_classified/" + ENST_code + ".bed", sep = "\t", header = None)
        if sum(AD_clinvar[22].str.contains("No-Syn")) > 0:
            AD_no_syn_count = AD_clinvar[(AD_clinvar[22] == "No-Syn") & (AD_clinvar[19] <= var_freq_threshold)]
        else: 
            AD_no_syn_count = 0
        AD_no_syns.append(AD_no_syn_count)
        
        results = pd.DataFrame(data = {"uniprotID": uniprotIDs,
                               "TF_cds_length": tf_lengths,
                               "AD_cds_length" : lengths, 
              "TF_missense" : cds_no_syns, 
              "AD_missense" : AD_no_syns
                              })
        return results

In [28]:
generate_df(ENST_codes, ENST_uniprot_dict, uniprotID_ENST_mapping_dict, 1)

NameError: name 'ENST_uniprot_dict' is not defined