In [1]:
#importing libraries
import pandas as pd
import matplotlib
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import statsmodels
from statsmodels.stats import multitest
import statistics
from math import sqrt
from statsmodels.stats.power import TTestIndPower
from SigProfilerExtractor import sigpro as sig
import os.path
import glob
import fnmatch
import os
import shutil
from SigProfilerMatrixGenerator.scripts import CNVMatrixGenerator as scna
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

#display plots in notebook
%matplotlib inline

#default figure settings
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["axes.spines.right"] = False
plt.rcParams["axes.spines.top"] = False
plt.rcParams["figure.autolayout"] = True

def listdir(dirname, pattern="*"):
    """Return the names of the entries in `dirname` that match `pattern`.

    `pattern` is a shell-style wildcard as understood by the `fnmatch`
    module; the default "*" keeps every entry. Only bare entry names are
    returned, not full paths.
    """
    entries = os.listdir(dirname)
    return [entry for entry in entries if fnmatch.fnmatch(entry, pattern)]

#run this only once at the start
#from SigProfilerMatrixGenerator import install as genInstall
#genInstall.install('GRCh37')

In [59]:
#load the STAR-Fusion predictions of every RNA sample into one table indexed by fusion name
#a fusion is kept only when its JunctionReadCount is above this value
MIN_JUNCTION_READS = 10

starfus_final_dict = {}

for sample_name in listdir("/temp_data/RNA_data", "TL*_T_RSQ1"):
    predictions_path = "/temp_data/RNA_data/{}/STARFUS_{}/star-fusion.fusion_predictions.abridged.tsv".format(sample_name, sample_name)
    sample_fusions = pd.read_csv(predictions_path, sep="\t", index_col=0)[["JunctionReadCount", "LeftBreakpoint", "RightBreakpoint"]]
    #assign() returns a new frame, so the column slice above is never modified in place
    starfus_final_dict[sample_name] = sample_fusions.assign(Sample=sample_name)

#pd.concat fails with an unhelpful "No objects to concatenate" on an empty dict; say what is actually missing
if not starfus_final_dict:
    raise ValueError("no sample directories matching 'TL*_T_RSQ1' found in /temp_data/RNA_data")

#the first column of every file is already used as the index, so no per-sample concat keys are needed
starfus_final_df = pd.concat(starfus_final_dict.values())
#only the read count is numeric; convert it explicitly (errors="ignore" is deprecated since pandas 2.2)
starfus_final_df["JunctionReadCount"] = pd.to_numeric(starfus_final_df["JunctionReadCount"])
#removing anything with low reads
starfus_final_df = starfus_final_df.loc[starfus_final_df["JunctionReadCount"] > MIN_JUNCTION_READS]
starfus_final_df.index.name = "Fusion"


In [69]:
#list of the fusions that were found in more than one sample
#counting distinct samples per fusion name does not depend on whether .loc[name]
#returns a Series (one row) or a DataFrame (several rows), and yields the fusions
#sorted by name instead of in arbitrary set order
samples_per_fusion = starfus_final_df.groupby(level=0)["Sample"].nunique()
final_list = samples_per_fusion[samples_per_fusion > 1].index.tolist()

#keep every row of the recurrent fusions, grouped by fusion name
starfus_final_df = starfus_final_df.loc[final_list]
starfus_final_df

Unnamed: 0_level_0,JunctionReadCount,LeftBreakpoint,RightBreakpoint,Sample
Fusion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E2F4--RPL14,13,chr16:67229785:+,chr3:40503542:+,TL-22-86QRKCES_T_RSQ1
E2F4--RPL14,15,chr16:67229785:+,chr3:40503542:+,TL-22-354IF4A9_T_RSQ1
KDM5A--WNK1,24,chr12:438001:-,chr12:971249:+,TL-22-PKA8ZUD2_T_RSQ1
KDM5A--WNK1,30,chr12:438001:-,chr12:971249:+,TL-22-SA4HH23W_T_RSQ1
CTD-2206G10.1--TPT1P1,22,chr5:31908806:+,chr21:33213004:+,TL-22-PKA8ZUD2_T_RSQ1
CTD-2206G10.1--TPT1P1,18,chr5:31908806:+,chr21:33213004:+,TL-22-SA4HH23W_T_RSQ1
RPA3-AS1--GLCCI1,12,chr7:7841374:+,chr7:8043538:+,TL-22-DHERTUS6_T_RSQ1
RPA3-AS1--GLCCI1,12,chr7:7841374:+,chr7:8043538:+,TL-22-JICYR8PP_T_RSQ1
FGFR3--AC016773.1,2349,chr4:1808661:+,chr4:1741429:+,TL-22-BK3DDUYG_T_RSQ1
FGFR3--AC016773.1,634,chr4:1808661:+,chr4:1741429:+,TL-22-TMY85WNT_T_RSQ1


#for every tumour sample: build its mutation matrices, then run the signature extractor
#on the SBS96 matrix and on the ID83 matrix, each restricted to exactly one signature
#NOTE(review): sample names are listed from DNA_Pipeline_outputs but every path below points
#into Data_Analysis - confirm both trees contain the same sample folders
for sample_name in listdir("/media/rin/UTUCproject/DNA_Pipeline_outputs", "*TL*_T*"):
    print(sample_name)
    #the same folder is the input of the matrix generator and the output folder of the extractor
    sample_dir = "/media/rin/UTUCproject/Data_Analysis/{}".format(sample_name)
    matGen.SigProfilerMatrixGeneratorFunc(sample_name, 'GRCh37', sample_dir)
    matrix_paths = (
        "/media/rin/UTUCproject/Data_Analysis/{}/output/SBS/{}.SBS96.all".format(sample_name, sample_name),
        "/media/rin/UTUCproject/Data_Analysis/{}/output/ID/{}.ID83.all".format(sample_name, sample_name),
    )
    #SBS96 first, ID83 second
    for matrix_path in matrix_paths:
        sig.sigProfilerExtractor("matrix", sample_dir, matrix_path, reference_genome="GRCh37", minimum_signatures=1, maximum_signatures=1)
    

#build one table (rows = samples, columns = ID signatures) from the first row of the
#" Global NMF Signatures" column of every sample's De_Novo_map_to_COSMIC_ID83.csv
pt_names = listdir("/media/rin/UTUCproject/DNA_Pipeline_outputs", "*TL*_T*")
keep_list = []
GATK_final_dict_ID = {}

for pt_name in pt_names:
    #ID
    decomposition_path = "/media/rin/UTUCproject/Data_Analysis/{}/ID83/Suggested_Solution/COSMIC_ID83_Decomposed_Solution/De_Novo_map_to_COSMIC_ID83.csv".format(pt_name)
    #only this one text cell is used, so no numeric conversion of the file is needed
    #(the former apply(pd.to_numeric, errors="ignore") is deprecated since pandas 2.2)
    decomposition = pd.read_csv(decomposition_path, sep=",").loc[0, " Global NMF Signatures"]

    #presumably shaped like "Signature ID1 (12.3%) & Signature ID2 (87.7%)" - TODO confirm against the file
    #text before each "(" carries a signature name, text between "(" and ")" its percentage
    chunks = decomposition.split("(")
    first_percent_list = [chunk.split(")")[0] for chunk in chunks[1:]]
    first_ID_list = [chunks[0]] + [chunk.split(")")[1] for chunk in chunks[1:]]
    #drop the empty string left over after the last ")"
    first_ID_list = [piece for piece in first_ID_list if piece]
    #the name is the second-to-last space-separated token, i.e. it relies on a space before each "("
    final_ID_list = [piece.split(" ")[-2] for piece in first_ID_list]
    print()
    print("final_ID_list:", final_ID_list)
    final_percent_list = [percent.split("%")[0] for percent in first_percent_list]
    print("final_percent_list:", final_percent_list)

    #samples whose parsed names contain the literal "Signature" are skipped
    #NOTE(review): presumably these are entries that do not follow the layout above - confirm
    if "Signature" not in final_ID_list:
        keep_list.append(pt_name)
        #one row per sample, labelled with the sample name
        GATK_final_dict_ID[pt_name] = pd.DataFrame([final_percent_list], columns=final_ID_list, index=[pt_name])

#pd.concat fails with an unhelpful "No objects to concatenate" on an empty dict; say what is actually missing
if not GATK_final_dict_ID:
    raise ValueError("no sample produced a usable ID83 decomposition")

#rows are already labelled with the sample names in keep_list order, so no concat keys or
#index overwrite are needed; percentages arrive as strings, unparseable ones become NaN
GATK_final_df_ID = pd.concat(GATK_final_dict_ID.values()).apply(pd.to_numeric, errors = "coerce")