In [20]:
import pandas as pd
import sys
import os
import json
def get_true_filename(filename):
    """Return ``filename`` joined onto the directory the data files are read from.

    If ``sys`` exposes a ``_MEIPASS`` attribute, that directory is used as the
    base; otherwise the absolute path of the current working directory is used.
    """
    if hasattr(sys, "_MEIPASS"):
        base_dir = sys._MEIPASS
    else:
        base_dir = os.path.abspath(".")
    return os.path.join(base_dir, filename)

class gene_count_table:
    """Gene-count and alternative-splicing lookups backed by bundled data files.

    Attributes set by ``__init__``:

    * ``count_table`` - ``count_table.csv`` indexed by its ``Geneid`` column, with
      every column named in ``default_list`` rescaled to counts per million.
    * ``alternative_splicing_table`` - ``all_samples_psi.csv`` as read by pandas.
    * ``sample_metadata`` - ``sample_metadata.json``; the methods below index it
      as ``sample_metadata[sample]['brainregion']`` / ``['type']``.
    * ``default_list`` - sample column names used when no selection is given.
    * ``ensembl_to_gene_symbol`` / ``gene_symbol_to_ensembl`` - the mapping from
      ``ensembl_to_gene_symbol.json`` and its inverse.
    """

    def __init__(self) -> None:
        """Load all data files and normalise the count columns to CPM."""
        self.count_table = pd.read_csv(get_true_filename('count_table.csv'))
        self.count_table.index = self.count_table['Geneid']
        self.alternative_splicing_table = pd.read_csv(get_true_filename('all_samples_psi.csv'))
        with open(get_true_filename('sample_metadata.json'), 'r') as f:
            self.sample_metadata = json.load(f)
        self.default_list = ['CamK_1',
            'CamK_2', 'CamK_3', 'CamK_4', 'CamK_W2', 'CamK_W3', 'CamK_W4',
            'CamK_W5', 'Grik_W3', 'Grik_W4', 'Grik_W5', 'Grik_W6', 'PV_1', 'PV_2',
            'PV_3', 'PV_4', 'Scnn_1', 'Scnn_2', 'Scnn_3', 'Scnn_4', 'SST_1',
            'SST_2', 'SST_3', 'SST_4', 'SST_W2', 'SST_W3', 'SST_W5', 'SST_W1',
            'VIP_W1', 'VIP_W2', 'VIP_W3', 'VIP_W4']
        # Counts per million: each sample column is divided by its own total
        # and scaled by 1e6.
        for col in self.default_list:
            self.count_table[col] = (self.count_table[col] / self.count_table[col].sum()) * 1_000_000
        with open(get_true_filename('ensembl_to_gene_symbol.json'), 'r') as f:
            self.ensembl_to_gene_symbol = json.load(f)
        # Inverse lookup; if two IDs share a symbol, the later entry wins.
        self.gene_symbol_to_ensembl = {v: k for k, v in self.ensembl_to_gene_symbol.items()}

    def return_df(self, gene_list: list, columns_selected: list = None) -> pd.DataFrame:
        """Return one row per (gene, selected sample) with the CPM value.

        Args:
            gene_list: Values to look up in the ``Geneid`` index of ``count_table``.
            columns_selected: Sample column names to include. ``None`` or an
                empty sequence selects ``self.default_list``.

        Returns:
            DataFrame with columns ``Sample``, ``ENSEMBL Gene ID``,
            ``Gene Symbol``, ``Brainregion``, ``Celltype`` and
            ``gene count (counts per million)``. Rows follow ``gene_list``
            order, then the column order of ``count_table``.

        Raises:
            ValueError: If a gene is not present in the ``count_table`` index.
            KeyError: If a selected column has no entry in ``sample_metadata``.
        """
        # None replaces the former mutable ``[]`` default; an explicitly passed
        # empty list keeps its old meaning of "use the defaults".
        if columns_selected is None or len(columns_selected) == 0:
            columns_selected = self.default_list
        selected = set(columns_selected)

        rows = []
        for gene in gene_list:
            if gene not in self.count_table.index:
                raise ValueError('Gene not found')
            gene_row = self.count_table.loc[gene]
            gene_symbol = gene_row.loc['gene symbol']
            for sample, value in gene_row.items():
                if sample not in selected:
                    continue
                meta = self.sample_metadata[sample]
                rows.append([sample, gene, gene_symbol, meta['brainregion'], meta['type'], value])
        return pd.DataFrame(rows, columns=['Sample','ENSEMBL Gene ID', 'Gene Symbol','Brainregion','Celltype','gene count (counts per million)'])

    def return_alternative_splicing_df(self, parent, gene_list, columns_selected) -> pd.DataFrame:
        """Return one row per (gene, splice event, selected sample) with its PSI value.

        Args:
            parent: Object whose ``no_splice_events()`` method is called once for
                each gene that has no rows in ``alternative_splicing_table``.
                May be ``None``, in which case such genes are skipped silently.
            gene_list: Values matched against the ``Ensembl Gene ID`` column.
            columns_selected: Sample column names to include. Names that are not
                columns of ``alternative_splicing_table`` contribute no rows.

        Returns:
            DataFrame with columns ``Sample``, ``ENSEMBL Gene ID``,
            ``Gene Symbol``, ``Brainregion``, ``Celltype``, ``splice event type``,
            ``chr``, ``event position`` and ``psi``.

        Raises:
            KeyError: If a gene with splice events is missing from
                ``ensembl_to_gene_symbol``, or a selected column is missing from
                ``sample_metadata``.
        """
        selected = set(columns_selected)
        splicing = self.alternative_splicing_table

        rows = []
        for gene in gene_list:
            gene_events = splicing[splicing['Ensembl Gene ID'] == gene]
            if len(gene_events) < 1:
                # Callers without a parent object pass None, so only notify
                # when there is something to notify.
                if parent is not None:
                    parent.no_splice_events()
                continue
            # The symbol is the same for every event of this gene; look it up once.
            gene_symbol = self.ensembl_to_gene_symbol[gene]
            for _, splice_event in gene_events.iterrows():
                event_type = splice_event['splicing_event']
                chromosome = splice_event['chr']
                bp_position = splice_event['bp_position']
                for sample, psi in splice_event.items():
                    if sample not in selected:
                        continue
                    meta = self.sample_metadata[sample]
                    rows.append([sample, gene, gene_symbol, meta['brainregion'], meta['type'], event_type, chromosome, bp_position, psi])
        return pd.DataFrame(rows, columns=['Sample','ENSEMBL Gene ID', 'Gene Symbol','Brainregion','Celltype','splice event type', 'chr', 'event position','psi'])

In [21]:
# Build the lookup object; its constructor reads the CSV/JSON data files.
x = gene_count_table()
# Take the sample list from the instance instead of keeping a second
# hard-coded copy that could drift from the class; list() gives an
# independent copy so later edits here do not touch x.default_list.
default_list = list(x.default_list)

In [24]:
# Distinct splice-event types reported for two genes across the default samples.
# `parent` is passed as None; the method only uses it to report genes that
# have no splice events.
x.return_alternative_splicing_df(None,['ENSMUSG00000061601','ENSMUSG00000040118'],default_list)['splice event type'].unique()

array(['A3', 'A5', 'RI', 'SE'], dtype=object)

In [4]:
# List the columns of the splicing (PSI) table.
# NOTE(review): the recorded output shows a 'SST_WT1' column, while default_list
# contains 'SST_W1'. return_alternative_splicing_df keeps only columns named in
# columns_selected, so that sample yields no rows when default_list is passed.
# Confirm which spelling is correct.
x.alternative_splicing_table.columns

Index(['Ensembl Gene ID', 'splicing_event', 'chr', 'bp_position', 'strand',
       'CamK_1', 'CamK_2', 'CamK_3', 'CamK_4', 'CamK_W2', 'CamK_W3', 'CamK_W4',
       'CamK_W5', 'Grik_W3', 'Grik_W4', 'Grik_W5', 'Grik_W6', 'PV_1', 'PV_2',
       'PV_3', 'PV_4', 'SST_1', 'SST_2', 'SST_3', 'SST_4', 'SST_W2', 'SST_W3',
       'SST_W5', 'SST_WT1', 'Scnn_1', 'Scnn_2', 'Scnn_3', 'Scnn_4', 'VIP_W1',
       'VIP_W2', 'VIP_W3', 'VIP_W4'],
      dtype='object')

In [7]:
# Show every row of the splicing table for one gene (boolean mask on 'Ensembl Gene ID').
x.alternative_splicing_table[x.alternative_splicing_table['Ensembl Gene ID'] == 'ENSMUSG00000061601']

Unnamed: 0,Ensembl Gene ID,splicing_event,chr,bp_position,strand,CamK_1,CamK_2,CamK_3,CamK_4,CamK_W2,...,SST_W5,SST_WT1,Scnn_1,Scnn_2,Scnn_3,Scnn_4,VIP_W1,VIP_W2,VIP_W3,VIP_W4
63815,ENSMUSG00000061601,A3,5,14764612-14768832:14764612-14768871,+,0.584825,0.639167,0.667805,0.708868,0.487203,...,0.559385,0.545243,0.563493,0.610302,0.654478,0.643791,0.672318,0.682307,0.751471,0.622645
63816,ENSMUSG00000061601,A5,5,14732401-14762431:14731924-14762431,+,0.415175,0.360833,0.332195,0.291132,0.512797,...,0.440615,0.454757,0.436507,0.389698,0.345522,0.356209,0.327682,0.317693,0.248529,0.377355
63817,ENSMUSG00000061601,RI,5,14724969:14730027-14730384:14732401,+,0.000683,0.0,0.0,0.001211,0.006322,...,0.000799,0.004425,0.008292,0.0,0.005704,0.0,0.007399,0.011693,0.012264,0.004687
63818,ENSMUSG00000061601,SE,5,14825458-14828553:14828579-14838412,+,0.999717,1.0,1.0,0.999647,0.996758,...,0.999648,0.997988,0.99638,1.0,0.998029,1.0,0.997575,0.996285,0.996952,0.998231
