## RNA seq data class

### REQUIREMENTS

- Install `mysql.connector` (PyPI package `mysql-connector-python`), https://pypi.org/project/mysql-connector-python/
- Install sqlalchemy, https://pypi.org/project/SQLAlchemy/1.3.5/ 

### TO-DO LIST

- function to return all data, function to return the specified normalized data
    - a lot of the 'unknown' normalizations are RAW
- function to check for duplicate columns...or check uniqueness of each column
    - some GSEs seem to be entered twice or more...
        - e.g. 'naive_t_cells'

In [1]:
import re
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import mysql.connector

In [38]:
class RNAseq_data:
    """Read per-cell-type RNA-seq expression tables from the ``test_dream`` MySQL database.

    Every table is read with ``pandas.read_sql_table`` through the class-level
    ``engine``. Tables are expected to carry an ``index`` column (dropped on
    read), a ``gene_symbol_sql`` column, and sample columns whose names contain
    ``norm_<NORMALIZATION>``.

    Two scopes are supported:

    * ``'fine'``   -- every entry of ``ct`` is used directly as a table name.
    * ``'coarse'`` -- every entry of ``ct`` is looked up in
      ``CellType.pkl['Coarse']['Main']``, which maps it to the table names that
      are inner-joined on ``gene_symbol_sql``.
    """

    # Modules are bound as class attributes, so methods reach them through ``self``.
    pd = __import__('pandas')
    np = __import__('numpy')
    re = __import__('re')
    pk = __import__('pickle')
    import sqlalchemy as sqal
    import mysql.connector

    # NOTE(review): the user name and password are embedded in this URL and the
    # engine is created when the class body executes; consider loading the URL
    # from configuration instead of committing it to the notebook.
    engine = sqal.create_engine('mysql+mysqlconnector://Simon:Bane@localhost/test_dream')

    def __init__(self, ct=None, norm='FPKM',  scope='fine'):
        """Validate the arguments and store the selection.

        Parameters
        ----------
        ct : list
            Cell types to load (table names for 'fine', coarse group names for
            'coarse').
        norm : str
            Normalization to keep in ``normData``; one of 'FPKM', 'RPKM', 'TPM',
            'RAW', 'unknown'.
        scope : str
            'fine' (default) or 'coarse'.

        Invalid arguments are reported with ``print`` and construction returns
        early; no exception is raised. Attributes not yet validated keep the
        empty/default values assigned below, so the data methods of a rejected
        instance return empty results instead of raising ``AttributeError``.
        """
        cell_types = ['fibroblast', 'Naive CD4 T cells', 'PBMC', 'endothelial',
       'Monocytes', 'Macrophage', 'B cells', 'GC B cells', 'CD4 T cells',
       'T cells', 'Granulocytes', 'Memory CD4 T cells', 'NK cells',
       'Basophils', 'Central Memory CD8 T cells', 'Effector CD4 T cells',
       'Effector Memory CD8 T cells', 'Follicular helper T cells',
       'Memory B cells', 'Myeloid Dendritic cells', 'Naive B cells',
       'Naive CD8 T cells', 'Neutrophils', 'gamma delta T cells',
       'Th1 cells', 'Th17 cells', 'Th2 cells', 'Tregs', 'Plasmablast',
       'CD8 T cells', 'Plasmacytoid Dendritic Cells', 'Dendritic cells',
       'Activated T cells', 'White blood cells', 'Eosinophils',
       'Naive T cells', 'Central Memory', 'Effector Memory',
       'Central Memory T cells', 'Memory CD8 T cells', 'Plasma cells',
       'Memory T cells', 'NKT cells', 'Central Memory CD4 T cells',
       'Effector Memory T cells', 'Activated B cells',
       'Naive T effector cells']
        # Display names -> table-style names ('Naive B cells' -> 'naive_b_cells').
        cell_types = [name.lower().replace(' ', '_') for name in cell_types]
        norm_types = ['FPKM','RPKM','TPM','RAW','unknown']

        # Safe defaults so that an early return below never leaves the instance
        # without the attributes the data methods read.
        self.ct = []
        self.norm = 'FPKM'
        self.scope = 'fine'
        self.ctDict = {}

        if ct is None or norm is None:
            print('Instantiate class with: RNAseq_data(CELL TYPE, NORMALIZATION, SCOPE) \n \n'
                  'where CELL TYPE is a list of cell types. Must be one or more of:'+str(cell_types)+'\n\n'
                  'where NORMALIZATION is how the counts are normalized. Must be one of: '+str(norm_types)+'\n\n'
                  'where SCOPE is the cell type specificity. Must be either \'fine\'(default) or \'coarse\'')
            return

        if not isinstance(ct, list):
            print('Usage Error: cell type must be a list with one or more of: '+str(cell_types))
            return
        self.ct = ct

        if norm not in norm_types:
            print('Usage Error: normalization method must be one of: '+str(norm_types))
            return
        self.norm = norm

        if scope not in ('coarse', 'fine'):
            print('Usage Error: scope must be of type \'coarse\' or \'fine\' (default)')
            return
        self.scope = scope

        if scope == 'coarse':
            # The mapping file is only needed for coarse scope, so fine-scope
            # use does not require CellType.pkl to exist.
            # NOTE(review): unpickling can execute arbitrary code; CellType.pkl
            # must come from a trusted source.
            with open('CellType.pkl', 'rb') as ctDictFile:
                coarse_ctDict = self.pk.load(ctDictFile)['Coarse']['Main']
            coarseCells = coarse_ctDict.keys()
            if all(cell in coarseCells for cell in ct):
                for cell in ct:
                    self.ctDict[cell] = coarse_ctDict[cell]
            else:
                print('Usage Error: Specified cell type not a coarse cell type \n')
                print('Must be one or more of: \n'+str(coarseCells))
                return

    @staticmethod
    def mergeTest():
        """Placeholder: check to make sure not too many NAs exist (not implemented)."""
        # Declared static so it is callable both on the class and on an instance.
        pass

    def _read_table(self, table):
        """Read SQL table *table* through the class engine and drop its 'index' column."""
        df = self.pd.read_sql_table(table, con=self.engine)
        # Keyword form: the positional ``axis`` argument of drop() no longer
        # exists in pandas >= 2.0.
        df.drop(columns='index', inplace=True)
        return df

    def _norm_columns(self, df):
        """Return the non-object columns of *df* whose 'norm_<X>' tag equals ``self.norm``."""
        selected = []
        for name in df.select_dtypes(exclude=['object']).columns:
            match = self.re.search(r'norm_(.*)', str(name))
            # Columns without a 'norm_' tag are ignored rather than crashing on
            # ``None.group``.
            if match is not None and match.group(1) == self.norm:
                selected.append(name)
        return selected

    def allData(self):
        """Return ``{cell type: DataFrame}`` with every column of the selected tables.

        Coarse scope: the tables of each group are indexed by
        ``gene_symbol_sql`` and inner-joined; a table that cannot be read or
        joined is reported and skipped (best effort).
        Fine scope: each table is returned as read (minus 'index'), with
        ``gene_symbol_sql`` still a regular column.
        """
        df_dict = {}
        if self.ctDict:
            for celltype, subcells in self.ctDict.items():
                for subcell in subcells:
                    try:
                        df = self._read_table(subcell)
                        df.set_index('gene_symbol_sql', inplace=True)
                        if celltype in df_dict:
                            df_dict[celltype] = df_dict[celltype].join(df, how='inner')
                        else:
                            df_dict[celltype] = df
                    except Exception as err:
                        # Best effort: a missing or malformed table must not
                        # abort the whole group, but it should not vanish silently.
                        print('Skipping table \''+str(subcell)+'\': '+str(err))
        else:
            for celltype in self.ct:
                df_dict[celltype] = self._read_table(celltype)
        return df_dict

    def normData(self):
        """Return ``{cell type: DataFrame}`` restricted to samples normalized as ``self.norm``.

        Coarse scope: per group, the matching sample columns of each table are
        inner-joined on the ``gene_symbol_sql`` index; tables with no matching
        sample, or that cannot be read or joined, are skipped. A group with no
        usable table is absent from the result.
        Fine scope: each value holds ``gene_symbol_sql`` followed by the
        matching sample columns, or ``None`` when no sample matches.
        """
        df_dict = {}
        if self.ctDict:
            for celltype, subcells in self.ctDict.items():
                for subcell in subcells:
                    try:
                        df = self._read_table(subcell)
                        df.set_index('gene_symbol_sql', inplace=True)
                        norm_cols = self._norm_columns(df)
                        # gene_symbol_sql is the index here, so "no matching
                        # sample" means zero selected columns (not one).
                        if not norm_cols:
                            continue
                        df = df[norm_cols]
                        if celltype in df_dict:
                            df_dict[celltype] = df_dict[celltype].join(df, how='inner')
                        else:
                            df_dict[celltype] = df
                    except Exception as err:
                        print('Skipping table \''+str(subcell)+'\': '+str(err))
        else:
            for celltype in self.ct:
                df = self._read_table(celltype)
                norm_cols = self._norm_columns(df)
                if norm_cols:
                    df_dict[celltype] = df[['gene_symbol_sql'] + norm_cols]
                else:
                    df_dict[celltype] = None
        return df_dict

    def mergeCellTypes(self):
        """Inner-join the ``normData`` frames of all cell types on gene symbol.

        Returns
        -------
        dict
            ``'cellTypes'``: one cell-type label per sample column, in the order
            the columns were appended; ``'merged_df'``: the joined frame with
            rows containing NA removed (an empty DataFrame when there is
            nothing to merge).
        """
        merge_dict = {}
        merge_dict['cellTypes'] = []
        merged = None
        for celltype, frame in self.normData().items():
            # Fine scope stores None for cell types without matching samples.
            if frame is None:
                continue
            # Fine-scope frames carry gene_symbol_sql as a column; coarse-scope
            # frames are already indexed by it.
            if 'gene_symbol_sql' in frame.columns:
                frame = frame.set_index('gene_symbol_sql')
            if merged is None:
                merged = frame
            else:
                merged = merged.join(frame, how='inner')
            merge_dict['cellTypes'].extend([celltype] * len(frame.columns))
        if merged is None:
            merged = self.pd.DataFrame()
        else:
            merged = merged.dropna()
        merge_dict['merged_df'] = merged
        return merge_dict

In [39]:
# Coarse-scope accessor for the 'b' group, keeping FPKM-normalized samples.
test = RNAseq_data(ct=['b'], norm='FPKM', scope='coarse')
# test_df=test.mergeCellTypes()

In [40]:
# Per-cell-type frames limited to the normalization chosen when `test` was built.
coarse_b = test.normData()

In [17]:
# Pull the joined frame out of the mergeCellTypes() result and show its size.
# NOTE(review): `test_df` is only assigned in a commented-out line of an
# earlier cell, so this cell depends on stale notebook state -- confirm.
merge_df = test_df['merged_df']
merge_df.shape

(4658, 202)

In [41]:
# (rows, columns) of the frame built for the coarse 'b' group.
coarse_b['b'].shape

(52648, 95)

In [61]:
# Engine for the test_dream database on host DESKTOP-0JOU0MR, port 3306.
# NOTE(review): user name and password are embedded in the URL.
engine = create_engine('mysql+mysqlconnector://dream_user:dream_sql_pw'
                               '@DESKTOP-0JOU0MR:3306/test_dream')

In [2]:
# Engine for the test_dream database on localhost.
engine = create_engine('mysql+mysqlconnector://Simon:Bane@localhost/test_dream')

In [3]:
from sqlalchemy import inspect

# List the tables visible through `engine`. Engine.table_names() is deprecated
# in SQLAlchemy 1.4 and removed in 2.0; the inspector call works on 1.3 as well.
inspect(engine).get_table_names()

['b_cells',
 'basophils',
 'big_test',
 'cd4_t_cells',
 'cd8_t_cells',
 'dendritic_cells',
 'effector_memory_cd8_t_cells',
 'effector_memory_t_cells',
 'endothelial',
 'eosinophils',
 'fibroblast',
 'gc_b_cells',
 'granulocytes',
 'macrophage',
 'memory_b_cells',
 'memory_cd4_t_cells',
 'memory_t_cells',
 'monocytes',
 'myeloid_dendritic_cells',
 'naive_b_cells',
 'naive_cd4_t_cells',
 'naive_cd8_t_cells',
 'naive_t_cells',
 'neutrophils',
 'nk_cells',
 'pbmc',
 'plasma_cells',
 'plasmacytoid_dendritic_cells',
 'small_test',
 't_cells',
 'th17_cells',
 'th1_cells',
 'th2_cells',
 'tregs']

In [8]:
# Load the whole 't_cells' table, print its dimensions, and preview ten random rows.
sql_df = pd.read_sql_table('t_cells', con=engine)
print(sql_df.shape)
sql_df.sample(10)

NameError: name 'engine' is not defined

In [15]:
# Preview ten random rows among those with no missing values.
sql_df.dropna().sample(10)

Unnamed: 0,index,gene_symbol_sql,gse_GSE117614__gsm_GSM3305196__norm_FPKM,gse_GSE117614__gsm_GSM3305197__norm_FPKM,gse_GSE117614__gsm_GSM3305198__norm_FPKM,gse_GSE117614__gsm_GSM3305199__norm_FPKM,gse_GSE117614__gsm_GSM3305202__norm_FPKM,gse_GSE117614__gsm_GSM3305203__norm_FPKM,gse_GSE117614__gsm_GSM3305204__norm_FPKM,gse_GSE117614__gsm_GSM3305205__norm_FPKM,...,gse_GSE118179__gsm_GSM3320138__norm_unknown,gse_GSE118179__gsm_GSM3320139__norm_unknown,gse_GSE118179__gsm_GSM3320140__norm_unknown,gse_GSE118179__gsm_GSM3320141__norm_unknown,gse_GSE119705__gsm_GSM3381065__norm_unknown,gse_GSE119705__gsm_GSM3381066__norm_unknown,gse_GSE119705__gsm_GSM3381067__norm_unknown,gse_GSE119705__gsm_GSM3381068__norm_unknown,gse_GSE119705__gsm_GSM3381069__norm_unknown,gse_GSE119705__gsm_GSM3381070__norm_unknown
36465,36465,SNORD38B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20322,20322,PINX1,5.83979,6.45386,5.24723,4.95371,8.68512,7.30189,6.98417,6.64048,...,14.28,17.32,12.17,13.78,8.0,1.0,2.0,8.0,3.0,0.0
47553,47553,PDE11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.09,0.02,0.37,0.15,2.0,3.0,1.0,1.0,1.0,4.0
36276,36276,SNORA16A,0.0,0.0,274.395,0.0,0.0,0.0,0.0,195.772,...,21.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14199,14199,HSPA14,17.9952,20.7837,20.535,20.1181,20.5374,16.6422,17.5753,20.618,...,17.03,13.69,16.69,15.18,9.0,6.0,7.0,9.0,6.0,7.0
48054,48054,SFTA3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,3.0,1.0,5.0,0.0
18079,18079,MPV17L,0.0,0.0,0.0,0.230617,0.0,0.0,0.119203,0.102368,...,0.0,0.0,0.0,0.0,4.0,1.0,0.0,2.0,0.0,1.0
19791,19791,PARL,17.4159,17.6714,16.8588,17.8236,22.8162,16.7645,21.3134,15.1694,...,44.24,43.57,41.63,42.73,4.0,1.0,2.0,1.0,0.0,2.0
35193,35193,SCARNA4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19433,19433,OR1F1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
