### MICROARRAY DATA OBJECT ###

- Changes in version 2
    - cell type aggregated data are serialized in the picklejar
    - cell type data frames are transformed--GSMs are indices

- Modeled after the RNAseq object
    - Should be very similar
- Coarse cells
    - B cells, CD4 T cells, CD8 T cells, NK cells, neutrophils, monocytic-lineage cells (i.e., the aggregate contribution of monocytes, myeloid dendritic cells, and macrophages), fibroblasts, and endothelial cells
- Fine cells
    - memory B cells, naive B cells, memory CD4 T cells, naive CD4 T cells, regulatory T cells, memory CD8 T cells, naive CD8 T cells, NK cells, neutrophils, monocytes, myeloid dendritic cells, macrophages, fibroblasts, and endothelial cells
    - are 'CD4_T_cells' considered naive CD4 T cells?
        - same question with 'B_cells'
    - are 'Granulocytes' considered neutrophils?
    - 'T_cells'...
    - 'Central_Memory'...
    - 'Basophils'...
    - 'Mast_cells'...

In [1]:
import re
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import mysql.connector
import pickle

In [4]:
# Flat list of the cell-type labels used in this notebook, in their original
# capitalisation (one label per line so additions show up cleanly in diffs).
cellTypes = [
    'NK_cells',
    'fibroblast',
    'Monocytes',
    'CD8_T_cells',
    'B_cells',
    'Dendritic_cells',
    'endothelial',
    'PBMC',
    'Memory_CD4_T_cells',
    'Th17_cells',
    'Naive_CD4_T_cells',
    'Th1_cells',
    'Th2_cells',
    'CD4_T_cells',
    'Macrophage',
    'Neutrophils',
    'Granulocytes',
    'T_cells',
    'Central_Memory',
    'Plasmacytoid_Dendritic_Cells',
    'Plasma_cells',
    'Tregs',
    'Basophils',
    'Mast_cells',
    'Naive_B_cells',
    'GC_B_cells',
    'Memory_B_cells',
    'Plasmablast',
    'Immature_B_cells',
    'White_blood_cells',
    'Activated_B_cells',
    'Immature_Dendritic_cells',
    'Myeloid_Dendritic_cells',
    'Eosinophils',
    'Effector_Memory_T_cells',
    'Central_Memory_T_cells',
    'Effector_Memory',
    'Plasma_B_cells',
    'Naive_CD8_T_cells',
    'Central_Memory_CD4_T_cells',
    'Effector_Memory_CD4_T_cells',
    'gamma_delta_T_cells',
    'Resting_B_cells',
    'Activated_T_cells',
    'Memory_T_cells',
    'Naive_T_cells',
    'NKT_cells',
    'Effector_Memory_CD8_T_cells',
    'Memory_CD8_T_cells',
    'Follicular_helper_T_cells',
    'Th1Th17_cells',
    'Central_Memory_CD8_T_cells',
    'Activated_Macrophages',
    'Effector_CD4_T_cells',
    'pre-GC_B_cells',
]

In [5]:
# Mapping from analysis-level cell-type labels to the raw labels they pool.
#   'Coarse'  : coarse label -> raw labels
#   'Fine'    : fine label   -> raw labels
#   'Generic' : raw labels kept in a flat list rather than under a label
ctDict = {
    'Coarse': {
        'CD4.T.cells': [
            'Memory_CD4_T_cells',
            'Th17_cells',
            'Naive_CD4_T_cells',
            'Th1_cells',
            'Th2_cells',
            'CD4_T_cells',
            'Tregs',
            'Central_Memory_CD4_T_cells',
            'Effector_Memory_CD4_T_cells',
            'Follicular_helper_T_cells',
            'Th1Th17_cells',
            'Effector_CD4_T_cells',
        ],
        'CD8.T.cells': [
            'CD8_T_cells',
            'Naive_CD8_T_cells',
            'Effector_Memory_CD8_T_cells',
            'Memory_CD8_T_cells',
            'Central_Memory_CD8_T_cells',
        ],
        'NK.cells': ['NK_cells'],
        'B.cells': [
            'B_cells',
            'Naive_B_cells',
            'GC_B_cells',
            'Memory_B_cells',
            'Immature_B_cells',
            'Activated_B_cells',
            'Plasma_B_cells',
            'Resting_B_cells',
            'pre-GC_B_cells',
        ],
        'monocytic.lineage': [
            'Monocytes',
            'Dendritic_cells',
            'Macrophage',
            'Immature_Dendritic_cells',
            'Myeloid_Dendritic_cells',
            'Activated_Macrophages',
        ],
        'neutrophils': ['Neutrophils', 'Granulocytes'],
        'endothelial.cells': ['endothelial'],
        'fibroblasts': ['fibroblast'],
    },
    'Fine': {
        'memory.B.cells': ['Memory_B_cells'],
        'naive.B.cells': [
            'Naive_B_cells',
            'Immature_B_cells',
            'Resting_B_cells',
        ],
        'memory.CD4.T.cells': [
            'Memory_CD4_T_cells',
            'Central_Memory_CD4_T_cells',
            'Effector_Memory_CD4_T_cells',
        ],
        'naive.CD4.T.cells': ['Naive_CD4_T_cells', 'CD4_T_cells'],
        'reg.T.cells': ['Tregs'],
        'memory.CD8.T.cells': [
            'Effector_Memory_CD8_T_cells',
            'Memory_CD8_T_cells',
            'Central_Memory_CD8_T_cells',
        ],
        'naive.CD8.T.cells': ['CD8_T_cells', 'Naive_CD8_T_cells'],
        'NK.cells': ['NK_cells'],
        'neutrophils': ['Neutrophils'],
        'monocytes': ['Monocytes'],
        'myeloid.dendritic.cell': [
            'Dendritic_cells',
            'Myeloid_Dendritic_cells',
        ],
        'macrophage': ['Macrophage', 'Activated_Macrophages'],
        'fibroblast': ['fibroblast'],
        'endothelial': ['endothelial'],
    },
    'Generic': [
        'PBMC',
        'Granulocytes',
        'T_cells',
        'Central_Memory',
        'Plasma_cells',
        'White_blood_cells',
        'Effector_Memory_T_cells',
        'Central_Memory_T_cells',
        'Effector_Memory',
        'Activated_T_cells',
        'Memory_T_cells',
        'Naive_T_cells',
    ],
}

In [6]:
# Lower-case every raw label in ctDict in place. Nested groups ('Coarse',
# 'Fine') keep their keys untouched; only the member lists are rewritten.
for group, members in ctDict.items():
    if isinstance(members, dict):
        for label in members:
            members[label] = [raw.lower() for raw in members[label]]
    else:
        ctDict[group] = [raw.lower() for raw in members]

In [2]:
# The two commented-out lines are the one-off step that writes ctDict to
# disk; as the cell stands, the stored copy is read back and replaces
# whatever ctDict currently holds.
# with open('picklejar/ctDict.pckl', 'wb') as picklefile:
#     pickle.dump(ctDict, picklefile)
with open('picklejar/ctDict.pckl', 'rb') as picklefile:
    ctDict = pickle.load(picklefile)

In [3]:
# Display the coarse group labels present in the loaded mapping.
ctDict['Coarse'].keys()

dict_keys(['CD4.T.cells', 'CD8.T.cells', 'NK.cells', 'B.cells', 'monocytic.lineage', 'neutrophils', 'endothelial.cells', 'fibroblasts'])

### IMPORTANT TODO ###

- update normalization types
    - What are the differences between microarray normalization types?

In [41]:
class microarray_data:
    """Access pickled microarray tables and simulate cell-type mixtures.

    On construction the object validates its arguments, loads the cell-type
    mapping from ``picklejar/ctDict.pckl`` and reads the ``master`` table
    through ``engine``.  ``normData`` loads one pickled table per cell type
    and ``createMixDF`` builds synthetic samples as weighted sums of randomly
    drawn columns from those tables.

    Modules are bound as class attributes (``self.pd``, ``self.np``, ...) so
    the methods do not depend on notebook-level imports.
    """
    # import modules
    pd = __import__('pandas')
    np = __import__('numpy')
    re = __import__('re')
    pk = __import__('pickle')
    #from sqlalchemy import create_engine 
    import sqlalchemy as sqal
    import GEOparse
    import mysql.connector
    import ntpath
    import os

    # NOTE(review): database credentials are hard-coded in the connection
    # URLs below; consider reading them from the environment or a config file
    # that is kept out of version control.
    #engine = sqal.create_engine('mysql+mysqlconnector://dream_user:dream_sql_pw@192.168.144.21/test_dream')
    engine = sqal.create_engine('mysql+mysqlconnector://Simon:Bane@localhost/microarray_data_v2')

    # Raw cell-type labels in their original capitalisation; __init__
    # lower-cases them before showing them in usage messages.
    CELL_TYPES = (
        'NK_cells', 'fibroblast', 'Monocytes', 'CD8_T_cells', 'B_cells',
        'Dendritic_cells', 'endothelial', 'PBMC', 'Memory_CD4_T_cells',
        'Th17_cells', 'Naive_CD4_T_cells', 'Th1_cells', 'Th2_cells',
        'CD4_T_cells', 'Macrophage', 'Neutrophils', 'Granulocytes',
        'T_cells', 'Central_Memory', 'Plasmacytoid_Dendritic_Cells',
        'Plasma_cells', 'Tregs', 'Basophils', 'Mast_cells',
        'Naive_B_cells', 'GC_B_cells', 'Memory_B_cells', 'Plasmablast',
        'Immature_B_cells', 'White_blood_cells', 'Activated_B_cells',
        'Immature_Dendritic_cells', 'Myeloid_Dendritic_cells',
        'Eosinophils', 'Effector_Memory_T_cells', 'Central_Memory_T_cells',
        'Effector_Memory', 'Plasma_B_cells', 'Naive_CD8_T_cells',
        'Central_Memory_CD4_T_cells', 'Effector_Memory_CD4_T_cells',
        'gamma_delta_T_cells', 'Resting_B_cells', 'Activated_T_cells',
        'Memory_T_cells', 'Naive_T_cells', 'NKT_cells',
        'Effector_Memory_CD8_T_cells', 'Memory_CD8_T_cells',
        'Follicular_helper_T_cells', 'Th1Th17_cells',
        'Central_Memory_CD8_T_cells', 'Activated_Macrophages',
        'Effector_CD4_T_cells', 'pre-GC_B_cells',
    )

    # Accepted values for the ``norm`` argument.
    NORM_TYPES = ('RMA', 'gcRMA', 'RMA-quantile', 'MAS5', 'unknown')

    # basic attributes
    def __init__(self, ct=None, norm='RMA',  scope='coarse'):
        """Validate the arguments and load the lookup data.

        Parameters
        ----------
        ct : list of str
            Cell types of interest.  With ``scope='coarse'`` every entry must
            be a key of the ``'Coarse'`` mapping stored in the ctDict pickle.
        norm : str
            One of ``NORM_TYPES``.  It is stored on the instance; ``normData``
            does not use it.
        scope : str
            ``'coarse'`` (default) or ``'fine'``.  Only ``'coarse'`` fills
            ``self.ctDict``; with ``'fine'`` it is left empty.

        Invalid arguments are reported with ``print`` and the constructor
        returns early, leaving the instance only partially initialised (the
        attributes assigned before the failing check are set, later ones are
        not).
        """
        # load in some info...
        cell_types = [name.lower() for name in self.CELL_TYPES]
        norm_types = list(self.NORM_TYPES)

        if ct is None or norm is None:
            print('Instantiate class with: microarray_data(CELL TYPE, NORMALIZATION, SCOPE) \n \n'
                  'where CELL TYPE is a list of cell types. Must be one or more of:'+str(cell_types)+'\n\n'
                  'where NORMALIZATION is how the counts are normalized. Must be one of: '+str(norm_types)+'\n\n'
                  'where SCOPE is the cell type specificity. Must be either \'fine\' or \'coarse\' (default)')
            return

        # NOTE: unpickling can execute arbitrary code; only load pickles that
        # were produced by this project.
        with open('picklejar/ctDict.pckl', 'rb') as ctDictFile:
            cellDict = self.pk.load(ctDictFile)
        self.allCtDict = cellDict
        coarse_ctDict = cellDict['Coarse']
        coarseCells = coarse_ctDict.keys()

        self.sampleMeta = self.pd.read_sql_table('master', con = self.engine)
        self.sampleMeta['cell.type'] = self.sampleMeta['cell.type'].str.lower()

        # finished loading....    
        ###########################################################

        if isinstance(ct, list):
            self.ct = ct
        else:
            print('Usage Error: cell type must be a list with one or more of: '+str(cell_types))
            return

        if norm in norm_types:
            self.norm = norm
        else:
            print('Usage Error: normalization method must be one of: '+str(norm_types))
            return

        self.ctDict = {}
        if scope not in ('coarse', 'fine'):
            print('Usage Error: scope must be of type \'coarse\' (default) or \'fine\'')
            return
        self.scope = scope
        if scope == 'coarse':
            if not all(cell in coarseCells for cell in ct):
                print('Usage Error: Specified cell type not a coarse cell type \n')
                print('Must be one or more of: \n'+str(coarseCells))
                return
            # keep only the requested coarse labels and their raw members
            for cell in ct:
                self.ctDict[cell] = coarse_ctDict[cell]


#    ~~~ NEED TO UPDATE ~~~
#
#     def allData(self, ct):
#         df_dict = {}
#         # for working with 'Coarse' cells
#         if len(self.ctDict) > 0:
#             if ct != self.ct:
#                 ct_query = ct
#             else:
#                 ct_query = self.ctDict.keys()
#             for celltype in ct_query:
#                 token = 0
#                 for subcell in self.ctDict[celltype]:
#                     try:
#                         df = self.pd.read_sql_table(subcell, con=self.engine)
#                         if sum(df.duplicated(subset='gene_symbol_sql')) > 0:
#                             df.drop_duplicates(subset='gene_symbol_sql', inplace=True)
#                         df.drop('index', 1, inplace=True)
#                         df.set_index('gene_symbol_sql', inplace=True)
#                         df = self.condenseGSE(df)
#                         if token == 0:
#                             token = 1
#                             df_dict[celltype] = df
#                         else:
#                             df_dict[celltype] = df_dict[celltype].join(df, how='inner')
#                     except:
#                         pass
#         # for working with 'Fine' cells
#         else: 
#             for celltype in ct:
#                 df = self.pd.read_sql_table(celltype, con=self.engine)
#                 if sum(df.duplicated(subset='gene_symbol_sql')) > 0:
#                     df.drop_duplicates(subset='gene_symbol_sql', inplace=True)
#                 df.drop('index', 1, inplace=True)
#                 df.set_index('gene_symbol_sql', inplace=True)
#                 df = self.condenseGSE(df)
#                 df_dict[celltype] = df
#         return df_dict


    # INTERNAL USE ONLY
    def _picklePath(self, ct):
        """Return the path of the pickled table for cell type ``ct``.

        The path is built with ``os.path.join`` so it resolves on every
        platform.  Earlier code used ``ntpath.join``, which on POSIX names a
        file literally called ``picklejar\\<ct>.pckl``; that legacy name is
        still honoured when it exists and the portable one does not.
        """
        filename = ct + '.pckl'
        portable = self.os.path.join('picklejar', filename)
        legacy = self.ntpath.join('picklejar', filename)
        if not self.os.path.exists(portable) and self.os.path.exists(legacy):
            return legacy
        return portable

    # this function no longer cares how data was normalized
    def normData(self, ct):
        """Load the pickled table for one cell type and return it transposed.

        Parameters
        ----------
        ct : str
            Cell-type label; the table is read from ``picklejar/<ct>.pckl``.

        Returns
        -------
        The transpose (``.T``) of the unpickled object -- presumably a
        DataFrame stored with samples as rows, so the result has one column
        per sample (TODO confirm against the code that writes the pickles).

        Raises
        ------
        FileNotFoundError
            If no pickle exists for ``ct``.
        """
        with open(self._picklePath(ct), 'rb') as f:
            df = self.pk.load(f)
        return df.T

#    ~~~ NEED TO UPDATE ~~~
#
#     def mergeCellTypes(self):
#         # holds the merged df and cell type list
#         merge_dict = {}
#         merge_dict['cellTypes'] = [] 
#         initial = 0
#         ct_dfs = self.normData()
#         for celltype in ct_dfs.keys():
#             if initial == 1:
#                 merge_df = ct_dfs[celltype]
#                 df = df.join(merge_df, how = 'inner')
#                 merge_dict['cellTypes'].extend([celltype] * len(merge_df.columns))
#             else:
#                 df = ct_dfs[celltype]
#                 merge_dict['cellTypes'].extend([celltype] * len(df.columns))
#                 initial = 1
#         df.dropna(inplace=True)
#         merge_dict['merged_df'] = df
#         return merge_dict
    
    # takes argument of cell type--but should know whether coarse or fine
#     def convertUnknown(self, ct_df, forMixing = 0):
#         unknown_gse = self.getNormalizationPlus(ct_df)
#         unknown_gsms = []
#         for gse, norm in unknown_gse.items():
#             if norm == 'RAW':
#                 gsmCols = ct_df.columns[ct_df.columns.str.contains(gse)]
#                 unknown_gsms.extend(gsmCols)
#         if forMixing == 1:
#             if any(ct_df.columns.str.contains('RAW')):
#                 raw_gsms = ct_df.columns[ct_df.columns.str.contains('RAW')]
#                 unknown_gsms.extend(raw_gsms)
#         df_unknown = ct_df[unknown_gsms]
#         df_unknown.dropna(inplace=True)
#         if forMixing == 0:
#             df_unknown = df_unknown.loc[df_unknown.index.intersection(self.genelens.index)]
#             df_unknown = self.convertRawtoFPKM(df_unknown, self.genelens)
#         return df_unknown
        
    # INTERNAL USE ONLY
    # this function is for making random mixtures with only 1 target ct
    def createMixRatio(self, ctList, mix):
        """Return mixing ratios with a fixed first entry and random remainder.

        Parameters
        ----------
        ctList : list
            Cell types to mix; only its length is used.  The first entry is
            the target cell type.
        mix : sequence of float
            ``mix[0]`` is the ratio assigned to the target.

        Returns
        -------
        numpy.ndarray
            ``len(ctList)`` ratios: ``mix[0]`` first, followed by uniform
            random draws rescaled so that they sum to ``1 - mix[0]``.
        """
        target_ratio = mix[0]
        random_total = 1 - target_ratio
        # randomly split the remaining fraction across the other cell types
        other_mix = self.np.random.uniform(0, 1, len(ctList) - 1)
        other_mix = random_total * other_mix / other_mix.sum()
        return self.np.insert(other_mix, 0, target_ratio)
        
    def createMixDF(self, ctList, n_samp=10, mix = None):
        """Simulate ``n_samp`` mixed samples from per-cell-type tables.

        Each simulated sample draws one random column from every cell type's
        table (see ``normData``), scales it by that cell type's ratio and
        sums the scaled columns row-wise.  Rows are restricted to index
        labels shared by all tables (inner join).

        Two calling conventions are supported:

        * single target -- ``ctList`` holds one cell type and ``mix`` one
          ratio.  All other keys of ``self.ctDict`` are added to the mixture
          and share the remaining ``1 - mix[0]`` in proportions redrawn for
          every sample (see ``createMixRatio``).
        * fixed ratios -- ``mix`` has one ratio per entry of ``ctList``
          (``mix=None`` means equal ratios).  Ratios are used as given; they
          are not checked to sum to 1.

        Parameters
        ----------
        ctList : list of str
            Cell types to mix.  The list is not modified.
        n_samp : int
            Number of simulated samples (columns of the result).
        mix : sequence of float, optional
            Mixing ratios as described above.

        Returns
        -------
        pandas.DataFrame
            One column per simulated sample, named ``sample0``, ``sample1``...

        Raises
        ------
        ValueError
            If ``ctList`` is empty, ``n_samp`` is below 1, the lengths of
            ``mix`` and ``ctList`` are incompatible, or the single target is
            not a key of ``self.ctDict``.
        """
        # Work on a copy so the caller's list is never extended in place.
        ctList = list(ctList)
        if not ctList:
            raise ValueError('ctList must name at least one cell type')
        if n_samp < 1:
            raise ValueError('n_samp must be at least 1')
        if mix is None:
            mix = [1/len(ctList)] * len(ctList)

        if len(ctList) == 1 and len(mix) == 1:
            # usage is where you only specify 1 cell type: mix it against
            # every other cell type the object was instantiated with
            target = ctList[0]
            if target not in self.ctDict:
                raise ValueError('cell type '+repr(target)+' is not one of: '
                                 +str(list(self.ctDict.keys())))
            ctList = [target] + [name for name in self.ctDict if name != target]
        elif len(mix) != len(ctList):
            raise ValueError('mix must hold one ratio per cell type in ctList, '
                             'or a single ratio when ctList names one cell type')

        all_dfs = [self.normData(ct=name) for name in ctList]

        res_mixture = []
        for i in range(n_samp):
            # one randomly chosen column per cell type, relabelled by cell type
            picks = []
            for name, ct_df in zip(ctList, all_dfs):
                cell_df = ct_df.sample(1, axis=1)
                cell_df.columns = [name]
                picks.append(cell_df)
            mix_df = self.pd.concat(picks, axis = 1, join = 'inner')

            # mix
            if len(mix) == len(ctList):
                ratios = mix
            else:
                ratios = self.createMixRatio(ctList, mix)
            weights = self.np.asarray(ratios, dtype=float)

            # weighted row sums; skipna=False keeps NaN propagation identical
            # to summing the row values one by one
            mixed_counts = (mix_df * weights).sum(axis=1, skipna=False)
            res_mixture.append(mixed_counts.to_frame('sample'+str(i)))

        mix_sample_df = self.pd.concat(res_mixture, axis = 1, join = 'inner')

        return mix_sample_df
        
        
        

In [42]:
# Instantiate with all eight coarse labels so that every one of them is
# available in test.ctDict when building mixtures.
test = microarray_data(
    [
        'CD4.T.cells',
        'CD8.T.cells',
        'NK.cells',
        'B.cells',
        'monocytic.lineage',
        'neutrophils',
        'endothelial.cells',
        'fibroblasts',
    ],
    scope='coarse',
)

In [56]:
# Simulate 100 mixtures in which the target ('CD4.T.cells') gets ratio
# mix[0] = 0 and the other cell types in test.ctDict share the remainder
# at random.
# Need to do 70, 60, and 50 as well
# NOTE(review): presumably target fractions in percent, i.e. mix=[0.7] etc. -- confirm.
test_mix = test.createMixDF(['CD4.T.cells'], n_samp = 100, mix=[0])

In [57]:
# test_data.keys()
# Dimensions of the simulated table: rows x number of simulated samples.
test_mix.shape

(13830, 100)

In [58]:
# Write the simulated mixtures to CSV.
# NOTE(review): the trailing "_0" presumably encodes the target ratio used above -- confirm.
test_mix.to_csv('mixed_dfs/cd4tcells_coarse_0.csv')

In [26]:
# NOTE(review): test0 is not defined in any cell shown here (note the
# out-of-order execution counts); this looks like leftover session cleanup
# and raises NameError on a fresh run.
del test0

In [12]:
# NOTE(review): no top-level df is defined in the cells shown here; this
# cell appears to be left over from an earlier session.
df.shape

(22900, 22)

In [3]:
# Reload the stored cell-type mapping under a separate name (ctd).
with open('picklejar/ctDict.pckl','rb') as f:
    ctd = pickle.load(f)

In [4]:
# Display the coarse group labels stored in the pickle.
ctd['Coarse'].keys()

dict_keys(['CD4.T.cells', 'CD8.T.cells', 'NK.cells', 'B.cells', 'monocytic.lineage', 'neutrophils', 'endothelial.cells', 'fibroblasts'])