In [3]:
import pickle
import pandas as pd
import numpy as np
from microarray_from_sql import microarray_data
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
# from collections import OrderedDict
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import GEOparse
import re
%matplotlib inline

['CD4.T.cells', 'CD8.T.cells', 'NK.cells', 'B.cells', 'monocytic.lineage', 'neutrophils', 'endothelial.cells', 'fibroblasts']

#### TRYING RMA NORMALIZATION ####

In [4]:
# Build the coarse-scope microarray loader for the eight cell populations of
# interest. No `norm` argument is given here, so the loader's default applies
# (presumably RMA-family, going by the section heading -- TODO confirm).
coarse_data = microarray_data(
    [
        'CD4.T.cells', 'CD8.T.cells', 'NK.cells', 'B.cells',
        'monocytic.lineage', 'neutrophils', 'endothelial.cells', 'fibroblasts',
    ],
    scope='coarse',
)

In [5]:
# Run the loader's merge step. The result is indexed further down with the
# keys 'merged_df' (expression matrix) and 'cellTypes' (labels).
coarseAll = coarse_data.mergeCellTypes()

In [7]:
# Quick size check of the merged matrix. Columns are treated as samples in the
# rest of the notebook (the frame is transposed before being passed to t-SNE).
coarseAll['merged_df'].shape

(5166, 1203)

In [8]:
# Take private copies of the merged matrix and its label collection so the
# filtering done in later cells never modifies the objects held in coarseAll.
test_df, cellType = (coarseAll[key].copy() for key in ('merged_df', 'cellTypes'))

In [136]:
# 3-D t-SNE embedding of the samples with the flagged outlier columns removed.
# NOTE: `outliers` is assigned in a cell that appears further down in this
# notebook, so that cell has to be executed before this one.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword if the installed version warns about or rejects `n_iter`.
tsne = TSNE(
    n_components=3,
    verbose=1,
    perplexity=100,
    n_iter=2000,
    random_state=0,  # t-SNE is stochastic; fix the seed so the embedding is reproducible
)
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts. The transpose turns each sample (column) into one row for t-SNE.
tsne_results = tsne.fit_transform(test_df.drop(columns=outliers).T)

[t-SNE] Computing 301 nearest neighbors...
[t-SNE] Indexed 1102 samples in 0.227s...
[t-SNE] Computed neighbors for 1102 samples in 13.624s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1102
[t-SNE] Computed conditional probabilities for sample 1102 / 1102
[t-SNE] Mean sigma: 14.296913
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.649426
[t-SNE] KL divergence after 1000 iterations: 0.250728


In [137]:
# Interactive 3-D scatter of the t-SNE coordinates, coloured by cell type.
# `axis=1` has to be passed by keyword: pandas 2.0 made every pd.concat argument
# after `objs` keyword-only, so the former positional `1` raises TypeError there.
fig = px.scatter_3d(
    pd.concat(
        [
            pd.DataFrame(tsne_results, columns=['1', '2', '3']),
            pd.Series(cellType_trim, name='cellType'),
        ],
        axis=1,
    ),
    x='1', y='2', z='3',
    color='cellType', opacity=0.7,
)
fig.show()

In [66]:
# There is an odd population of samples that sits well apart from everything
# else; flag every sample whose first t-SNE coordinate exceeds 12.
# NOTE(review): the cutoff of 12 only makes sense for one particular embedding --
# confirm it still isolates that population whenever t-SNE is re-run.
#
# `tsne_results` must be the embedding of the *full* `test_df` (one row per
# column). If it came from an already-trimmed frame, positional selection would
# silently return the wrong sample names, so fail loudly instead.
if tsne_results.shape[0] != test_df.shape[1]:
    raise ValueError(
        "tsne_results has %d rows but test_df has %d columns; "
        "fit t-SNE on the untrimmed test_df before selecting outliers"
        % (tsne_results.shape[0], test_df.shape[1])
    )
# A boolean mask selects the same columns, in the same order, as np.where did.
outliers = test_df.columns[tsne_results[:, 0] > 12]
outliers

Index(['GSE_GSE26050__GSM_GSM639727__NORM_RMA',
       'GSE_GSE9378__GSM_GSM254455__NORM_RMA',
       'GSE_GSE9378__GSM_GSM254456__NORM_RMA',
       'GSE_GSE12845__GSM_GSM322403__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322405__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322402__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322401__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322406__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322404__NORM_gcRMA',
       'GSE_GSE12845__GSM_GSM322407__NORM_gcRMA',
       ...
       'GSE_GSE47796__GSM_GSM1159731__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159733__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159767__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159768__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159732__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159765__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159766__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159769__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159772__NORM_RMA',
       'GSE_GSE47796__GSM_GSM1159770__NORM_RMA'],
      

In [72]:
# Sanity check: matrix shape once the outlier samples (columns) are removed.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
test_df.drop(columns=outliers).shape

(5166, 1102)

In [129]:
# Keep the cell-type label of every sample that is NOT listed in `outliers`, so
# the labels stay aligned with the outlier-free frame (test_df with the
# `outliers` columns dropped).
#
# The mask is derived from `outliers` itself rather than by re-thresholding
# `tsne_results`: that array is reassigned by every t-SNE fit and may no longer
# be the embedding the outliers were picked from. The mask is also built once,
# instead of recomputing np.where(...) for every label.
outlier_mask = test_df.columns.isin(outliers)
if len(cellType) != len(outlier_mask):
    raise ValueError(
        "cellType has %d labels but test_df has %d columns"
        % (len(cellType), len(outlier_mask))
    )
cellType_trim = [cellType[i] for i in range(len(cellType)) if not outlier_mask[i]]

In [138]:
# The embedding looks pretty good once the outliers are removed, so export the
# trimmed matrix: one row per sample, plus a `cellType` label column.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
out_df = test_df.drop(columns=outliers).T
# cellType_trim has to hold exactly one label per remaining sample, in the same
# order as the remaining columns; pandas raises on a length mismatch here.
out_df['cellType'] = cellType_trim
out_df.to_csv('R/RMA_coarse_081419.csv')

#### TRYING MAS5 NORMALIZATION ####

In [15]:
# Same eight cell populations and coarse scope as before, this time asking the
# loader for MAS5-normalised data.
# NOTE: this rebinds `coarse_data`; the object created for the previous
# normalisation is no longer reachable under this name.
coarse_data = microarray_data(
    [
        'CD4.T.cells', 'CD8.T.cells', 'NK.cells', 'B.cells',
        'monocytic.lineage', 'neutrophils', 'endothelial.cells', 'fibroblasts',
    ],
    norm='MAS5',
    scope='coarse',
)

In [16]:
# Run the merge step for the MAS5 loader. This rebinds `coarseAll`; the cells
# below read its 'merged_df' and 'cellTypes' entries.
coarseAll = coarse_data.mergeCellTypes()

In [18]:
# Quick size check of the MAS5 merged matrix (columns are the samples that get
# embedded by t-SNE below, after transposing).
coarseAll['merged_df'].shape

(7062, 778)

In [19]:
# Private copies of the MAS5 matrix and its label collection, kept under
# MAS5-specific names so they do not clobber the earlier test_df / cellType.
test_df_MAS5, cellType_MAS5 = (
    coarseAll[key].copy() for key in ('merged_df', 'cellTypes')
)

In [28]:
# 3-D t-SNE embedding of the MAS5 samples (no outlier filtering applied here).
# NOTE: this overwrites `tsne` and `tsne_results` from any earlier fit.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword if the installed version warns about or rejects `n_iter`.
tsne = TSNE(
    n_components=3,
    verbose=1,
    perplexity=80,
    n_iter=2000,
    random_state=0,  # t-SNE is stochastic; fix the seed so the embedding is reproducible
)
# Transpose so that each sample (column) becomes one row for t-SNE.
tsne_results = tsne.fit_transform(test_df_MAS5.T)

[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 778 samples in 0.216s...
[t-SNE] Computed neighbors for 778 samples in 7.991s...
[t-SNE] Computed conditional probabilities for sample 778 / 778
[t-SNE] Mean sigma: 37.064786
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.687218
[t-SNE] KL divergence after 1100 iterations: 0.575276


In [29]:
# Interactive 3-D scatter of the MAS5 t-SNE coordinates, coloured by cell type.
# `axis=1` has to be passed by keyword: pandas 2.0 made every pd.concat argument
# after `objs` keyword-only, so the former positional `1` raises TypeError there.
fig = px.scatter_3d(
    pd.concat(
        [
            pd.DataFrame(tsne_results, columns=['1', '2', '3']),
            pd.Series(cellType_MAS5, name='cellType'),
        ],
        axis=1,
    ),
    x='1', y='2', z='3',
    color='cellType', opacity=0.7,
)
fig.show()