In [1]:
from preprocess_dataset import load_dataset
from preprocess_dataset import cut_out_zero_rows
from preprocess_dataset import cut_out_zero_cols
import pandas as pd
import scipy
import scipy.stats as stats
import numpy as np
import lda
import pickle
from sklearn.model_selection import train_test_split

In [2]:
data, genes, cells = load_dataset()
print("Data shape:", data.shape)

Data shape: (4645, 22712)


In [3]:
def compute_stats(data):
    tmp = np.count_nonzero(data, axis=1)
    print("Average/Min/Max non-zero count for each cell: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    tmp = np.count_nonzero(data, axis=0)
    print("Average/Min/Max non-zero count for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    if np.max(tmp) == data.shape[0]: print("there exists a gene active in all cells!")
    tmp = np.max(data, axis=0)
    print("Average/Min/Max max-expression value for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    print("Number of genes with max-expression of 1:", np.where(tmp==1)[0].shape[0])
    # mean non-zero expression value of each gene
    tmp = np.true_divide(data.sum(0),(data!=0).sum(0))
    print("Average/Min/Max mean non-zero expression value for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    
def normalize_along_columns(data):
    return (data - data.mean(axis=0)) / data.std(axis=0)

In [4]:
compute_stats(data)
# data = normalize_along_columns(data)
# compute_stats(data)

Average/Min/Max non-zero count for each cell:  4349.26437029 1310 13154
Average/Min/Max non-zero count for each gene:  889.500396266 1 4642
Average/Min/Max max-expression value for each gene:  162.237847834 1 62129
Number of genes with max-expression of 1: 35
Average/Min/Max mean non-zero expression value for each gene:  12.8953778105 1.0 2317.61788618


In [5]:
print(data.shape)
data_trn,data_te,_,_ = train_test_split(data, np.ones(data.shape[0]), test_size=0.10, random_state=42)
column_dropout = 0.1
rand_idx = np.random.randint(data_trn.shape[1], size=int(data_trn.shape[1]*column_dropout))
trimmed_data_trn = data_trn[:,rand_idx]
trimmed_data_te = data_te[:,rand_idx]
print(trimmed_data_trn.shape, trimmed_data_te.shape)

(4645, 22712)
(4180, 2271) (465, 2271)


In [6]:
trimmed_data_te = cut_out_zero_rows(pd.DataFrame(trimmed_data_te)).as_matrix()
keep_cols = ~(pd.DataFrame(trimmed_data_te) == 0).all(axis=0)
trimmed_data_te = cut_out_zero_cols(pd.DataFrame(trimmed_data_te)).as_matrix()

In [9]:
n_clusters = 10
n_data = len(cells)
n_feats = len(genes)

array(['IL1RAP', 'ALKBH4', 'LOC390660', ..., 'NPB', 'ERCC6L', 'DIDO1'], dtype=object)

In [11]:
model = lda.LDA(n_topics=n_clusters, n_iter=1500, random_state=1)
model.fit(trimmed_data_te)

INFO:lda:n_documents: 465
INFO:lda:vocab_size: 2112
INFO:lda:n_words: 2995588
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -25886689
INFO:lda:<10> log likelihood: -22410101
INFO:lda:<20> log likelihood: -20800175
INFO:lda:<30> log likelihood: -20468089
INFO:lda:<40> log likelihood: -20346738
INFO:lda:<50> log likelihood: -20281003
INFO:lda:<60> log likelihood: -20232114
INFO:lda:<70> log likelihood: -20203262
INFO:lda:<80> log likelihood: -20183186
INFO:lda:<90> log likelihood: -20157842
INFO:lda:<100> log likelihood: -20143347
INFO:lda:<110> log likelihood: -20133000
INFO:lda:<120> log likelihood: -20125792
INFO:lda:<130> log likelihood: -20120753
INFO:lda:<140> log likelihood: -20114893
INFO:lda:<150> log likelihood: -20112212
INFO:lda:<160> log likelihood: -20107824
INFO:lda:<170> log likelihood: -20102149
INFO:lda:<180> log likelihood: -20099670
INFO:lda:<190> log likelihood: -20094489
INFO:lda:<200> log likelihood: -20098684
INFO:lda:<210> log likelihoo

<lda.lda.LDA at 0x7f350d89e0b8>

In [12]:
from sklearn.externals import joblib
joblib.dump(da,'data_trn.pickle')
joblib.dump(model, 'model.pickle') 

['small_subset_lda.pickle']

In [15]:
n_top_words = 12
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(genes[rand_idx][keep_cols])[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: RPS27A CORO1A UBB TOB1 PABPC1 IL2RG RHOH RCSD1 EIF4B IFITM2 PPIA NDUFB9
Topic 1: DUSP2 DUSP2 CORO1A RPS27A IFITM1 IL2RG SRGN TOB1 PPP1CA TSPO UBB UBL5
Topic 2: PRF1 CORO1A UBB IL2RG GZMH SRGN IFITM1 IFITM2 HLA.F PPP1CA DENND2D IRF9
Topic 3: RPS27A ARL6IP5 UGT8 UGT8 PGM5P2 GSG1 UBQLN2 MCFD2 UBQLN2 MCFD2 TNFAIP8L1 PRICKLE2.AS3
Topic 4: RNASE1 IFITM2 SRGN NPC2 CCL14 S100A9 UBB TMEM176B SEPP1 IER3 PPIA CRIP2
Topic 5: PSME1 PSME1 UBB PABPC1 PPIA STAT1 TPM3 IFIT3 EDF1 DNPH1 MORF4L1 UBL5
Topic 6: RGS2 SRGN CD8B UBB STAT1 ATP5A1 IFITM1 LITAF RASSF5 DENND2D MOB1A MOB1A
Topic 7: IFITM1 IFITM2 RPS27A SRGN CORO1A HLA.F STAT1 EDF1 IL2RG PABPC1 PPIA TRIM22
Topic 8: GAS5 RPS27A UBB PABPC1 PPIA TSPO CLTA TRMT112 IMPDH2 ATP5A1 EIF3M PGLS
Topic 9: LGALS3 UBB NPC2 PPIA ATP1A1 PKM DUSP6 SPCS1 ERGIC3 ATP6V0B RPS27A BST2
