In [1]:
from preprocess_dataset import load_dataset
import scipy
import scipy.stats as stats
import numpy as np
import lda
from sklearn.model_selection import train_test_split

In [2]:
data, genes, cells = load_dataset()
print("Data shape:", data.shape)

Data shape: (4645, 22712)


In [3]:
def compute_stats(data):
    tmp = np.count_nonzero(data, axis=1)
    print("Average/Min/Max non-zero count for each cell: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    tmp = np.count_nonzero(data, axis=0)
    print("Average/Min/Max non-zero count for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    if np.max(tmp) == data.shape[0]: print("there exists a gene active in all cells!")
    tmp = np.max(data, axis=0)
    print("Average/Min/Max max-expression value for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    print("Number of genes with max-expression of 1:", np.where(tmp==1)[0].shape[0])
    # mean non-zero expression value of each gene
    tmp = np.true_divide(data.sum(0),(data!=0).sum(0))
    print("Average/Min/Max mean non-zero expression value for each gene: ", np.mean(tmp), np.min(tmp), np.max(tmp))
    
def normalize_along_columns(data):
    return (data - data.mean(axis=0)) / data.std(axis=0)

In [4]:
compute_stats(data)
# data = normalize_along_columns(data)
# compute_stats(data)

Average/Min/Max non-zero count for each cell:  4349.26437029 1310 13154
Average/Min/Max non-zero count for each gene:  889.500396266 1 4642
Average/Min/Max max-expression value for each gene:  162.237847834 1 62129
Number of genes with max-expression of 1: 35
Average/Min/Max mean non-zero expression value for each gene:  12.8953778105 1.0 2317.61788618


In [6]:
print(data.shape)
data_trn,data_te,_,_ = train_test_split(data, np.ones(data.shape[0]), test_size=0.10, random_state=42)
column_dropout = 0.1
rand_idx = np.random.randint(data_trn.shape[1], size=int(data_trn.shape[1]*column_dropout))
trimmed_data_trn = data_trn[:,rand_idx]
trimmed_data_te = data_te[:,rand_idx]
print(trimmed_data_trn.shape, trimmed_data_te.shape)

(4645, 22712)
(4180, 2271) (465, 2271)


In [8]:
n_clusters = 10
n_data = len(cells)
n_feats = len(genes)

In [9]:
model = lda.LDA(n_topics=n_clusters, n_iter=1500, random_state=1)
model.fit(trimmed_data_te)

INFO:lda:n_documents: 465
INFO:lda:vocab_size: 2271
INFO:lda:n_words: 3896110
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500


KeyboardInterrupt: 

In [None]:
n_top_words = 8
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))