In [23]:
import collections

import numpy as np
import pandas as pd
import nltk
import gensim

import kgglcncr.data_import as data_import
import kgglcncr.preprocessing as preprocessing
import kgglcncr.features as features

In [2]:
training_text = data_import.import_training_text()

In [3]:
training_text.head()

Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
#training_text = preprocessing.remove_citations(training_text)
#training_text = preprocessing.lower_case(training_text)

preprocessor = preprocessing.PreprocessingPipeline([preprocessing.remove_citations,
                                                    preprocessing.lower_case])

training_text = preprocessor(training_text)

In [5]:
corpus = training_text.loc[:, 'Text']

In [6]:
corpus = corpus.tolist()

In [7]:
corpus = [nltk.word_tokenize(doc) for doc in corpus]

In [8]:
freq = collections.defaultdict(int)
for doc in corpus:
    for token in doc:
        freq[token] += 1

In [9]:
len(freq)

280156

In [10]:
corpus = [[token for token in doc if freq[token]>1] for doc in corpus]

In [11]:
dictionary = gensim.corpora.Dictionary(corpus)

In [12]:
print(dictionary)

Dictionary(172911 unique tokens: ['gtpaseactivating', 'pbs', '‘estroma’', 'development1', 'f1174l-expressing']...)


In [13]:
len(dictionary)

172911

In [14]:
corpus = [dictionary.doc2bow(doc) for doc in corpus]

In [15]:
tfidf_model, tfidf_corpus = features.gensim_tfidf_vectorizer(corpus, dictionary)

In [16]:
tfidf_corpus

<gensim.interfaces.TransformedCorpus at 0x7f7ca4d60e80>

In [17]:
tfidf_model, tfidf_matrix = features.gensim_tfidf_vectorizer(corpus, dictionary, as_array=True) # support sparsity

In [18]:
tfidf_matrix

<3321x172911 sparse matrix of type '<class 'numpy.float64'>'
	with 6006407 stored elements in Compressed Sparse Row format>

In [19]:
lsi_model, lsi_matrix = features.gensim_lsi_vectorizer(corpus, dictionary, as_array=True)

In [20]:
lsi_matrix.shape

(3321, 100)

In [21]:
lsi_matrix

array([[ 0.01375214,  0.01540729, -0.02547739, ..., -0.00547514,
         0.00380496,  0.00357918],
       [ 0.01831676,  0.02258104, -0.08644725, ..., -0.04254133,
        -0.00678389,  0.02007331],
       [ 0.01831676,  0.02258104, -0.08644725, ..., -0.04254133,
        -0.00678389,  0.02007331],
       ..., 
       [ 0.00963478,  0.01000979, -0.02170232, ..., -0.01399971,
        -0.06204254,  0.03341831],
       [ 0.02482287,  0.02282617, -0.04994173, ..., -0.03918116,
        -0.11681665,  0.03797685],
       [ 0.02428667,  0.02016869, -0.04184014, ..., -0.04007765,
        -0.1284133 ,  0.04204284]], dtype=float32)

In [22]:
lsi_model.print_topic(0,50)

'0.645*"brca1" + 0.288*"brct" + 0.272*"vus" + 0.205*"variants" + 0.150*"deleterious" + 0.145*"brca2" + 0.133*"neutral" + 0.113*"⇓" + 0.111*"nih-pa" + 0.099*"brca" + 0.095*"manuscript" + 0.088*"vuss" + 0.084*"classifi" + 0.079*"odds" + 0.078*"author" + 0.067*".00" + 0.067*"variant" + 0.065*"phosphopeptide" + 0.062*"e2" + 0.060*"m1775r" + 0.055*"causality" + 0.054*"transcriptional" + 0.054*"classification" + 0.052*"rmce" + 0.052*"ed" + 0.052*"117" + 0.051*"bard1" + 0.050*"cisplatin" + 0.046*"risk" + 0.046*"ovarian" + 0.043*"iarc" + 0.042*"cation" + 0.041*"history" + 0.041*"pathogenic" + 0.040*"cient" + 0.040*"breast" + 0.038*"proteolysis" + 0.037*"pmc" + 0.037*"hdr" + 0.037*"loh" + 0.036*"ssa" + 0.036*"classified" + 0.036*"supplementary" + 0.036*"hr" + 0.036*"missense" + 0.035*"0" + 0.035*"yeast" + 0.035*"likelihood" + 0.035*"unclassified" + 0.034*">"'

In [24]:
np.save('../feature_store/test/features.npy', lsi_matrix)

In [25]:
X = np.load('../feature_store/test/features.npy')

In [26]:
X

array([[ 0.01375214,  0.01540729, -0.02547739, ..., -0.00547514,
         0.00380496,  0.00357918],
       [ 0.01831676,  0.02258104, -0.08644725, ..., -0.04254133,
        -0.00678389,  0.02007331],
       [ 0.01831676,  0.02258104, -0.08644725, ..., -0.04254133,
        -0.00678389,  0.02007331],
       ..., 
       [ 0.00963478,  0.01000979, -0.02170232, ..., -0.01399971,
        -0.06204254,  0.03341831],
       [ 0.02482287,  0.02282617, -0.04994173, ..., -0.03918116,
        -0.11681665,  0.03797685],
       [ 0.02428667,  0.02016869, -0.04184014, ..., -0.04007765,
        -0.1284133 ,  0.04204284]], dtype=float32)

In [33]:
X.shape

(3321, 100)

In [27]:
training_variants = data_import.import_training_variants()

In [28]:
training_variants.head()

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


In [29]:
y = training_variants['Class'].as_matrix()

In [30]:
y

array([1, 2, 2, ..., 1, 4, 4])

In [31]:
len(y)

3321

In [115]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

In [132]:
clsfr = LogisticRegression(C=10, multi_class='multinomial', solver='lbfgs', random_state=42)

In [133]:
kfoldcv = KFold(n_splits=5)

In [134]:
def binarize(y):
    return np.array([[1 if label == i else 0 for i in range(1,9)] for label in y])

In [135]:
for train_index, test_index in kfoldcv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clsfr.fit(X_train, y_train)
    y_pred = clsfr.predict(X_test)
    y_pred, y_test = binarize(y_pred), binarize(y_test)
    print(log_loss(y_pred=y_pred, y_true=y_test))

21.1907079235
20.0262483615
22.731092296
24.3956116404
19.5581022959


In [137]:
# not too awesome ;) current best is around 0.5

parameters:

* Classification Algorithm (here, multinomial regression)
* Parameters of classification algorithm (here, for example *C*)
* Topic detection algorithm (LSI, LDA, etc.)
* Parameters of topic detection algorithm (especially, the *number of topics*)
* Preprocessing/Tokenization (remove numeric strings, or strings like "45-56", "23/34", etc., possibly stop words for plain bag of words models)

In [138]:
# preprocessing very crude, only text taken into account, no variant information