## Load data

In [2]:
file_test = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_test.tsv'
file_train = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_train.tsv'

In [3]:
import pandas as pd
test = pd.read_csv(file_test, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
train = pd.read_csv(file_train, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
test.head; train.head

<bound method NDFrame.head of              Label                                               Text
0      NO_RELATION  result identifi previous report breast cancer-...
1      NO_RELATION  conclus taken togeth mmvarentty1mm/ggvarentty1...
2         RELATION  revers correl mmvarentty1mm ggvarentty1gg also...
3         RELATION  result suggest recognit site mmvarentty1mm mmv...
4         RELATION  issu jci ghosh colleagu identifi uniqu microrn...
5      NO_RELATION  conclus mmvarentty1mm may promot cell ggvarent...
6      NO_RELATION        howev role mmvarentty1mm npc remain unknown
7         RELATION  targetscan predict ggvarentty1gg ggvarentty2gg...
8      NO_RELATION  increas mmvarentty1mm depend upon ggvarentty1g...
9      NO_RELATION  revers correl mmvarentty1mm ggvarentty1gg expr...
10        RELATION  studi identifi mmvarentty1mm mmvarentty2mm mic...
11        RELATION  show mmvarentty1mm oncosuppress microrna lost ...
12     NO_RELATION  newli identifi mmvarentty1mm/ggvarentty1

## Vectorise text data into numerical vector

### Using `CountVectorizer`

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(3,3), max_df=0.9, min_df=0.01)
train_tf_vectors = tf_vectorizer.fit_transform(train.Text)

** Note: ** Numbers not removed. Consider removing numbers.

`tf_vectorizer.vocabulary_` is a `dict`

In [5]:
tf_vectorizer.vocabulary_

{'direct target mmvarentty1mm': 0,
 'express ggvarentty1gg ggvarentty2gg': 1,
 'express level mmvarentty1mm': 2,
 'express mmvarentty1mm mmvarentty2mm': 3,
 'gastric cancer cell': 4,
 'ggvarentty1gg ggvarentty2gg ggvarentty3gg': 5,
 'ggvarentty2gg ggvarentty3gg ggvarentty4gg': 6,
 'ggvarentty3gg ggvarentty4gg ggvarentty5gg': 7,
 'ggvarentty4gg ggvarentty5gg ggvarentty6gg': 8,
 'luciferas report assay': 9,
 'mirna mmvarentty1mm mmvarentty2mm': 10,
 'mmvarentty1mm ggvarentty1gg ggvarentty2gg': 11,
 'mmvarentty1mm mmvarentty2mm mmvarentty3mm': 12,
 'mmvarentty2mm mmvarentty3mm mmvarentty4mm': 13,
 'mmvarentty3mm mmvarentty4mm mmvarentty5mm': 14,
 'target gene mmvarentty1mm': 15,
 'target ggvarentty1gg ggvarentty2gg': 16}

In [6]:
train_tf_vectors.shape

(10783, 17)

In [7]:
train_tf_vectors.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [8]:
print(train_tf_vectors.nnz) # number of non-zero elements
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0])) # non-zero per sample
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0] * train_tf_vectors.shape[1]) * 100) # non-zero per feature space in %

3227
0.29926736529722714
1.7603962664542772


## Data Preprocessing: 
### Attributes and Target

### Convert categorical labels into numerical labels

In [9]:
from sklearn import preprocessing

In [10]:
le = preprocessing.LabelEncoder()
le.fit(train.Label)
list(le.classes_)

['NO_RELATION', 'RELATION']

In [11]:
y_train = le.transform(train.Label);
y_test = le.transform(test.Label);

print(y_train[0:5])
print(le.inverse_transform(y_train[0:5]))

[0 0 1 1 1]
['NO_RELATION' 'NO_RELATION' 'RELATION' 'RELATION' 'RELATION']


`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [12]:
X_train = train_tf_vectors

In [13]:
X_test = tf_vectorizer.transform(test.Text)

In [14]:
X_test.shape

(3595, 17)

In [15]:
X_test.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Log Res: 

In [16]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(C=1e5, class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
y_pred = clf.predict(X_test)

In [18]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.54      0.91      0.68      1804
          1       0.71      0.22      0.34      1791

avg / total       0.62      0.57      0.51      3595



In [19]:
print(metrics.confusion_matrix(y_test, y_pred))

[[1639  165]
 [1396  395]]


### Coeficients of the decision function

In [20]:
clf.coef_

array([[ 1.63366473,  0.83900127, -0.60440172,  0.44442497,  0.44715121,
         0.93432491,  0.61029899,  0.86626551,  0.72013569,  1.3455731 ,
        -0.20726708,  0.65146497, -0.11413643, -0.11272087,  0.50552615,
         1.20698922,  0.80918582]])

In [21]:
len(clf.coef_[0,])

17

Get the index of coef with the maximum value.

In [22]:
clf.coef_.argmax()

0

In [23]:
m = clf.coef_.argmax()
print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(m)])

direct target mmvarentty1mm


Get the index of the highest 10 coef value.

**Note** `clf.coef_` is ndarray, see below:

In [24]:
clf.coef_[0]

array([ 1.63366473,  0.83900127, -0.60440172,  0.44442497,  0.44715121,
        0.93432491,  0.61029899,  0.86626551,  0.72013569,  1.3455731 ,
       -0.20726708,  0.65146497, -0.11413643, -0.11272087,  0.50552615,
        1.20698922,  0.80918582])

In [25]:
print(clf.coef_[0][0])
print(clf.coef_[0][1])

1.63366473432
0.839001268965


In [26]:
import numpy as np
topN = np.argsort(clf.coef_[0])[::-1][:20]
print(topN)

[ 0  9 15  5  7  1 16  8 11  6 14  4  3 13 12 10  2]


In [27]:
for v in topN:
    print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(v)])


direct target mmvarentty1mm
luciferas report assay
target gene mmvarentty1mm
ggvarentty1gg ggvarentty2gg ggvarentty3gg
ggvarentty3gg ggvarentty4gg ggvarentty5gg
express ggvarentty1gg ggvarentty2gg
target ggvarentty1gg ggvarentty2gg
ggvarentty4gg ggvarentty5gg ggvarentty6gg
mmvarentty1mm ggvarentty1gg ggvarentty2gg
ggvarentty2gg ggvarentty3gg ggvarentty4gg
mmvarentty3mm mmvarentty4mm mmvarentty5mm
gastric cancer cell
express mmvarentty1mm mmvarentty2mm
mmvarentty2mm mmvarentty3mm mmvarentty4mm
mmvarentty1mm mmvarentty2mm mmvarentty3mm
mirna mmvarentty1mm mmvarentty2mm
express level mmvarentty1mm
