## Load data

In [1]:
file_test = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_test.tsv'
file_train = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_train.tsv'

In [2]:
import pandas as pd
test = pd.read_csv(file_test, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
train = pd.read_csv(file_train, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
test.head; train.head

<bound method NDFrame.head of              Label                                               Text
0      NO_RELATION  result identifi previous report breast cancer-...
1      NO_RELATION  conclus taken togeth mmvarentty1mm/ggvarentty1...
2         RELATION  revers correl mmvarentty1mm ggvarentty1gg also...
3         RELATION  result suggest recognit site mmvarentty1mm mmv...
4         RELATION  issu jci ghosh colleagu identifi uniqu microrn...
5      NO_RELATION  conclus mmvarentty1mm may promot cell ggvarent...
6      NO_RELATION        howev role mmvarentty1mm npc remain unknown
7         RELATION  targetscan predict ggvarentty1gg ggvarentty2gg...
8      NO_RELATION  increas mmvarentty1mm depend upon ggvarentty1g...
9      NO_RELATION  revers correl mmvarentty1mm ggvarentty1gg expr...
10        RELATION  studi identifi mmvarentty1mm mmvarentty2mm mic...
11        RELATION  show mmvarentty1mm oncosuppress microrna lost ...
12     NO_RELATION  newli identifi mmvarentty1mm/ggvarentty1

## Vectorise text data into numerical vector

### Using `CountVectorizer`

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), max_df=0.9, min_df=0.01)
train_tf_vectors = tf_vectorizer.fit_transform(train.Text)

** Note: ** Numbers not removed. Consider removing numbers.

`tf_vectorizer.vocabulary_` is a `dict`

In [4]:
tf_vectorizer.vocabulary_

{'001': 0,
 '05': 1,
 '16': 2,
 '3p': 3,
 '5p': 4,
 'act': 5,
 'activ': 6,
 'addit': 7,
 'affect': 8,
 'aim': 9,
 'alter': 10,
 'analys': 11,
 'analysi': 12,
 'analyz': 13,
 'apoptosi': 14,
 'arrest': 15,
 'assay': 16,
 'associ': 17,
 'bind': 18,
 'bioinformat': 19,
 'biomark': 20,
 'blot': 21,
 'breast': 22,
 'breast cancer': 23,
 'cancer': 24,
 'cancer cell': 25,
 'cancer tissu': 26,
 'carcinoma': 27,
 'caus': 28,
 'cell': 29,
 'cell cycl': 30,
 'cell growth': 31,
 'cell line': 32,
 'cell migrat': 33,
 'cell prolifer': 34,
 'chain': 35,
 'chain reaction': 36,
 'chang': 37,
 'clinic': 38,
 'cluster': 39,
 'colon': 40,
 'combin': 41,
 'compar': 42,
 'conclus': 43,
 'conclus mmvarentty1mm': 44,
 'confirm': 45,
 'contribut': 46,
 'control': 47,
 'correl': 48,
 'correl ggvarentty1gg': 49,
 'correl mmvarentty1mm': 50,
 'crc': 51,
 'cycl': 52,
 'data': 53,
 'decreas': 54,
 'decreas express': 55,
 'demonstr': 56,
 'demonstr mmvarentty1mm': 57,
 'depend': 58,
 'detect': 59,
 'determin': 60,
 

In [5]:
train_tf_vectors.shape

(10783, 341)

In [6]:
train_tf_vectors.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [7]:
print(train_tf_vectors.nnz) # number of non-zero elements
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0])) # non-zero per sample
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0] * train_tf_vectors.shape[1]) * 100) # non-zero per feature space in %

131234
12.170453491607159
3.5690479447528323


## Data Preprocessing: 
### Attributes and Target

### Convert categorical labels into numerical labels

In [8]:
from sklearn import preprocessing

In [9]:
le = preprocessing.LabelEncoder()
le.fit(train.Label)
list(le.classes_)

['NO_RELATION', 'RELATION']

In [10]:
y_train = le.transform(train.Label);
y_test = le.transform(test.Label);

print(y_train[0:5])
print(le.inverse_transform(y_train[0:5]))

[0 0 1 1 1]
['NO_RELATION' 'NO_RELATION' 'RELATION' 'RELATION' 'RELATION']


`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [11]:
X_train = train_tf_vectors

In [12]:
X_test = tf_vectorizer.transform(test.Text)

In [13]:
X_test.shape

(3595, 341)

In [14]:
X_test.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Log Res: 

In [15]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(C=1e5, class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [16]:
y_pred = clf.predict(X_test)

In [17]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.73      0.74      0.73      1804
          1       0.73      0.72      0.73      1791

avg / total       0.73      0.73      0.73      3595



In [18]:
print(metrics.confusion_matrix(y_test, y_pred))

[[1326  478]
 [ 499 1292]]


### Coeficients of the decision function

In [19]:
clf.coef_

array([[ 0.07226891, -0.23958631,  0.95081376, -0.03095743, -0.29820422,
         0.07140192,  0.17854329,  0.14072876, -0.01347017, -0.40670343,
         0.2347351 , -0.05041299, -0.11829966, -0.07978175, -0.01986569,
        -0.2541388 , -0.076575  , -0.23541402,  0.14610485,  0.44539902,
        -0.54109665,  1.34708047, -0.20058182,  0.2053167 , -0.41921526,
         0.0455621 ,  0.35061645, -0.54706411,  0.35355901, -0.0432551 ,
         0.58688697, -0.09390101,  0.74804887,  0.09223176, -0.07992314,
         0.66856296, -1.29499692, -0.15951544, -0.20907029,  0.27893341,
         0.23789606,  0.1104066 , -0.21950584,  0.07577404, -0.0174021 ,
         0.16298987,  0.06221673,  0.05742714, -0.08807342,  0.77072882,
         0.38481673, -0.06683054, -0.0267069 , -0.10986448,  0.28134874,
        -0.48967618, -0.03562234,  0.03718128,  0.39929988, -0.07342503,
         0.29680462, -0.23599366,  0.1304277 , -0.00354241,  0.43900867,
         0.15113361,  0.7517656 , -0.71635569, -0.4

In [20]:
len(clf.coef_[0,])

341

Get the index of coef with the maximum value.

In [21]:
clf.coef_.argmax()

338

In [22]:
m = clf.coef_.argmax()
print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(m)])

western


Get the index of the highest 10 coef value.

**Note** `clf.coef_` is ndarray, see below:

In [23]:
clf.coef_[0]

array([ 0.07226891, -0.23958631,  0.95081376, -0.03095743, -0.29820422,
        0.07140192,  0.17854329,  0.14072876, -0.01347017, -0.40670343,
        0.2347351 , -0.05041299, -0.11829966, -0.07978175, -0.01986569,
       -0.2541388 , -0.076575  , -0.23541402,  0.14610485,  0.44539902,
       -0.54109665,  1.34708047, -0.20058182,  0.2053167 , -0.41921526,
        0.0455621 ,  0.35061645, -0.54706411,  0.35355901, -0.0432551 ,
        0.58688697, -0.09390101,  0.74804887,  0.09223176, -0.07992314,
        0.66856296, -1.29499692, -0.15951544, -0.20907029,  0.27893341,
        0.23789606,  0.1104066 , -0.21950584,  0.07577404, -0.0174021 ,
        0.16298987,  0.06221673,  0.05742714, -0.08807342,  0.77072882,
        0.38481673, -0.06683054, -0.0267069 , -0.10986448,  0.28134874,
       -0.48967618, -0.03562234,  0.03718128,  0.39929988, -0.07342503,
        0.29680462, -0.23599366,  0.1304277 , -0.00354241,  0.43900867,
        0.15113361,  0.7517656 , -0.71635569, -0.47629861,  0.45

In [24]:
print(clf.coef_[0][0])
print(clf.coef_[0][1])

0.0722689080744
-0.239586311844


In [25]:
import numpy as np
topN = np.argsort(clf.coef_[0])[::-1][:20]
print(topN)

[338  21 153 308 249  98   2 206  99 312 208 314  49 273 230  66  32 108
 313 151]


In [26]:
for v in topN:
    print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(v)])


western
blot
invers
taken
prostat
gene ggvarentty1gg
16
mmvarentty3mm mmvarentty4mm
gene mmvarentty1mm
target ggvarentty1gg
mmvarentty4mm mmvarentty5mm
target mmvarentty1mm
correl ggvarentty1gg
repress
partial
directli
cell line
ggvarentty1gg protein
target ggvarentty2gg
interact
