## Load data

In [2]:
file_test = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_test.tsv'
file_train = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_train.tsv'

In [3]:
import pandas as pd
test = pd.read_csv(file_test, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
train = pd.read_csv(file_train, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
test.head; train.head

<bound method NDFrame.head of              Label                                               Text
0      NO_RELATION  result identifi previous report breast cancer-...
1      NO_RELATION  conclus taken togeth mmvarentty1mm/ggvarentty1...
2         RELATION  revers correl mmvarentty1mm ggvarentty1gg also...
3         RELATION  result suggest recognit site mmvarentty1mm mmv...
4         RELATION  issu jci ghosh colleagu identifi uniqu microrn...
5      NO_RELATION  conclus mmvarentty1mm may promot cell ggvarent...
6      NO_RELATION        howev role mmvarentty1mm npc remain unknown
7         RELATION  targetscan predict ggvarentty1gg ggvarentty2gg...
8      NO_RELATION  increas mmvarentty1mm depend upon ggvarentty1g...
9      NO_RELATION  revers correl mmvarentty1mm ggvarentty1gg expr...
10        RELATION  studi identifi mmvarentty1mm mmvarentty2mm mic...
11        RELATION  show mmvarentty1mm oncosuppress microrna lost ...
12     NO_RELATION  newli identifi mmvarentty1mm/ggvarentty1

## Vectorise text data into numerical vector

### Using `CountVectorizer`

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(1,1), max_df=0.9, min_df=0.01)
train_tf_vectors = tf_vectorizer.fit_transform(train.Text)

** Note: ** Numbers not removed. Consider removing numbers.

`tf_vectorizer.vocabulary_` is a `dict`

In [5]:
tf_vectorizer.vocabulary_

{'001': 0,
 '05': 1,
 '16': 2,
 '3p': 3,
 '5p': 4,
 'act': 5,
 'activ': 6,
 'addit': 7,
 'affect': 8,
 'aim': 9,
 'alter': 10,
 'analys': 11,
 'analysi': 12,
 'analyz': 13,
 'apoptosi': 14,
 'arrest': 15,
 'assay': 16,
 'associ': 17,
 'bind': 18,
 'bioinformat': 19,
 'biomark': 20,
 'blot': 21,
 'breast': 22,
 'cancer': 23,
 'carcinoma': 24,
 'caus': 25,
 'cell': 26,
 'chain': 27,
 'chang': 28,
 'clinic': 29,
 'cluster': 30,
 'colon': 31,
 'combin': 32,
 'compar': 33,
 'conclus': 34,
 'confirm': 35,
 'contribut': 36,
 'control': 37,
 'correl': 38,
 'crc': 39,
 'cycl': 40,
 'data': 41,
 'decreas': 42,
 'demonstr': 43,
 'depend': 44,
 'detect': 45,
 'determin': 46,
 'develop': 47,
 'differ': 48,
 'differenti': 49,
 'direct': 50,
 'directli': 51,
 'diseas': 52,
 'downregul': 53,
 'downstream': 54,
 'ectop': 55,
 'effect': 56,
 'elev': 57,
 'enhanc': 58,
 'epitheli': 59,
 'evalu': 60,
 'evid': 61,
 'examin': 62,
 'express': 63,
 'factor': 64,
 'famili': 65,
 'final': 66,
 'format': 67,
 'f

In [6]:
train_tf_vectors.shape

(10783, 233)

In [7]:
train_tf_vectors.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [8]:
print(train_tf_vectors.nnz) # number of non-zero elements
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0])) # non-zero per sample
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0] * train_tf_vectors.shape[1]) * 100) # non-zero per feature space in %

101851
9.44551609014189
4.053869566584503


## Data Preprocessing: 
### Attributes and Target

### Convert categorical labels into numerical labels

In [9]:
from sklearn import preprocessing

In [10]:
le = preprocessing.LabelEncoder()
le.fit(train.Label)
list(le.classes_)

['NO_RELATION', 'RELATION']

In [11]:
y_train = le.transform(train.Label);
y_test = le.transform(test.Label);

print(y_train[0:5])
print(le.inverse_transform(y_train[0:5]))

[0 0 1 1 1]
['NO_RELATION' 'NO_RELATION' 'RELATION' 'RELATION' 'RELATION']


`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [12]:
X_train = train_tf_vectors

In [13]:
X_test = tf_vectorizer.transform(test.Text)

In [14]:
X_test.shape

(3595, 233)

In [15]:
X_test.data

array([1, 1, 2, ..., 1, 1, 1], dtype=int64)

## Log Res: 

In [16]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(C=1e5, class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [17]:
y_pred = clf.predict(X_test)

In [18]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.71      0.73      0.72      1804
          1       0.72      0.71      0.71      1791

avg / total       0.72      0.72      0.72      3595



In [19]:
print(metrics.confusion_matrix(y_test, y_pred))

[[1315  489]
 [ 525 1266]]


### Coeficients of the decision function

In [20]:
clf.coef_

array([[ -1.60030090e-02,  -2.38254074e-01,   7.84509452e-01,
         -1.06175508e-01,  -5.01240271e-01,   5.39438522e-02,
          1.00129507e-01,   1.42094355e-01,   1.78047970e-03,
         -3.73957350e-01,   1.53713418e-01,  -9.33409397e-02,
         -1.50474940e-01,   1.05350848e-02,  -3.44762742e-02,
         -1.75948470e-01,   1.60760528e-02,  -2.77309459e-01,
          2.16794536e-01,   5.48282550e-01,  -5.88849445e-01,
          1.12736273e+00,  -2.52484060e-02,  -3.35002925e-01,
         -4.87978179e-01,   2.90266545e-01,  -9.98169412e-02,
          2.59060019e-01,  -1.92870980e-01,  -2.53116895e-01,
          2.27985974e-01,   2.68898366e-01,   1.15485484e-01,
         -3.26028522e-01,   6.30374774e-02,   2.39763631e-01,
          1.00923783e-01,   9.23161216e-02,   5.14224947e-02,
          1.05154203e-02,   5.26876657e-01,  -8.44316618e-02,
          1.77237527e-01,  -5.25027500e-02,   2.95671625e-01,
         -1.37840077e-01,   2.88351516e-01,  -2.19595177e-01,
        

In [21]:
len(clf.coef_[0,])

233

Get the index of coef with the maximum value.

In [22]:
clf.coef_.argmax()

21

In [23]:
m = clf.coef_.argmax()
print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(m)])

blot


Get the index of the highest 10 coef value.

**Note** `clf.coef_` is ndarray, see below:

In [24]:
clf.coef_[0]

array([ -1.60030090e-02,  -2.38254074e-01,   7.84509452e-01,
        -1.06175508e-01,  -5.01240271e-01,   5.39438522e-02,
         1.00129507e-01,   1.42094355e-01,   1.78047970e-03,
        -3.73957350e-01,   1.53713418e-01,  -9.33409397e-02,
        -1.50474940e-01,   1.05350848e-02,  -3.44762742e-02,
        -1.75948470e-01,   1.60760528e-02,  -2.77309459e-01,
         2.16794536e-01,   5.48282550e-01,  -5.88849445e-01,
         1.12736273e+00,  -2.52484060e-02,  -3.35002925e-01,
        -4.87978179e-01,   2.90266545e-01,  -9.98169412e-02,
         2.59060019e-01,  -1.92870980e-01,  -2.53116895e-01,
         2.27985974e-01,   2.68898366e-01,   1.15485484e-01,
        -3.26028522e-01,   6.30374774e-02,   2.39763631e-01,
         1.00923783e-01,   9.23161216e-02,   5.14224947e-02,
         1.05154203e-02,   5.26876657e-01,  -8.44316618e-02,
         1.77237527e-01,  -5.25027500e-02,   2.95671625e-01,
        -1.37840077e-01,   2.88351516e-01,  -2.19595177e-01,
         7.21475667e-03,

In [25]:
print(clf.coef_[0][0])
print(clf.coef_[0][1])

-0.0160030089972
-0.238254074159


Get top N words

In [26]:
import numpy as np
topN = np.argsort(clf.coef_[0])[::-1][:20]
print(topN)

[ 21 100 130   2 181 211 148  98  50 167  19  40 185 188  74 192  51  53
 227 203]


In [27]:
for v in topN:
    print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(v)])


blot
invers
mmvarentty5mm
16
repress
target
partial
interact
direct
protein
bioinformat
cycl
restor
revers
ggvarentty2gg
rt
directli
downregul
utr
site


| Number of training Sentences | word          | example               |
| ---------------------------- |:-------------:|:---------------------:|         
| 217                          | blot          | western blot          |
| 273                          | invers        | invers correl, invers relation |
| 182                          | mmvarentty5mm | mmvarentty1mm mmvarentty2mm mmvarentty1mm25a mmvarentty4mm mmvarentty5mm|
| 277                          | 16            | ggvarentty1gg-16 |
| 369                          | repress       | mmvarentty1mm repress, repress ggvarentty1gg|
| 2480                         | target        | mmvarentty1mm target, target ggvarentty1gg|
| 109                          | partial       | partial target, partial revers | 
| 150                          | interact      | interact ggvarentty1gg |
| 760                          | direct        | directli bind, directli target |
| 635                          | protein       | protein express, protein level |
| 124                          | bioinformat   | bioinformat analysi |
| 210                          | cycl          | cell cycl |
| 179                          | restor        | restor express ggvarentty1gg, restor mmvarentty1mm |
| 236                          | revers        | revers transcript |
| 4                            | rt            |rt -qpcr, quantit rt ggvarentty1gg, transcript rt -pcr, rt profil pcr array|
| 348                          | directli      | directli target, directli bind |
| 723                          | downregul     | mmvarentty1mm downregul, ggvarentty2gg downregul|
| 188                          | utr           | 3'utr ggvarentty1gg |
| 221                          | site          | bind site, seed site, target site |



Get bottom N words

In [30]:
bottomN = np.argsort(clf.coef_[0])[::1][:20]
print(bottomN)

[170 189 205 151 122 132 108  20 140 134 150  83 172   4  86  52  24 131
 118 228]


In [31]:
for v in bottomN:
    print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(v)])


reaction
risk
stage
pcr
mice
model
liver
biomark
non
molecular
patient
healthi
recent
5p
howev
diseas
carcinoma
mmvarentty6mm
metastasi
valid
