## Load data

In [30]:
file_test = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_test.tsv'
file_train = 'C:\Data\Miroculus\A3_1_2016_text_inputs\A3_1_2016_binary_data_entities_normalize_rmstopwords_stem_train.tsv'

In [31]:
import pandas as pd
test = pd.read_csv(file_test, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
train = pd.read_csv(file_train, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
test.head; train.head

<bound method NDFrame.head of              Label                                               Text
0      NO_RELATION  result identifi previous report breast cancer-...
1      NO_RELATION  conclus taken togeth mmvarentty1mm/ggvarentty1...
2         RELATION  revers correl mmvarentty1mm ggvarentty1gg also...
3         RELATION  result suggest recognit site mmvarentty1mm mmv...
4         RELATION  issu jci ghosh colleagu identifi uniqu microrn...
5      NO_RELATION  conclus mmvarentty1mm may promot cell ggvarent...
6      NO_RELATION        howev role mmvarentty1mm npc remain unknown
7         RELATION  targetscan predict ggvarentty1gg ggvarentty2gg...
8      NO_RELATION  increas mmvarentty1mm depend upon ggvarentty1g...
9      NO_RELATION  revers correl mmvarentty1mm ggvarentty1gg expr...
10        RELATION  studi identifi mmvarentty1mm mmvarentty2mm mic...
11        RELATION  show mmvarentty1mm oncosuppress microrna lost ...
12     NO_RELATION  newli identifi mmvarentty1mm/ggvarentty1

## Vectorise text data into numerical vector

### Using `CountVectorizer`

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(analyzer='word', stop_words='english', ngram_range=(2,2), max_df=0.9, min_df=0.01)
train_tf_vectors = tf_vectorizer.fit_transform(train.Text)

** Note: ** Numbers not removed. Consider removing numbers.

`tf_vectorizer.vocabulary_` is a `dict`

In [33]:
tf_vectorizer.vocabulary_

{'breast cancer': 0,
 'cancer cell': 1,
 'cancer tissu': 2,
 'cell cycl': 3,
 'cell growth': 4,
 'cell line': 5,
 'cell migrat': 6,
 'cell prolifer': 7,
 'chain reaction': 8,
 'conclus mmvarentty1mm': 9,
 'correl ggvarentty1gg': 10,
 'correl mmvarentty1mm': 11,
 'decreas express': 12,
 'demonstr mmvarentty1mm': 13,
 'direct target': 14,
 'directli target': 15,
 'downregul ggvarentty1gg': 16,
 'downregul mmvarentty1mm': 17,
 'effect mmvarentty1mm': 18,
 'express ggvarentty1gg': 19,
 'express ggvarentty2gg': 20,
 'express level': 21,
 'express mmvarentty1mm': 22,
 'gastric cancer': 23,
 'gene express': 24,
 'gene ggvarentty1gg': 25,
 'gene mmvarentty1mm': 26,
 'ggvarentty1gg cell': 27,
 'ggvarentty1gg express': 28,
 'ggvarentty1gg gene': 29,
 'ggvarentty1gg ggvarentty2gg': 30,
 'ggvarentty1gg induc': 31,
 'ggvarentty1gg mmvarentty1mm': 32,
 'ggvarentty1gg mrna': 33,
 'ggvarentty1gg protein': 34,
 'ggvarentty1gg regul': 35,
 'ggvarentty1gg target': 36,
 'ggvarentty2gg cell': 37,
 'ggvaren

In [34]:
train_tf_vectors.shape

(10783, 108)

In [35]:
train_tf_vectors.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [36]:
print(train_tf_vectors.nnz) # number of non-zero elements
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0])) # non-zero per sample
print(train_tf_vectors.nnz / float(train_tf_vectors.shape[0] * train_tf_vectors.shape[1]) * 100) # non-zero per feature space in %

29383
2.724937401465269
2.523090186541916


## Data Preprocessing: 
### Attributes and Target

### Convert categorical labels into numerical labels

In [37]:
from sklearn import preprocessing

In [38]:
le = preprocessing.LabelEncoder()
le.fit(train.Label)
list(le.classes_)

['NO_RELATION', 'RELATION']

In [39]:
y_train = le.transform(train.Label);
y_test = le.transform(test.Label);

print(y_train[0:5])
print(le.inverse_transform(y_train[0:5]))

[0 0 1 1 1]
['NO_RELATION' 'NO_RELATION' 'RELATION' 'RELATION' 'RELATION']


`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [40]:
X_train = train_tf_vectors

In [41]:
X_test = tf_vectorizer.transform(test.Text)

In [42]:
X_test.shape

(3595, 108)

In [43]:
X_test.data

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Log Res: 

In [44]:
from sklearn import linear_model
clf = linear_model.LogisticRegression(C=1e5, class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [45]:
y_pred = clf.predict(X_test)

In [46]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.69      0.76      0.72      1804
          1       0.73      0.65      0.69      1791

avg / total       0.71      0.70      0.70      3595



In [47]:
print(metrics.confusion_matrix(y_test, y_pred))

[[1373  431]
 [ 631 1160]]


### Coeficients of the decision function

In [48]:
clf.coef_

array([[-0.33303924,  0.00645408, -0.20657451,  0.44060459, -0.18557739,
        -0.1673157 ,  0.05561758,  0.08112488, -0.41409544, -0.04720798,
         0.52311038,  0.14795392, -0.16729237,  0.06679747,  0.66817234,
         0.25102087,  0.64412386,  0.19471715,  0.21159313,  0.64504622,
         0.50897821, -0.06472291, -0.03217674, -0.08037165,  0.1283544 ,
         1.04062561,  0.83093581, -0.46118641,  0.83404795,  0.62589316,
         0.80009112,  0.35812912,  0.36216952,  0.72031423,  1.09895227,
         0.50504689,  0.60650616, -0.01525531,  0.89084602,  0.60078494,
         0.51568274,  0.68694638,  0.92731859,  0.46527496,  0.5627918 ,
         0.44008352,  0.61550533,  0.04912107,  0.06917168,  0.34085323,
         0.0463352 ,  0.37660683,  0.11026023,  0.46499273,  0.53485614,
        -0.444088  ,  0.31189353, -0.1177417 ,  0.03448994, -0.41593154,
        -0.69668684,  0.09109787, -0.323124  ,  0.53079439,  0.68930487,
         0.09070634, -0.34366718,  0.05003418,  0.1

In [49]:
len(clf.coef_[0,])

108

Get the index of coef with the maximum value.

In [50]:
clf.coef_.argmax()

101

In [51]:
m = clf.coef_.argmax()
print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(m)])

target ggvarentty2gg


Get the index of the highest 10 coef value.

**Note** `clf.coef_` is ndarray, see below:

In [52]:
clf.coef_[0]

array([-0.33303924,  0.00645408, -0.20657451,  0.44060459, -0.18557739,
       -0.1673157 ,  0.05561758,  0.08112488, -0.41409544, -0.04720798,
        0.52311038,  0.14795392, -0.16729237,  0.06679747,  0.66817234,
        0.25102087,  0.64412386,  0.19471715,  0.21159313,  0.64504622,
        0.50897821, -0.06472291, -0.03217674, -0.08037165,  0.1283544 ,
        1.04062561,  0.83093581, -0.46118641,  0.83404795,  0.62589316,
        0.80009112,  0.35812912,  0.36216952,  0.72031423,  1.09895227,
        0.50504689,  0.60650616, -0.01525531,  0.89084602,  0.60078494,
        0.51568274,  0.68694638,  0.92731859,  0.46527496,  0.5627918 ,
        0.44008352,  0.61550533,  0.04912107,  0.06917168,  0.34085323,
        0.0463352 ,  0.37660683,  0.11026023,  0.46499273,  0.53485614,
       -0.444088  ,  0.31189353, -0.1177417 ,  0.03448994, -0.41593154,
       -0.69668684,  0.09109787, -0.323124  ,  0.53079439,  0.68930487,
        0.09070634, -0.34366718,  0.05003418,  0.11574094,  0.05

In [53]:
print(clf.coef_[0][0])
print(clf.coef_[0][1])

-0.333039235313
0.00645408104599


In [54]:
import numpy as np
topN = np.argsort(clf.coef_[0])[::-1][:20]
print(topN)

[101  34  25 102  87 100 107 105  42  89  38  28  26  30  88  85  97  33
  64  41]


In [55]:
for v in topN:
    print(list(tf_vectorizer.vocabulary_.keys())[list(tf_vectorizer.vocabulary_.values()).index(v)])


target ggvarentty2gg
ggvarentty1gg protein
gene ggvarentty1gg
target mmvarentty1mm
regul express
target ggvarentty1gg
western blot
untransl region
ggvarentty3gg express
regul ggvarentty2gg
ggvarentty2gg express
ggvarentty1gg express
gene mmvarentty1mm
ggvarentty1gg ggvarentty2gg
regul ggvarentty1gg
protein level
suppress ggvarentty1gg
ggvarentty1gg mrna
mmvarentty1mm ggvarentty2gg
ggvarentty2gg mrna
