## Load data

In [1]:
filename = "C:\Data\somedata.txt" # Contain positive and negative samples

In [2]:
import pandas as pd
data = pd.read_csv(filename, quotechar='"', delimiter='\t')
#data

## Vectorise text data into numerical vector

### Using `TfidfVectorizer`

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,1), max_df=0.9, min_df=1)
tfidf_vectors = tfidf_vectorizer.fit_transform(data.Sentence)

** Note: ** Numbers not removed. May try **remove numbers, lematize.**

In [None]:
tfidf_vectorizer.vocabulary_

In [5]:
tfidf_vectors.shape

(34990, 11787)

In [6]:
tfidf_vectors.data

array([ 0.37786627,  0.19254191,  0.17388721, ...,  0.36261596,
        0.20614628,  0.20487457])

In [7]:
print(tfidf_vectors.nnz) # number of non-zero elements
print(tfidf_vectors.nnz / float(tfidf_vectors.shape[0])) # non-zero per sample
print(tfidf_vectors.nnz / float(tfidf_vectors.shape[0] * tfidf_vectors.shape[1]) * 100) # non-zero per feature space in %

431739
12.338925407259216
0.10468249263815405


## Data Preprocessing: 
### Attributes and Target

`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [8]:
X = tfidf_vectors

### Convert categorical labels into numerical labels

In [9]:
from sklearn import preprocessing

In [10]:
le = preprocessing.LabelEncoder()
le.fit(data.Label)
list(le.classes_)

['Neg', 'Pos']

In [11]:
y = le.transform(data.Label) 
print(y[0:5])
print(y[(len(y)-5):len(y)])
print(le.inverse_transform(y[0:5]))
print(le.inverse_transform(y[(len(y)-5):len(y)]))

[1 1 1 1 1]
[0 0 0 0 0]
['Pos' 'Pos' 'Pos' 'Pos' 'Pos']
['Neg' 'Neg' 'Neg' 'Neg' 'Neg']


## Data splitting

**Either:** Split data into train and test set. **Or:** load readily split data.

In [12]:
from sklearn.cross_validation import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8) # random_state is fixed value for reproducibility

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(23443, 11787)
(11547, 11787)
(23443,)
(11547,)


## SVM: 
### linear kernal

When classes are imbalanced, set `class_weight` to `'balanced'`.

In [15]:
from sklearn.svm import SVC # "Support Vector Classifier"
clf = SVC(kernel='linear', class_weight = 'balanced')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
y_svm_linear_pred = clf.predict(X_test)

In [17]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_svm_linear_pred))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      9673
          1       0.87      0.86      0.87      1874

avg / total       0.96      0.96      0.96     11547



In [18]:
print(metrics.confusion_matrix(y_test, y_svm_linear_pred))

[[9430  243]
 [ 260 1614]]


In [19]:
print(clf.n_support_) # number of support vectors for each class.

[4544 1101]


In [20]:
print(clf.support_) # indices of support vectors
print(len(clf.support_)) 

[   15    17    31 ..., 23426 23429 23435]
5645


In [21]:
print(clf.support_vectors_) # (0, 788) means the first support vector is of feature_names[788], feature_names[6688] and so on.

  (0, 788)	0.334508581932
  (0, 6688)	0.186432449526
  (0, 7663)	0.714128089135
  (0, 9585)	0.329624531452
  (0, 11189)	0.484474654601
  (1, 92)	0.336436716357
  (1, 2060)	0.425731411025
  (1, 6688)	0.431859035414
  (1, 6706)	0.201941118239
  (1, 7810)	0.280013131725
  (1, 8195)	0.324795349738
  (1, 8548)	0.325565704377
  (1, 9957)	0.283056855428
  (1, 9977)	0.329039625923
  (2, 280)	0.244739785246
  (2, 483)	0.135348274614
  (2, 4252)	0.373543991967
  (2, 5051)	0.147763322061
  (2, 5687)	0.295458566666
  (2, 7992)	0.352468214254
  (2, 7993)	0.428678333134
  (2, 9974)	0.36242213174
  (2, 10126)	0.308621057875
  (2, 10300)	0.230676341569
  (2, 11410)	0.145368601933
  :	:
  (5643, 1393)	0.191829832598
  (5643, 1584)	0.177978050135
  (5643, 2042)	0.475545661825
  (5643, 3270)	0.278602062488
  (5643, 4146)	0.308696187528
  (5643, 8013)	0.41180157506
  (5643, 8774)	0.337733574857
  (5643, 10403)	0.335440746579
  (5643, 10997)	0.369193726685
  (5644, 788)	0.132376373359
  (5644, 1393)	0.1164

In [22]:
arr = clf.support_vectors_.toarray(); len(arr)

5645

In [23]:
len(clf.support_)

5645

In [24]:
b_1 = clf.support_vectors_[4543]
b0 = clf.support_vectors_[0]
b1 = clf.support_vectors_[5644]
clf.support_vectors_[0].nnz

5

In [25]:
clf.support_vectors_[0].indices

array([  788,  6688,  7663,  9585, 11189])

In [26]:
len(clf.support_vectors_[0].indices) # length of each support vector in support_vectors

5

In [27]:
print(b_1); print('\n'); print(b0); print('\n'); print(b1)


  (0, 24)	0.253176603594
  (0, 1584)	0.119961763146
  (0, 1981)	0.259877337811
  (0, 2678)	0.338715819544
  (0, 4072)	0.333343111562
  (0, 4157)	0.193119073801
  (0, 5193)	0.352253962419
  (0, 6101)	0.389121416827
  (0, 6378)	0.267592983865
  (0, 9456)	0.405435525294
  (0, 11575)	0.199873299198
  (0, 11576)	0.198499123366


  (0, 788)	0.334508581932
  (0, 6688)	0.186432449526
  (0, 7663)	0.714128089135
  (0, 9585)	0.329624531452
  (0, 11189)	0.484474654601


  (0, 788)	0.132376373359
  (0, 1393)	0.116441355388
  (0, 2489)	0.390236718846
  (0, 3110)	0.228624762536
  (0, 3835)	0.176060826691
  (0, 4262)	0.174996446811
  (0, 5251)	0.163318941539
  (0, 6427)	0.288657820641
  (0, 6434)	0.283545881174
  (0, 6538)	0.170181824469
  (0, 6628)	0.390236718846
  (0, 6734)	0.390236718846
  (0, 6985)	0.219101966499
  (0, 9586)	0.180773261655
  (0, 10676)	0.246034182361
  (0, 10996)	0.193905560136


In [28]:
feature_names = tfidf_vectorizer.get_feature_names()

In [29]:
i = clf.support_[5] #i is indices of support vectors
print(i)
feature_names[i] #11786 is the last index

50


'abstracts'

In [31]:
import numpy as np
np.asarray(feature_names)
len(np.asarray(feature_names))

11787

** Plot this? **

In [32]:
np.asarray(clf.support_vectors_) 

array(<5645x11787 sparse matrix of type '<class 'numpy.float64'>'
	with 68495 stored elements in Compressed Sparse Row format>, dtype=object)

In [35]:
clf.coef_.shape

(1, 11787)

The top word

In [36]:
clf.coef_.max()

12.703813543597244

In [37]:
max_val = clf.coef_.max()
max_index = [i for i in range(clf.coef_.shape[1]) if clf.coef_[0,i] == max_val]
print(max_index)

[3788]


In [38]:
feature_names[max_index[0]]

'expression'

Top 10 words.

Below is a lengthty way. ?? How to access sparse matrix ??

In [39]:
coef = clf.coef_.data; print(coef)
top10_ind = np.argsort(coef)[-10:]; print(top10_ind) # top10 indexes
top10_coef = coef[top10_ind]; print(top10_coef)

[ 0.22481396  0.22481396  0.22481396 ...,  0.23581732 -0.29174322
 -1.27559806]
[808 772 853 666 792 898 822 845 858 895]
[  4.62451548   4.92914911   4.97395186   5.17819179   5.3102506
   5.94092569   6.74814103   6.79936762   8.22061429  12.70381354]


In [40]:
for i in top10_coef:
    print(i)

4.62451547591
4.92914910599
4.97395185537
5.17819179054
5.31025059833
5.94092569482
6.74814103087
6.79936762057
8.22061428955
12.7038135436


In [41]:
top10_index = list()
for i in top10_coef:
    for j in range(clf.coef_.shape[1]):
        if clf.coef_[0,j] == i:
            top10_index.append(j); print(j)

10531
7820
3784
9212
3108
10678
9216
10677
10674
3788


In [42]:
for item in top10_index:
    print(feature_names[item])

suppression
overexpression
expressed
regulated
downregulation
targets
regulation
targeting
target
expression


In [None]:
clf.support_vectors_.data

In [None]:
[clf.support_vectors_.indices]

In [None]:
len(clf.support_vectors_.indices)

In [None]:
clf.n_support_[1]

In [None]:
sv_index_0 = clf.support_vectors_.indices[0:clf.n_support_[0]]; print(sv_index_0)
sv_index_1 = clf.support_vectors_.indices[clf.n_support_[0]:len(clf.support_)]; print(sv_index_1)

In [None]:
sv_index_0 = clf.support_vectors_.indices[0]; print(sv_index_0)
sv_index_1 = clf.support_vectors_.indices[4544]; print(sv_index_1)

In [None]:
len(clf.support_); clf.n_support_[0]

COUNT NUMBER OF WORDS, BY COUNTING NUMBER OF ITEMS (i.e. indexes) IN clf.support_vectors_.indices, for each class.
use mumpy.histogram, use list.count. Then use feature_names[top 5 highest index]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

Compute histogram for `clf.support_vectors_.indices`

In [None]:
hist = np.histogram(clf.support_vectors_.indices) # need to do for hist_neg, hist_pos
#hist

In [None]:
%matplotlib inline
num_bins = tfidf_vectors.shape[1]
plt.hist(clf.support_vectors_.indices, bins=num_bins); # This takes awhile to plot, as tfidf_vectors.shape[1] is 11787 

Separate the support vectors from class 0 and class 1, so that we can look into the top 10 support vectors for each class.

In [None]:
C0_support_ = clf.support_[0:clf.n_support_[0]]; print(C0_support_)
start_i = clf.n_support_[0]; end_i = clf.n_support_[0] + clf.n_support_[1]; print(start_i); print(end_i)
C1_support_ = clf.support_[start_i:end_i]; print(C1_support_)

`C0_support_` contains the support vectors' indices for `class 0`. 

`C1_support_` contains the support vectors' indices for `class 1`. 

In [None]:
a=clf.support_vectors_[C0_support[0]].indices; print(a)
for i in C0_support[1:]:
    b = clf.support_vectors_[i].indices;
    a = np.append(a,b)    
    #print(a)