# SVM Demo

In [26]:
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
clf.predict([[2., 2.]])

array([1])

In [15]:
# get support vectors
print(clf.support_vectors_)
# get indices of support vectors
print('-'*100)
print(clf.support_)

[[ 0.  0.]
 [ 1.  1.]]
----------------------------------------------------------------------------------------------------
[0 1]


# SVM on Text Data

### Download Data

In [28]:
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import numpy as np

categories = ['comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware'
             ,'rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey']

train=fetch_20newsgroups(subset='train', categories= categories,shuffle=True, random_state=42)
test=fetch_20newsgroups(subset='test', categories= categories,shuffle=True, random_state=42)

print("No of Docs in Training:", len(train.data))
print("No of Docs in Testing:", len(test.data))

No of Docs in Training: 4732
No of Docs in Testing: 3150


### Document Term Matrix

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Stop Words
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

count_vect_class = CountVectorizer(stop_words='english',min_df=1)
X_dtf = count_vect_class.fit_transform(train.data)
X_dtf.shape 

X_dtf_test = count_vect_class.transform(test.data)
X_dtf_test.shape 


print ('Number of Documents in train: ',X_dtf.shape[0])
print ('Number of Features/terms in train: ',X_dtf.shape[1])
print("-"*100)

print ('Number of Documents in test: ',X_dtf_test.shape[0])
print ('Number of Features/terms in test: ',X_dtf_test.shape[1])

Number of Documents in train:  4732
Number of Features/terms in train:  78910
----------------------------------------------------------------------------------------------------
Number of Documents in test:  3150
Number of Features/terms in test:  78910


### TFIDF

$$TF\times IDF(t,d)=tf(t,d)\times idf(t)$$

$$idf(t)=\log(\frac{n}{df(t)})+1$$

- $tf(t, d)$: term frequency of term $t$ in the document $d$.


- $idf(t)$: inverse document frequency of term $t$ across the document dataset.
 - $df(t)$: # of documents that contain the term $t$.
 - Intuition: words that appear in all documents are useless in classificaiton.

In [51]:
# TFIDF
from sklearn.feature_extraction.text import TfidfTransformer

#Smooth idf, else it adds 1 to denominator and numinator
tfidf_transformer = TfidfTransformer(smooth_idf=False)

X_tfidf = tfidf_transformer.fit_transform(X_dtf)

X_tfidf_test = tfidf_transformer.transform(X_dtf_test)

print ('Shape of the Matrix in train',X_tfidf.shape)
print ('-' * 100)
print('Sample Content in train')
print (X_tfidf.toarray()[:5,:5])

print ('-' * 100)

print ('Shape of the Matrix in test',X_tfidf_test.shape)
print ('-' * 100)
print('Sample Content in test')
print (X_tfidf_test.toarray()[:5,:5])



Shape of the Matrix in train (4732, 78910)
----------------------------------------------------------------------------------------------------
Sample Content in train
[[ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]
 [ 0.04174827  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.        ]]
----------------------------------------------------------------------------------------------------
Shape of the Matrix in test (3150, 78910)
----------------------------------------------------------------------------------------------------
Sample Content in test
[[ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.]]


### Dimensionality Reduction NMF
- We don't need very large number of features (78910): Making it to 50 features

In [52]:
from sklearn.decomposition import NMF
model = NMF(n_components=50, init='random', random_state=0)
X_NMF = model.fit_transform(X_tfidf)
print ('Shape of the Matrix in train',X_NMF.shape)
print ('-' * 100)

X_NMF_test = model.fit_transform(X_tfidf_test)
print ('Shape of the Matrix in train',X_NMF_test.shape)
print ('-' * 100)


Shape of the Matrix in train (4732, 50)
----------------------------------------------------------------------------------------------------
Shape of the Matrix in train (3150, 50)
----------------------------------------------------------------------------------------------------


In [53]:
print('Sample Content in train')
print (X_NMF[:5,:5])
print ('-' * 100)
print('Sample Content in test')
print (X_NMF_test[:5,:5])

Sample Content in train
[[ 0.03218267  0.          0.          0.          0.        ]
 [ 0.00677435  0.          0.          0.          0.00148069]
 [ 0.          0.00029637  0.          0.          0.        ]
 [ 0.00670832  0.          0.00044467  0.00237779  0.        ]
 [ 0.          0.          0.          0.          0.        ]]
----------------------------------------------------------------------------------------------------
Sample Content in test
[[ 0.0014235   0.          0.00185034  0.          0.        ]
 [ 0.          0.          0.          0.          0.00337327]
 [ 0.00311383  0.00349846  0.          0.          0.        ]
 [ 0.          0.02869822  0.00295032  0.00126304  0.01520629]
 [ 0.00121601  0.00108368  0.          0.00053357  0.00742699]]


In [96]:
from sklearn.linear_model import SGDClassifier
# X = X_NMF
# Y = Target labels = train.target
print("Shape of input X labels",X_NMF.shape)
print("Shape of input Y labels",train.target.shape)
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
#clf = SGDClassifier(max_iter=7)
clf.fit(X_tfidf, train.target)

Shape of input X labels (4732, 50)
Shape of input Y labels (4732,)


SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

### Testing on the Test Data

In [98]:
predicted = clf.predict(X_tfidf_test) 
print("Accuracy of SVM is:")
print(np.mean(predicted == test.target))   

Accuracy of SVM is:
0.893333333333


In [99]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf.fit(train.data, train.target) 
predicted = text_clf.predict(test.data)

print("Accuracy of Simple MultinomialNB is")
np.mean(predicted == test.target)

Accuracy of Simple MultinomialNB is


0.87619047619047619