# SVM classification

Perform document classification on the scikit-learn 20 newsgroups dataset.

In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

%pylab inline
pylab.rcParams['figure.figsize'] = (15, 6)

# Do not use normal form (scietific notation) when printing numbers, exponents can make it harder to compare values
pd.set_option('float_format', '{:f}'.format)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Record the library versions this notebook was run with
'numpy: {}, pandas: {}, sklearn: {}'.format(
    *(lib.__version__ for lib in (np, pd, sklearn))
)

'numpy: 1.14.3, pandas: 0.23.0, sklearn: 0.19.1'

## Import

In [3]:
# Load the training split of the 20 newsgroups corpus. random_state=42 is the
# library default, written out so the shuffled order is visibly fixed.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

## Explore

In [4]:
# List the fields available on the loaded dataset object
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])

In [10]:
# Print the dataset's short description string
print(twenty_train.description)

the 20 newsgroups by date dataset


View sample document

In [5]:
# Print the raw text of the first training document (message headers included)
print(twenty_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







View the newsgroup names; these are the text representations of the target values.

In [6]:
# Newsgroup names, i.e. the text labels corresponding to the numeric target values
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

View newsgroup target values in numeric form

In [7]:
# Numeric class label of each training document
twenty_train.target

array([7, 4, 4, ..., 3, 1, 8])

## Prepare

## Train

In [32]:
# Bag-of-words step: build the vocabulary and count each word's occurrences per document
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# TF-IDF step: requires the bag-of-words counts as its input and
# yields one score for every (doc_id, word_id) pair
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# CountVectorizer followed by TfidfTransformer is equivalent to TfidfVectorizer.
# TfidfVectorizer works directly on the documents and produces a bag of words
# with the corresponding TF-IDF scores.

# When the vocabulary is very large, HashingVectorizer can be used instead of CountVectorizer.

Use a linear SVM estimator to predict the target newsgroup. Training stops once the solver's stopping criterion is met to within the tolerance `tol`.

In [23]:
# Linear SVM with an L2 penalty fitted on the TF-IDF features.
# dual=False selects the primal formulation of the optimisation problem;
# tol is the tolerance used by the solver's stopping criterion.
# LinearSVC is imported in the notebook's top import cell.
clf_svc = LinearSVC(penalty="l2", dual=False, tol=1e-3)
clf_svc.fit(X_train_tfidf, twenty_train.target)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

Chain the transformations and the classifier into a single pipeline

In [24]:
# Bundle counting, TF-IDF weighting and the linear SVM into one estimator so that
# raw documents can be passed straight to fit() and predict().
# Pipeline and LinearSVC are imported in the notebook's top import cell.
clf_svc_pipeline = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", LinearSVC(penalty="l2", dual=False, tol=1e-3)),
])

In [25]:
# Fit every pipeline step on the raw training documents and their labels
clf_svc_pipeline.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0))])

Get the test data

In [26]:
# Load the held-out test split. random_state=42 is the library default,
# written out so the shuffled order is visibly fixed.
twenty_test = fetch_20newsgroups(
    subset="test",
    shuffle=True,
    random_state=42,
)

Pass the test data through the transformation pipeline

In [28]:
# Predict a newsgroup label for each raw test document using the fitted pipeline
predicted = clf_svc_pipeline.predict(twenty_test.data)

## Evaluate

In [11]:
# Shapes of the count matrix and the TF-IDF matrix, in that order
tuple(matrix.shape for matrix in (X_train_counts, X_train_tfidf))

((11314, 130107), (11314, 130107))

View the Tfidf scores for one document

In [19]:
# Print the stored TF-IDF entries of the first training document
print(X_train_tfidf[0])

  (0, 56979)	0.0574701540748513
  (0, 75358)	0.3538350134970617
  (0, 123162)	0.25970902457356887
  (0, 118280)	0.21186807208281694
  (0, 50527)	0.05461428658858725
  (0, 124031)	0.10798795154169123
  (0, 85354)	0.03696978508816317
  (0, 114688)	0.06214070986309587
  (0, 111322)	0.019156718024950434
  (0, 123984)	0.036854292634593756
  (0, 37780)	0.3813389125949312
  (0, 68532)	0.07325812342131598
  (0, 114731)	0.1444727551278406
  (0, 87620)	0.0356718631408158
  (0, 95162)	0.03447138409326312
  (0, 64095)	0.035420924271313554
  (0, 98949)	0.16068606055394935
  (0, 90379)	0.01992885995664587
  (0, 118983)	0.03708597805061915
  (0, 89362)	0.06521174306303765
  (0, 79666)	0.10936401252414275
  (0, 40998)	0.07801368196918111
  (0, 92081)	0.09913274493911224
  (0, 76032)	0.01921946305222309
  (0, 4605)	0.06332603952480324
  :	:
  (0, 37565)	0.03431760442478462
  (0, 113986)	0.17691750674853085
  (0, 83256)	0.08844382496462175
  (0, 86001)	0.07000411445838192
  (0, 51730)	0.0971474405797672

Each entry is shown as (document_id, word_id) followed by that word's frequency. Get the word frequencies for the first document.

In [10]:
# Print the stored word counts of the first training document
print(X_train_counts[0])

  (0, 86580)	1
  (0, 128420)	1
  (0, 35983)	1
  (0, 35187)	1
  (0, 66098)	1
  (0, 114428)	1
  (0, 78955)	1
  (0, 94362)	1
  (0, 76722)	1
  (0, 57308)	1
  (0, 62221)	1
  (0, 128402)	2
  (0, 67156)	1
  (0, 123989)	1
  (0, 90252)	1
  (0, 63363)	1
  (0, 78784)	1
  (0, 96144)	1
  (0, 128026)	1
  (0, 109271)	1
  (0, 51730)	1
  (0, 86001)	1
  (0, 83256)	1
  (0, 113986)	1
  (0, 37565)	1
  :	:
  (0, 4605)	1
  (0, 76032)	1
  (0, 92081)	1
  (0, 40998)	1
  (0, 79666)	1
  (0, 89362)	3
  (0, 118983)	1
  (0, 90379)	1
  (0, 98949)	1
  (0, 64095)	1
  (0, 95162)	1
  (0, 87620)	1
  (0, 114731)	5
  (0, 68532)	3
  (0, 37780)	5
  (0, 123984)	1
  (0, 111322)	1
  (0, 114688)	1
  (0, 85354)	1
  (0, 124031)	2
  (0, 50527)	2
  (0, 118280)	2
  (0, 123162)	2
  (0, 75358)	2
  (0, 56979)	3


Get the prediction accuracy: the fraction of predicted labels that equal the actual labels.

In [30]:
# Accuracy of the pipeline's predictions against the true test labels.
# accuracy_score is imported in the notebook's top import cell.
acc_svm = accuracy_score(twenty_test.target, predicted)
acc_svm

0.8534253850238981