In [1]:
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [3]:
tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)

In [4]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [5]:
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_tfidf = tfidf_Vect.transform(twenty_test.data)

In [6]:
predicted = clf.predict(X_test_tfidf)

score = metrics.accuracy_score(twenty_test.target, predicted)
print(score)

0.7738980350504514


The accuracy we get is ~77.4% - not bad..

# Task a
### Use SVM

In [12]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42))])

In [13]:
text_clf_svm.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf-svm',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                  

In [14]:
predicted_svm = text_clf_svm.predict(twenty_test.data)

In [15]:
import numpy as np
np.mean(predicted_svm == twenty_test.target)

0.8240839086563994

## Observation
#### The accuracy has increased from 77.4% to ~82.4% using SVM! A significant jump of 5%

# Task b
### Use bigrams

In [16]:
tfidf_Vect2 = TfidfVectorizer(ngram_range=(1, 2))

X_train_tfidf2 = tfidf_Vect2.fit_transform(twenty_train.data)

In [17]:
clf2 = MultinomialNB()
clf2.fit(X_train_tfidf2, twenty_train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
X_test_tfidf2 = tfidf_Vect2.transform(twenty_test.data)
predicted2 = clf2.predict(X_test_tfidf2)
score2 = metrics.accuracy_score(twenty_test.target, predicted2)
score2

0.765400955921402

## Observation
#### The accuracy decreased a bit, from 77.4% to 76.5%

# Task c
### Use english stop words

In [23]:
tfidf_Vect3 = TfidfVectorizer(stop_words='english')
X_train_tfidf3 = tfidf_Vect3.fit_transform(twenty_train.data)

clf3 = MultinomialNB()
clf3.fit(X_train_tfidf3, twenty_train.target)

X_test_tfidf3 = tfidf_Vect3.transform(twenty_test.data)

predicted3 = clf3.predict(X_test_tfidf3)

score3 = metrics.accuracy_score(twenty_test.target, predicted3)
score3

0.8169144981412639

## Observation
### Wow! That gave a boost of ~4% from 77.4% to 81.7%

# Go the extra mile
### 'english' stopwords + SVM

In [31]:
from sklearn import linear_model

text_clf_svm2 = linear_model.SGDClassifier()
text_clf_svm2.fit(X_train_tfidf3, twenty_train.target)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [32]:
predicted4 = text_clf_svm2.predict(X_test_tfidf3)

score4 = metrics.accuracy_score(twenty_test.target, predicted4)
score4

0.8507700477960701

## Cool!! We've got 85% accuracy.