In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
from sklearn.naive_bayes import MultinomialNB

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.linear_model import SGDClassifier

In [7]:
from sklearn.model_selection import GridSearchCV

In [32]:
import nltk

In [33]:
from nltk.stem.snowball import SnowballStemmer

In [8]:
import numpy as np

### Loading Training Data

In [9]:
# Fetch the training split of the 20 Newsgroups corpus, shuffled.
twenty_train = fetch_20newsgroups(
    subset="train",
    shuffle=True,
)

In [10]:
# Names of the newsgroup categories (the classification targets)
print(twenty_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [11]:
# First three lines of the first training document
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


### Vectorization - Bag of Words

In [12]:
# Bag of words: fit the vectorizer on the training documents and transform
# them into a count matrix in a single call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Displayed as the cell output: the shape of the resulting count matrix.
X_train_counts.shape

(11314, 130107)

In [13]:
# Re-weight the raw counts with TF-IDF; fitted on the training counts only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Displayed as the cell output: the shape of the TF-IDF matrix.
X_train_tfidf.shape

(11314, 130107)

### Multinomial Naive Bayes Classifier

In [14]:
# Train a multinomial Naive Bayes model on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target);

In [15]:
# The same vectorize -> TF-IDF -> Naive Bayes steps, expressed as a single scikit-learn Pipeline

In [16]:
# Count vectorizer -> TF-IDF -> multinomial Naive Bayes, fitted end-to-end on
# the raw training documents.
text_clf = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

In [17]:
# Load the test split, predict with the Naive Bayes pipeline, and report the
# fraction of test documents whose predicted label matches the true one.
twenty_test = fetch_20newsgroups(
    subset="test",
    shuffle=True,
)
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.7738980350504514

### Linear SVM (SGDClassifier with hinge loss)

In [18]:
# Same feature extraction as the Naive Bayes pipeline, with an SGD-trained
# classifier (hinge loss, L2 penalty) as the final step. The seed is fixed so
# repeated runs give the same model.
text_clf_svm = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf-svm", SGDClassifier(
        loss="hinge",
        penalty="l2",
        alpha=1e-3,
        max_iter=10,
        random_state=42,
    )),
])

In [19]:
# Train the SVM pipeline on the training split; the result is bound to `_`
# so the cell displays nothing.
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

In [20]:
# Accuracy of the SVM pipeline on the test split.
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()

0.8240839086563994

### Grid Search

In [21]:
# Search grid for the Naive Bayes pipeline. Keys are written as
# "<pipeline step name>__<parameter name>"; 2 x 2 x 2 = 8 combinations.
parameters = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf__alpha": (1e-2, 1e-3),
}

In [22]:
# Grid search over the Naive Bayes pipeline, run on the training split only.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)

In [23]:
# Best score found by the grid search. It was computed on the training split
# (the search was fitted on `twenty_train`), so it is not directly comparable
# to the test-set accuracies reported for the individual pipelines.
gs_clf.best_score_

0.9157684864695698

In [24]:
# Parameter combination from `parameters` that achieved the best score.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [25]:
# Search grid for the SVM pipeline; identical to the Naive Bayes grid except
# that alpha is addressed through the "clf-svm" step.
parameters_svm = {
    "vect__ngram_range": [(1, 1), (1, 2)],
    "tfidf__use_idf": (True, False),
    "clf-svm__alpha": (1e-2, 1e-3),
}

In [26]:
# Grid search over the SVM pipeline, run on the training split only.
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)

In [28]:
# Best score of the SVM grid search, then the parameters that produced it,
# one per line.
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.9051618841994754
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


## Training With Cleaned Data

In [29]:
# Removing Stop Words
# Naive Bayes pipeline whose vectorizer removes English stop words.
# Note: this rebinds `text_clf`, the name previously used for the pipeline
# without stop-word removal.
text_clf = Pipeline(steps=[
    ("vect", CountVectorizer(stop_words="english")),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
])

In [30]:
# Train the stop-word-filtered pipeline on the training split.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [31]:
# Accuracy of the stop-word-filtered Naive Bayes pipeline on the test split.
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.8169144981412639

In [34]:
# Stemming
# With ignore_stopwords=True the Snowball stemmer reads NLTK's "stopwords"
# corpus, which is a separate download. Fetch it once if it is missing so this
# cell also works on a fresh environment (Restart Kernel -> Run All).
try:
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
except LookupError:
    nltk.download("stopwords", quiet=True)
    stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token its analyzer produces.

    The parent class's analyzer is run unchanged; each item it yields is then
    passed through the module-level ``stemmer``.

    NOTE(review): each analyzer item is stemmed as one string, so with an
    ``ngram_range`` beyond unigrams a multi-word n-gram would presumably reach
    the stemmer as a single token -- confirm before combining the two.
    """

    def build_analyzer(self):
        """Return a callable that maps one raw document to a list of stemmed tokens."""
        analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Stem after the parent analyzer has finished its own processing.
            return [stemmer.stem(token) for token in analyzer(doc)]

        return stemmed_analyzer


In [35]:
# Multinomial Naive Bayes on stemmed, stop-word-filtered counts.
stemmed_count_vect = StemmedCountVectorizer(stop_words="english")
text_mnb_stemmed = Pipeline(steps=[
    ("vect", stemmed_count_vect),
    ("tfidf", TfidfTransformer()),
    ("mnb", MultinomialNB(fit_prior=False)),
])

In [36]:
# Train the stemmed pipeline on the training split, then label the test split.
predicted_mnb_stemmed = (
    text_mnb_stemmed
    .fit(twenty_train.data, twenty_train.target)
    .predict(twenty_test.data)
)

In [37]:
# Accuracy of the stemmed Naive Bayes pipeline on the test split.
(predicted_mnb_stemmed == twenty_test.target).mean()

0.8167817312798725