# Supervised ML Pipeline for text classification using scikit-learn

In [52]:
# Scikit Learn Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Classifier
from sklearn.linear_model import SGDClassifier #SVM Classifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups #Dataset

# NLTK Packages 
import nltk
from nltk.stem.snowball import SnowballStemmer

# Computational Packages
import numpy as np
import pandas as pd

# Visualisation Packages
import matplotlib.pyplot as plt

In [60]:
# Load the two predefined splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True)

# Print the category names of each split so the two lists can be compared by eye.
for _header, _split in (("\n -- Train -- \n", newsgroups_train),
                        ("\n -- Test -- \n", newsgroups_test)):
    print(_header)
    print(_split.target_names)


 -- Train -- 

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

 -- Test -- 

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# Preprocessing Stages

In [54]:
# Bag-of-words step: learn the vocabulary from the training documents and
# encode each document as a row of raw term counts.
count_vec = CountVectorizer()
train_cv_x = count_vec.fit_transform(newsgroups_train.data)

# Report the (rows, columns) dimensions of the resulting count matrix.
print("Feature Shape  = %s" % (train_cv_x.shape,))

Feature Shape  = (11314, 130107)


In [55]:
# Re-weight the raw term counts with TF-IDF; the transformer is fitted on the
# training count matrix and applied to it in the same call.
tf_idf_transformer = TfidfTransformer()
train_tfidf_x = tf_idf_transformer.fit_transform(train_cv_x)

# Report the (rows, columns) dimensions of the TF-IDF matrix.
print("Shape of the TF-IDF Matrix = %s" % (train_tfidf_x.shape,))

Shape of the TF-IDF Matrix = (11314, 130107)


In [56]:
# Stemming Code

# SnowballStemmer(..., ignore_stopwords=True) loads NLTK's "stopwords" corpus,
# so make sure the corpus is installed before building the stemmer. Without
# this guard the cell fails with LookupError on a fresh environment.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Shared English Snowball stemmer used by StemmedCountVectorizer below.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant that stems every token produced by its analyzer.

    The parent analyzer runs first; each token it yields is then passed
    through the module-level ``stemmer``.
    """

    def build_analyzer(self):
        """Return a callable that maps one raw document to a list of stemmed tokens."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # Stem each token emitted by the parent analyzer, preserving order.
            return [stemmer.stem(token) for token in analyzer(doc)]

        return stemmed_analyzer


# Vectorizer instance used as the first step of the classification pipelines.
stemmed_count_vec = StemmedCountVectorizer(stop_words='english')

In [57]:
# Fit a standalone Multinomial Naive Bayes model on the TF-IDF training matrix,
# using the newsgroup indices as labels.
clf = MultinomialNB().fit(
    X=train_tfidf_x,
    y=newsgroups_train.target,
)

In [63]:
# Naive Bayes pipeline: stemmed bag-of-words -> TF-IDF weighting -> MultinomialNB.
# fit_prior=False makes the classifier use a uniform class prior.
doc_classifier_nb = Pipeline(steps=[
    ('vectorizer', stemmed_count_vec),
    ('tfidf', TfidfTransformer()),
    ('nb_classifier', MultinomialNB(fit_prior=False)),
])

# Train every stage of the pipeline on the raw training documents.
doc_classifier_nb = doc_classifier_nb.fit(
    X=newsgroups_train.data,
    y=newsgroups_train.target,
)

In [64]:
# Label the held-out test documents with the fitted Naive Bayes pipeline.
predicted_nb_stem = doc_classifier_nb.predict(newsgroups_test.data)

# Show how many predictions were produced.
print(len(predicted_nb_stem))

7532


In [65]:
# Evaluating the performance of the NB classifier:
# accuracy = fraction of test documents whose predicted label equals the true label.
(predicted_nb_stem == newsgroups_test.target).mean()

0.81678173127987252

# Training with SVM and Evaluating its performance

In [66]:
# Linear SVM pipeline: stemmed bag-of-words -> TF-IDF weighting -> SGDClassifier
# with hinge loss and an L2 penalty.
#
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, where passing
# it raises TypeError. `max_iter=5` with `tol=None` disables early stopping, so
# training still runs exactly five passes over the data.
#
# NOTE: `stemmed_count_vec` is the same vectorizer instance used by the Naive
# Bayes pipeline; fitting this pipeline refits it on the same training documents.
doc_classifier_svm = Pipeline([
    ('vectorizer', stemmed_count_vec),
    ('tfidf', TfidfTransformer()),
    ('svm_classifier', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                     max_iter=5, tol=None, random_state=42))
])

# Train every stage of the pipeline on the raw training documents.
doc_classifier_svm = doc_classifier_svm.fit(newsgroups_train.data, newsgroups_train.target)



In [67]:
# Label the held-out test documents with the fitted SVM pipeline.
predicted_svm_stem = doc_classifier_svm.predict(newsgroups_test.data)

# Show how many predictions were produced.
print(len(predicted_svm_stem))

7532


In [68]:
# Evaluating the performance of the SVM classifier:
# accuracy = fraction of test documents whose predicted label equals the true label.
print((predicted_svm_stem == newsgroups_test.target).mean())

0.819437068508


## Conclusion
- The SVM classifier performed slightly better than the Naive Bayes classifier on the test set (accuracy of about 0.819 vs. 0.817).
- Performance can probably be improved further by cleaning the text more extensively before feeding it into the pipeline. This will be the theme of subsequent notebooks.