# Supervised ML Pipeline for text classification using scikit-learn

In [2]:
# Scikit Learn Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Classifier
from sklearn.linear_model import SGDClassifier #SVM Classifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups #Dataset

# NLTK Packages 
import nltk
from nltk.stem.snowball import SnowballStemmer

# Computational Packages
import numpy as np
import pandas as pd

# Visualisation Packages
import matplotlib.pyplot as plt

In [8]:
# Load the training split of the 20 Newsgroups corpus (shuffled)
newsgroups_train = fetch_20newsgroups(shuffle=True, subset='train')
# Sanity check: list the category names of the loaded dataset
print(newsgroups_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# Preprocessing Stages

In [12]:
# Learn the vocabulary from the training documents and turn them into a
# document-term matrix of token counts.
count_vec = CountVectorizer()
train_cv_x = count_vec.fit_transform(newsgroups_train.data)
print("Feature Shape  = %s" % (train_cv_x.shape,))

Feature Shape  = (11314, 130107)


In [15]:
# Re-weight the count matrix with TF-IDF.
tf_idf_transformer = TfidfTransformer()
train_tfidf_x = tf_idf_transformer.fit_transform(train_cv_x)
print("Shape of the TF-IDF Matrix = %s" % (train_tfidf_x.shape,))

Shape of the TF-IDF Matrix = (11314, 130107)


In [16]:
# Stemming Code

# SnowballStemmer(ignore_stopwords=True) reads NLTK's "stopwords" corpus when it
# is constructed and raises LookupError if the corpus is not installed, so make
# sure it is available first (nothing is downloaded when it is already present).
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)

# English Snowball stemmer; with ignore_stopwords=True, stop words are returned
# unchanged instead of being stemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Stemming is delegated to the module-level ``stemmer`` object.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # Run the inherited analyzer, then stem each token it yields.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# Stemming vectorizer configured with scikit-learn's built-in English stop-word list.
stemmed_count_vec = StemmedCountVectorizer(stop_words="english")

In [19]:
# Train a multinomial Naive Bayes model on the TF-IDF features of the training set
clf = MultinomialNB()
clf = clf.fit(train_tfidf_x, newsgroups_train.target)

In [None]:
# Chain vectorizing, TF-IDF weighting and classification into a single estimator
doc_classifier = Pipeline(
    steps=[
        ('vect', stemmed_count_vec),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB(fit_prior=False)),
    ]
)

# Fit the whole pipeline on the raw training documents and their labels
doc_classifier = doc_classifier.fit(
    newsgroups_train.data,
    newsgroups_train.target,
)