# Text Categorizer

This is a simple text categorizer model that takes a piece of text as input and classifies it into one of the predefined categories.

In [1]:
# import necessary packages
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from nltk.stem.snowball import SnowballStemmer
import pickle
import numpy as np

Load the 20 newsgroups dataset, which contains documents from 20 different categories.

In [2]:
# Fetch the training split of the 20 newsgroups corpus (shuffled on load).
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)
# Show every category name available in the dataset.
print("Categories are ...", twenty_train.target_names)

Categories are ... ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Create a stemmer

In [3]:
# English Snowball stemmer shared by the vectorizer below.
# ignore_stopwords=True asks NLTK to leave stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent class's analyzer is built first; each token it yields is
        then passed through the module-level ``stemmer``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer


# Vectorizer instance used as the first pipeline step; English stop words are dropped.
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

Train the model using SGDClassifier, tuning hyper-parameters with a grid search

In [8]:
# Pipeline: stemmed token counts -> TF-IDF weighting -> linear classifier trained with SGD
# (hinge loss + L2 penalty).
# `n_iter` was removed from SGDClassifier in scikit-learn 0.21 and now raises a TypeError;
# `max_iter=5` together with `tol=None` keeps the original behaviour of exactly five
# passes over the training data (no early stopping).
text_clf = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])

# Hyper-parameter grid; keys follow the `<step name>__<parameter>` convention of Pipeline.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without inverse-document-frequency
    'clf__alpha': (1e-2, 1e-3),              # regularization strength
}

# Exhaustive search over the grid using all available CPU cores (n_jobs=-1).
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

Test the model with the test data

In [9]:
# Score the tuned pipeline on the held-out test split.
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
)
predicted_svm = gs_clf.predict(twenty_test.data)
# Accuracy: fraction of test documents whose predicted label matches the true label.
print((predicted_svm == twenty_test.target).mean())

0.829660116835


Writing the model into the pickle file

In [10]:
# Serialize the fitted grid-search object to a pickle file.
# The `with` block guarantees the file is flushed and closed even if pickling fails.
model_pickle_path = 'model_pickle.pkl'
with open(model_pickle_path, 'wb') as model_pickle:
    pickle.dump(gs_clf, model_pickle)

Loading the pickle file for prediction

In [11]:
# Load the pickled model back from disk into `model_clf`.
# NOTE: pickle.load can execute arbitrary code -- only load files you created or trust.
# StemmedCountVectorizer and `stemmer` must already be defined in the session: pickle
# stores the class by reference, and its analyzer looks up `stemmer` at prediction time.
model_pickle_path = 'model_pickle.pkl'
with open(model_pickle_path, 'rb') as model_unpickle:
    model_clf = pickle.load(model_unpickle)

Predict the category of a sample sentence using the reloaded model

In [26]:
# Predict the category of a sample sentence with the model loaded from the pickle file.
# The raw text is passed straight to `model_clf`: the loaded pipeline carries its own
# fitted vectorizer, so the text must not be vectorized (or a vectorizer re-fitted) here.
# The previous version also opened a file 'test_txt_clf' that was never read or closed;
# that unused handle has been dropped.
test_txt = ["We cant mix religion with science"]
print(test_txt)

predicted = model_clf.predict(test_txt)
print(predicted)

# Translate each predicted numeric label into its newsgroup name.
for label_idx in predicted:
    print(twenty_train.target_names[label_idx])

['We cant mix religion with science']
[0]
alt.atheism
