In [1]:
# fetching data from sklearn directory
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [54]:
print(type(twenty_train.data))

<class 'list'>


In [2]:
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
#count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
print('Count vectorizer: ')
print(X_train_counts)

Count vectorizer: 
  (0, 86580)	1
  (0, 128420)	1
  (0, 35983)	1
  (0, 35187)	1
  (0, 66098)	1
  (0, 114428)	1
  (0, 78955)	1
  (0, 94362)	1
  (0, 76722)	1
  (0, 57308)	1
  (0, 62221)	1
  (0, 128402)	2
  (0, 67156)	1
  (0, 123989)	1
  (0, 90252)	1
  (0, 63363)	1
  (0, 78784)	1
  (0, 96144)	1
  (0, 128026)	1
  (0, 109271)	1
  (0, 51730)	1
  (0, 86001)	1
  (0, 83256)	1
  (0, 113986)	1
  (0, 37565)	1
  :	:
  (11313, 87626)	1
  (11313, 30044)	1
  (11313, 76377)	1
  (11313, 119714)	1
  (11313, 47982)	1
  (11313, 28146)	2
  (11313, 88363)	2
  (11313, 56283)	1
  (11313, 111695)	1
  (11313, 90252)	1
  (11313, 51730)	1
  (11313, 68766)	1
  (11313, 89860)	1
  (11313, 80638)	1
  (11313, 4605)	1
  (11313, 76032)	1
  (11313, 89362)	1
  (11313, 90379)	1
  (11313, 64095)	1
  (11313, 95162)	1
  (11313, 87620)	1
  (11313, 111322)	1
  (11313, 85354)	1
  (11313, 50527)	2
  (11313, 56979)	2


In [4]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
print('TfIdf :')
print(X_train_tfidf)

TfIdf :
  (0, 128420)	0.04278499079283093
  (0, 128402)	0.05922294083277842
  (0, 128026)	0.060622095889758885
  (0, 124931)	0.08882569909852546
  (0, 124031)	0.10798795154169122
  (0, 123989)	0.08207027465330353
  (0, 123984)	0.036854292634593756
  (0, 123796)	0.049437556160455476
  (0, 123292)	0.14534718515938805
  (0, 123162)	0.2597090245735688
  (0, 118983)	0.037085978050619146
  (0, 118280)	0.2118680720828169
  (0, 115475)	0.042472629883573
  (0, 114731)	0.14447275512784058
  (0, 114688)	0.06214070986309586
  (0, 114579)	0.03671830826216751
  (0, 114455)	0.12287762616208957
  (0, 114428)	0.05511105154696676
  (0, 113986)	0.17691750674853082
  (0, 111322)	0.01915671802495043
  (0, 109581)	0.10809248404447917
  (0, 109271)	0.10844724822064673
  (0, 108252)	0.07526015712540636
  (0, 106116)	0.09869734624201922
  (0, 104813)	0.08462829788929047
  :	:
  (11313, 62696)	0.06213004660468942
  (11313, 60910)	0.34638730155641734
  (11313, 60803)	0.07995422310508192
  (11313, 56979)	0.039703

In [5]:
# training models 
# Naive Bayes (NB)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [7]:
# Building a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [8]:
#predicting on test data
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
print('Accuracy using NB')
print(np.mean(predicted == twenty_test.target))

Accuracy using NB
0.7738980350504514


In [9]:
# Stemming Code
import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [10]:
# applying stemming and removing stop words
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

In [11]:
#predicting on the test data
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
print("Accuracy after using NLTK")
print(np.mean(predicted_mnb_stemmed == twenty_test.target))

Accuracy after using NLTK
0.8167817312798725


In [29]:
# using the classifier model on real world daily news
import os
data1='/home/ikscare/Documents/Projects/Mousam/DC_using_OCR/newsData'
newslist=os.listdir(data1)
predi=text_mnb_stemmed.predict(newslist)
print(predi)

[ 9 15  6 15 15 11 15 15 15 15]


In [52]:
# showing formatted output
i=0
for pred in predi:
    print('Document {0:s} is of category %d'.format(newslist[i]) %pred)
    i+=1

Document 04.txt is of category 9
Document 01.txt is of category 15
Document 10.txt is of category 6
Document 07.txt is of category 15
Document 03.txt is of category 15
Document 06.txt is of category 11
Document 08.txt is of category 15
Document 05.txt is of category 15
Document 02.txt is of category 15
Document 09.txt is of category 15
