# Text Classification

### Step 1: Loading the dataset (20 Newsgroups)

In [2]:
# Load the training subset of the 20 Newsgroups dataset, shuffled.
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [24]:
# This is what the raw text of one document in the dataset looks like
# (here: the first training document).
twenty_train.data[0]

u"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [3]:
# Check the 20 category names in the dataset.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Step 2: Extracting features from text files

In [4]:
# 2.1 CountVectorizer builds the bag-of-words feature vectors and removes
#     English stop words.
from sklearn.feature_extraction.text import CountVectorizer
# 2.2 TfidfTransformer turns the document-term counts into TF-IDF values.
from sklearn.feature_extraction.text import TfidfTransformer
# Pipeline chains the steps above with a classifier to simplify the code.
from sklearn.pipeline import Pipeline
# Classifiers compared in this notebook.
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Naive Bayes pipeline: counts -> TF-IDF -> multinomial NB.
text_nb_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Linear SVM pipeline: counts -> TF-IDF -> SGD with hinge loss.
# `max_iter=5, tol=None` runs exactly five passes over the training data.
# It replaces the former `n_iter=5` argument, which is no longer accepted
# by current scikit-learn releases.
text_svm_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

### Step 3: Training classifiers

In [5]:
# Naive Bayes classifier:
# fit the whole pipeline on the training documents and their labels.
text_nb_clf = text_nb_clf.fit(twenty_train.data, twenty_train.target)

In [6]:
# SVM classifier:
# fit the whole pipeline on the training documents and their labels.
text_svm_clf = text_svm_clf.fit(twenty_train.data, twenty_train.target)

### Step 4: Testing classifiers

In [7]:
import numpy as np

# Fetch the test subset of the dataset.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

# Naive Bayes: predict a label for every test document, then express the
# fraction of predictions that match the true labels as a percentage.
predicted_nb = text_nb_clf.predict(twenty_test.data)
accuracy_nb = (predicted_nb == twenty_test.target).mean() * 100

In [8]:
# Naive Bayes accuracy on the test subset, in percent.
accuracy_nb

81.6914498141264

In [9]:
# Support vector machine: predict a label for every test document, then
# express the fraction of correct predictions as a percentage.
predicted_svm = text_svm_clf.predict(twenty_test.data)
accuracy_svm = (predicted_svm == twenty_test.target).mean() * 100

In [10]:
# SVM accuracy on the test subset, in percent.
accuracy_svm

82.249070631970255

### Step 5: Analysing results

In [12]:
# Test case: show the raw text of the first test document.
text1 = twenty_test.data[0]
# Function-call form of print works on both Python 2 and Python 3.
print(text1)

From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd.cc.buffalo.edu


 I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy.

			Neil Gandler



In [15]:
# True category name of the first test document: its integer label is an
# index into target_names.
expected = twenty_test.target_names[twenty_test.target[0]]
# Function-call form of print works on both Python 2 and Python 3.
print(expected)

rec.autos


In [20]:
# Category name the SVM pipeline predicted for the first test document.
observed = twenty_test.target_names[predicted_svm[0]]
# Function-call form of print works on both Python 2 and Python 3.
print(observed)

rec.autos


In [33]:
# True labels (indices into target_names) of the first 20 test documents.
twenty_test.target[:20]

array([ 7,  5,  0, 17, 19, 13, 15, 15,  5,  1,  2,  5, 17,  8,  0,  2,  4,
        1,  6, 16])

In [30]:
# Naive Bayes predictions for the first 20 test documents.
predicted_nb[:20]

array([ 7,  1,  0, 17,  0, 13, 15,  2,  5,  1,  2,  5, 17,  8, 15,  3,  2,
        1,  6, 16])

In [31]:
# SVM predictions for the first 20 test documents.
predicted_svm[:20]

array([ 7,  1,  0, 17,  0, 13, 15,  2,  5,  1,  2,  5, 17,  8, 15,  3,  2,
        1,  6, 16])