In [2]:
### Packages and Libraries we used in the Project
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Loading training and test data

from sklearn.datasets import fetch_20newsgroups
data_train = fetch_20newsgroups(subset='train', shuffle=True)

from sklearn.datasets import fetch_20newsgroups
data_test = fetch_20newsgroups(subset='test', shuffle=True)

In [4]:
data_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])

In [6]:
#Printing out all the categories

data_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [11]:
## Total Number of Documents
print('Number of documents:', len(data_train.filenames))

Number of documents: 11314


In [12]:
#Bag of words - Implementation of CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data_train.data)
X_train_counts.shape

(11314, 130107)

In [13]:
### using TF - IDF Vectorizer here, comments mentioned in the report


tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

<h1 align="center">Multinomial Naive Bayes Model</h1>

In [14]:
######### NAIVE BAYES MODEL ########

In [15]:
clf = MultinomialNB().fit(X_train_tfidf, data_train.target)

In [16]:
# Implementing Naive Bayes Model on training data set

NB = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

NB = NB.fit(data_train.data, data_train.target)

In [17]:
#Performance of NB model on training data set

predicted = NB.predict(data_train.data)
acc = np.mean(predicted == data_train.target)
print('Accuracy of Naive Bayes Classifier on training data set:', acc)

Accuracy of Naive Bayes Classifier on training data set: 0.9326498143892522


In [19]:
### Classification Report for Various Categories

print(classification_report(data_train.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.83      0.87       480
           comp.graphics       0.98      0.93      0.95       584
 comp.os.ms-windows.misc       0.97      0.95      0.96       591
comp.sys.ibm.pc.hardware       0.90      0.96      0.93       590
   comp.sys.mac.hardware       0.99      0.97      0.98       578
          comp.windows.x       0.99      0.96      0.97       593
            misc.forsale       0.97      0.86      0.91       585
               rec.autos       0.96      0.99      0.97       594
         rec.motorcycles       0.99      0.98      0.99       598
      rec.sport.baseball       0.99      0.98      0.99       597
        rec.sport.hockey       0.97      0.99      0.98       600
               sci.crypt       0.89      0.99      0.94       595
         sci.electronics       0.98      0.94      0.96       591
                 sci.med       1.00      0.96      0.98       594
         

In [20]:
##### Performance of model on test data set

predicted = NB.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of Naive Bayes Classifier on test data set:', acc)

Accuracy of Naive Bayes Classifier on test data set: 0.7738980350504514


In [21]:
#### Classification Report #####
print(classification_report(data_test.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      0.74      0.82       396
         

In [22]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [23]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.

gs_clf = GridSearchCV(NB, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(data_train.data, data_train.target)

In [26]:
acc = gs_clf.best_score_

In [24]:
#### Accuracy of Naive Bayes With Grid Search with Train Data
print('Accuracy of NB with Grid Search:', acc)

Accuracy of NB with Grid Search: 0.7738980350504514


In [25]:
### Accuracy of Naive Bayes with Grid Search for Test Data
predicted = gs_clf.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of NB with Grid search on test data:', acc)

Accuracy of NB with Grid search on test data: 0.8344397238449283


In [28]:
###### Best Parameters ####
print('best parameters for performance tuning:',gs_clf.best_params_)

best parameters for performance tuning: {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [33]:
# Removing stop words and implementing NB model

NB = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())])

NB = NB.fit(data_train.data, data_train.target)

In [34]:
# performance of NB model on training set after removing stop words

predicted = NB.predict(data_train.data)
acc = np.mean(predicted == data_train.target)
print('Accuracy of Naive Bayes Classifier on training data set:', acc)

Accuracy of Naive Bayes Classifier on training data set: 0.9573979140887396


In [35]:
## Classification Report for Above model
print(classification_report(data_train.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.94      0.93       480
           comp.graphics       0.98      0.95      0.97       584
 comp.os.ms-windows.misc       0.96      0.96      0.96       591
comp.sys.ibm.pc.hardware       0.91      0.96      0.94       590
   comp.sys.mac.hardware       0.98      0.98      0.98       578
          comp.windows.x       0.98      0.98      0.98       593
            misc.forsale       0.97      0.93      0.95       585
               rec.autos       0.97      0.98      0.98       594
         rec.motorcycles       0.98      0.99      0.99       598
      rec.sport.baseball       0.98      0.99      0.99       597
        rec.sport.hockey       0.97      1.00      0.98       600
               sci.crypt       0.98      0.99      0.98       595
         sci.electronics       0.98      0.96      0.97       591
                 sci.med       0.99      0.98      0.98       594
         

In [36]:
# Performance of NB model on test data after removing stop words

predicted = NB.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of Naive Bayes Classifier on test data set:', acc)

Accuracy of Naive Bayes Classifier on test data set: 0.8169144981412639


In [37]:
## Classification Report for Above Model
print(classification_report(data_test.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.69      0.74       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.79      0.72      0.75       394
comp.sys.ibm.pc.hardware       0.68      0.81      0.74       392
   comp.sys.mac.hardware       0.86      0.81      0.84       385
          comp.windows.x       0.87      0.78      0.82       395
            misc.forsale       0.87      0.80      0.83       390
               rec.autos       0.88      0.91      0.90       396
         rec.motorcycles       0.93      0.96      0.95       398
      rec.sport.baseball       0.91      0.92      0.92       397
        rec.sport.hockey       0.88      0.98      0.93       399
               sci.crypt       0.75      0.96      0.84       396
         sci.electronics       0.84      0.65      0.74       393
                 sci.med       0.92      0.79      0.85       396
         

In [38]:
# Changing the parameter of NB model
# Setting FitPrior as False

NB = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('nb', MultinomialNB(fit_prior=False))])

NB = NB.fit(data_train.data, data_train.target)

In [39]:
# Performance of NB model on training set after changing the model's parameter

predicted = NB.predict(data_train.data)
acc = np.mean(predicted == data_train.target)
print('Accuracy of Naive Bayes Classifier on training data set:', acc)

Accuracy of Naive Bayes Classifier on training data set: 0.9627894643804137


In [41]:
print(classification_report(data_train.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.96      0.94       480
           comp.graphics       0.98      0.95      0.97       584
 comp.os.ms-windows.misc       0.96      0.96      0.96       591
comp.sys.ibm.pc.hardware       0.91      0.96      0.94       590
   comp.sys.mac.hardware       0.98      0.98      0.98       578
          comp.windows.x       0.99      0.98      0.98       593
            misc.forsale       0.97      0.93      0.95       585
               rec.autos       0.97      0.98      0.98       594
         rec.motorcycles       0.99      0.99      0.99       598
      rec.sport.baseball       0.99      0.99      0.99       597
        rec.sport.hockey       0.98      0.99      0.99       600
               sci.crypt       0.98      0.99      0.99       595
         sci.electronics       0.98      0.96      0.97       591
                 sci.med       0.99      0.98      0.99       594
         

In [42]:
# Performance of NB model on test data after changing the model's parameter

predicted = NB.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of Naive Bayes Classifier on test data set:', acc)

Accuracy of Naive Bayes Classifier on test data set: 0.8214285714285714


In [45]:
print(classification_report(data_test.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.71      0.75       319
           comp.graphics       0.77      0.72      0.75       389
 comp.os.ms-windows.misc       0.79      0.72      0.75       394
comp.sys.ibm.pc.hardware       0.69      0.80      0.74       392
   comp.sys.mac.hardware       0.86      0.82      0.84       385
          comp.windows.x       0.87      0.78      0.82       395
            misc.forsale       0.87      0.80      0.83       390
               rec.autos       0.89      0.91      0.90       396
         rec.motorcycles       0.94      0.96      0.95       398
      rec.sport.baseball       0.91      0.92      0.92       397
        rec.sport.hockey       0.88      0.98      0.93       399
               sci.crypt       0.76      0.96      0.85       396
         sci.electronics       0.85      0.65      0.73       393
                 sci.med       0.93      0.78      0.85       396
         

In [46]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\Raja
[nltk_data]     Amlan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
# Stemming and Implementing NB model

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(data_train.data, data_train.target)



In [53]:
# Performance of NB model on training set after stemming

predicted_mnb_stemmed = text_mnb_stemmed.predict(data_train.data)

acc = np.mean(predicted_mnb_stemmed == data_train.target)
print('Accuracy is:', acc)

Accuracy is: 0.9554534205409227


In [54]:
print(classification_report(data_train.target, predicted_mnb_stemmed, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.94      0.93       480
           comp.graphics       0.98      0.94      0.96       584
 comp.os.ms-windows.misc       0.96      0.96      0.96       591
comp.sys.ibm.pc.hardware       0.90      0.96      0.93       590
   comp.sys.mac.hardware       0.97      0.97      0.97       578
          comp.windows.x       0.98      0.97      0.98       593
            misc.forsale       0.96      0.90      0.93       585
               rec.autos       0.97      0.98      0.97       594
         rec.motorcycles       0.99      0.98      0.99       598
      rec.sport.baseball       0.99      0.99      0.99       597
        rec.sport.hockey       0.97      0.99      0.98       600
               sci.crypt       0.96      0.99      0.97       595
         sci.electronics       0.97      0.95      0.96       591
                 sci.med       1.00      0.98      0.99       594
         

In [55]:
# Performance of NB model on test data set after stemming

predicted_mnb_stemmed = text_mnb_stemmed.predict(data_test.data)

acc = np.mean(predicted_mnb_stemmed == data_test.target)
print('Accuracy is:', acc)

Accuracy is: 0.8167817312798725


In [57]:
print(classification_report(data_test.target, predicted_mnb_stemmed, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.70      0.75       319
           comp.graphics       0.78      0.72      0.75       389
 comp.os.ms-windows.misc       0.82      0.69      0.75       394
comp.sys.ibm.pc.hardware       0.69      0.79      0.74       392
   comp.sys.mac.hardware       0.85      0.83      0.84       385
          comp.windows.x       0.86      0.79      0.83       395
            misc.forsale       0.88      0.75      0.81       390
               rec.autos       0.88      0.92      0.90       396
         rec.motorcycles       0.93      0.96      0.94       398
      rec.sport.baseball       0.93      0.92      0.92       397
        rec.sport.hockey       0.91      0.98      0.94       399
               sci.crypt       0.72      0.97      0.83       396
         sci.electronics       0.83      0.64      0.72       393
                 sci.med       0.92      0.78      0.84       396
         

<h1 align="center">SVM Model</h1>

In [58]:
############# SVM Model ###########

In [60]:
## Building SVM model on training data set and evalutaing its performance on the basis of accuracy

svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

svm = svm.fit(data_train.data, data_train.target)
predict = svm.predict(data_train.data)
acc = np.mean(predict == data_train.target)
print('Accuracy of SVM model on training data:', acc)



Accuracy of SVM model on training data: 0.966590065405692


In [61]:
print(classification_report(data_train.target, predict, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.95      0.96      0.95       480
           comp.graphics       0.98      0.94      0.96       584
 comp.os.ms-windows.misc       0.95      0.97      0.96       591
comp.sys.ibm.pc.hardware       0.94      0.94      0.94       590
   comp.sys.mac.hardware       0.99      0.97      0.98       578
          comp.windows.x       0.98      0.97      0.98       593
            misc.forsale       0.92      0.96      0.94       585
               rec.autos       0.98      0.98      0.98       594
         rec.motorcycles       0.98      0.99      0.99       598
      rec.sport.baseball       1.00      0.98      0.99       597
        rec.sport.hockey       0.97      1.00      0.98       600
               sci.crypt       0.98      1.00      0.99       595
         sci.electronics       0.99      0.94      0.97       591
                 sci.med       0.99      0.99      0.99       594
         

In [62]:
# Performance of SVM Model on test data

predict = svm.predict(data_test.data)
acc = np.mean(predict == data_test.target)
print('Accuracy of SVM model on test data:', acc)

Accuracy of SVM model on test data: 0.8238183749336165


In [63]:
print(classification_report(data_test.target, predict, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.72      0.72       319
           comp.graphics       0.80      0.70      0.74       389
 comp.os.ms-windows.misc       0.73      0.76      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.70      0.70       392
   comp.sys.mac.hardware       0.83      0.81      0.82       385
          comp.windows.x       0.83      0.77      0.80       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.92      0.89      0.91       396
         rec.motorcycles       0.92      0.96      0.94       398
      rec.sport.baseball       0.89      0.90      0.89       397
        rec.sport.hockey       0.88      0.99      0.93       399
               sci.crypt       0.83      0.96      0.89       396
         sci.electronics       0.83      0.60      0.70       393
                 sci.med       0.87      0.86      0.86       396
         

In [65]:

# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(data_train.data, data_train.target)

### Accuracy of this model with Training Data

acc = gs_clf_svm.best_score_
print('Accuracy of SVM Model with Grid Search:', acc)



Accuracy of SVM Model with Grid Search: 0.8979140887396146


In [66]:

### Accuracy of this model with Test Data.

predicted = gs_clf_svm.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of SVM with Grid Search on test data:', acc)

Accuracy of SVM with Grid Search on test data: 0.8331120552310144


In [67]:
### Best Parameters for this model.

print('best parameters for svm model:', gs_clf_svm.best_params_)

best parameters for svm model: {'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [68]:
# Building SVM model after removing stop words on training data set

svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()),
                ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42))])

svm = svm.fit(data_train.data, data_train.target)
predict = svm.predict(data_train.data)
acc = np.mean(predict == data_train.target)
print('Accuracy of SVM model on training data:', acc)



Accuracy of SVM model on training data: 0.9661481350539155


In [69]:
print(classification_report(data_train.target, predict, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.94      0.95      0.95       480
           comp.graphics       0.99      0.95      0.97       584
 comp.os.ms-windows.misc       0.95      0.97      0.96       591
comp.sys.ibm.pc.hardware       0.94      0.94      0.94       590
   comp.sys.mac.hardware       0.99      0.97      0.98       578
          comp.windows.x       0.98      0.97      0.98       593
            misc.forsale       0.93      0.95      0.94       585
               rec.autos       0.97      0.98      0.97       594
         rec.motorcycles       0.99      0.99      0.99       598
      rec.sport.baseball       0.99      0.98      0.99       597
        rec.sport.hockey       0.97      0.99      0.98       600
               sci.crypt       0.98      1.00      0.99       595
         sci.electronics       0.99      0.94      0.96       591
                 sci.med       0.99      0.99      0.99       594
         

In [70]:
# Performance of SVM Model on test data after removing stop words

predict = svm.predict(data_test.data)
acc = np.mean(predict == data_test.target)
print('Accuracy of SVM model on test data:', acc)

Accuracy of SVM model on test data: 0.8224907063197026


In [71]:
print(classification_report(data_test.target, predict, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.71      0.71       319
           comp.graphics       0.79      0.70      0.74       389
 comp.os.ms-windows.misc       0.73      0.77      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.68      0.69       392
   comp.sys.mac.hardware       0.82      0.82      0.82       385
          comp.windows.x       0.84      0.77      0.80       395
            misc.forsale       0.82      0.87      0.85       390
               rec.autos       0.91      0.89      0.90       396
         rec.motorcycles       0.92      0.97      0.94       398
      rec.sport.baseball       0.90      0.91      0.90       397
        rec.sport.hockey       0.86      0.98      0.92       399
               sci.crypt       0.85      0.96      0.90       396
         sci.electronics       0.81      0.62      0.70       393
                 sci.med       0.90      0.87      0.88       396
         

In [72]:
# Stemming and implementing SVM model

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_svm_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()),('svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])

text_svm_stemmed = text_svm_stemmed.fit(data_train.data, data_train.target)



In [73]:
# Performance of SVM model on training set after stemming

predicted_svm_stemmed = text_svm_stemmed.predict(data_train.data)

acc = np.mean(predicted_svm_stemmed == data_train.target)
print('Accuracy is:', acc)

Accuracy is: 0.9593424076365564


In [74]:
# Performance of SVM model on test data set after stemming

predicted_svm_stemmed = text_svm_stemmed.predict(data_test.data)

acc = np.mean(predicted_svm_stemmed == data_test.target)
print('Accuracy is:', acc)

Accuracy is: 0.8194370685077005


<h1 align="center">Logistic Regression Model</h1>

In [75]:
########## Logistic Regression Model ##########

In [76]:
# Implementing Logistic Regression Model on training data set

pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('lr', LogisticRegression(random_state=0))])

pipe = pipe.fit(data_train.data, data_train.target)

In [82]:
#Performance of Logistic regression model on training data set

predicted = pipe.predict(data_train.data)
acc = np.mean(predicted == data_train.target)
print('Accuracy of Logistic Regression Model on training data set:', acc)

Accuracy of Logistic Regression Model on training data set: 0.9698603500088386


In [83]:
print(classification_report(data_train.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.97      0.97      0.97       480
           comp.graphics       0.93      0.96      0.95       584
 comp.os.ms-windows.misc       0.95      0.97      0.96       591
comp.sys.ibm.pc.hardware       0.93      0.94      0.94       590
   comp.sys.mac.hardware       0.99      0.97      0.98       578
          comp.windows.x       0.98      0.97      0.98       593
            misc.forsale       0.90      0.95      0.93       585
               rec.autos       0.98      0.97      0.97       594
         rec.motorcycles       1.00      0.99      0.99       598
      rec.sport.baseball       0.99      0.99      0.99       597
        rec.sport.hockey       0.99      0.99      0.99       600
               sci.crypt       0.99      0.98      0.99       595
         sci.electronics       0.97      0.97      0.97       591
                 sci.med       0.99      0.98      0.99       594
         

In [84]:
### Performance of Log. Regression on Test Data
predicted = pipe.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of Logistic Regression Model on test data set:', acc)

Accuracy of Logistic Regression Model on test data set: 0.8279341476367499


In [85]:
print(classification_report(data_test.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.74      0.77       319
           comp.graphics       0.69      0.78      0.74       389
 comp.os.ms-windows.misc       0.76      0.75      0.75       394
comp.sys.ibm.pc.hardware       0.73      0.72      0.72       392
   comp.sys.mac.hardware       0.81      0.83      0.82       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.76      0.90      0.83       390
               rec.autos       0.91      0.89      0.90       396
         rec.motorcycles       0.94      0.95      0.94       398
      rec.sport.baseball       0.87      0.93      0.90       397
        rec.sport.hockey       0.94      0.96      0.95       399
               sci.crypt       0.93      0.89      0.91       396
         sci.electronics       0.76      0.78      0.77       393
                 sci.med       0.89      0.84      0.86       396
         

In [86]:
# Implementing Logistic Regression Model on training data set after removing stop words

LR = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('lr', LogisticRegression())])

LR = LR.fit(data_train.data, data_train.target)

In [87]:
# Evaluating performance of Logistic regression model on training set after removing stop word
predicted = LR.predict(data_train.data)
acc = np.mean(predicted == data_train.target)
print('Accuracy of Logistic Regression Model on training data set:', acc)

Accuracy of Logistic Regression Model on training data set: 0.9746331978080255


In [88]:
# Evaluating performance of Logistic regression model on training set after removing stop word
predicted = LR.predict(data_test.data)
acc = np.mean(predicted == data_test.target)
print('Accuracy of Logistic Regression Model on test data set:', acc)

Accuracy of Logistic Regression Model on test data set: 0.8297928836962294


In [89]:
print(classification_report(data_test.target, predicted, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.78      0.72      0.75       319
           comp.graphics       0.71      0.79      0.75       389
 comp.os.ms-windows.misc       0.75      0.76      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.71      0.71       392
   comp.sys.mac.hardware       0.80      0.82      0.81       385
          comp.windows.x       0.84      0.75      0.79       395
            misc.forsale       0.78      0.87      0.82       390
               rec.autos       0.90      0.89      0.89       396
         rec.motorcycles       0.93      0.95      0.94       398
      rec.sport.baseball       0.88      0.92      0.90       397
        rec.sport.hockey       0.93      0.96      0.94       399
               sci.crypt       0.95      0.91      0.93       396
         sci.electronics       0.74      0.78      0.76       393
                 sci.med       0.88      0.86      0.87       396
         

In [90]:
# Stemming and building Logistic regression model on training set and predicting the accuracy for the model on training and test data sets

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', LogisticRegression())])

text_mnb_stemmed = text_mnb_stemmed.fit(data_train.data, data_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(data_train.data)

acc = np.mean(predicted_mnb_stemmed == data_train.target)

# Evaluating performance of LR model on test data
predicted_mnb_stemmed2 = text_mnb_stemmed.predict(data_test.data)

acc2 = np.mean(predicted_mnb_stemmed2 == data_test.target)
print('Accuracy of LR model after stemming on train set:', acc)
print('Accuracy of LR model after stemming on test set:', acc2)

Accuracy of LR model after stemming on train set: 0.969948736079194
Accuracy of LR model after stemming on test set: 0.8321826872012745


In [92]:
### Classification report for the categories ###
print(classification_report(data_test.target, predicted_mnb_stemmed2, target_names=data_train.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.71      0.74       319
           comp.graphics       0.72      0.80      0.76       389
 comp.os.ms-windows.misc       0.76      0.76      0.76       394
comp.sys.ibm.pc.hardware       0.72      0.73      0.73       392
   comp.sys.mac.hardware       0.80      0.84      0.82       385
          comp.windows.x       0.84      0.77      0.81       395
            misc.forsale       0.75      0.85      0.80       390
               rec.autos       0.91      0.88      0.90       396
         rec.motorcycles       0.97      0.95      0.96       398
      rec.sport.baseball       0.91      0.92      0.92       397
        rec.sport.hockey       0.93      0.97      0.95       399
               sci.crypt       0.94      0.92      0.93       396
         sci.electronics       0.75      0.78      0.77       393
                 sci.med       0.88      0.86      0.87       396
         