# 20 Newsgroups - Document Classification

### The various steps performed are:
1. Data Exploration and Cleaning
2. Data Transformation / Feature Extraction
3. Modeling
4. Training and testing results of the models
5. Comparative analysis of accuracy to find the best model

In [2]:
!pip install langid
#Importing Necessary Libraries
from sklearn.datasets import fetch_20newsgroups
import re
import string
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn import metrics

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier



In [3]:
#loading the training and test dataset

newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state = 50)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, random_state = 42)

In [4]:
#prints all the categories that are present in the dataset
# There are 20 categories of news
newsgroups_train.target_names 

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
#prints first line of the first data file
print("\n".join(newsgroups_train.data[0].split("\n")[:3])) 

From: twork@egr.msu.edu (Michael Twork)
Subject: Re: Thumbs up to ESPN
Organization: Michigan State University


# Data Cleaning

####  1. Remove Non Ascii characters

In [6]:
def strip_non_ascii(data_str):
    ''' Returns the string without non ASCII characters''' 
    stripped = (c for c in data_str if 0 < ord(c) < 127) 
    return ''.join(stripped)

####  2. Clean the corpus by removing unwanted information

In [7]:
def remove_features(data_str): # compile regex
    url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?') 
    punc_re = re.compile('[%s]' % re.escape(string.punctuation)) 
    num_re = re.compile('(\\d+)')
    mention_re = re.compile('@(\w+)')
    alpha_num_re = re.compile("^[a-z0-9_.]+$")
    # convert to lowercase
    data_str = data_str.lower()
    # remove hyperlinks
    data_str = url_re.sub(' ', data_str)
    # remove @mentions
    data_str = mention_re.sub(' ', data_str)
    # remove puncuation
    data_str = punc_re.sub(' ', data_str)
    # remove numeric 'words'
    data_str = num_re.sub(' ', data_str)
    # remove non a-z 0-9 characters and words shorter than 1 characters 
    list_pos = 0
    cleaned_str = ''
    for word in data_str.split():
        if list_pos == 0:
            if alpha_num_re.match(word) and len(word) > 1:
                cleaned_str = word 
            else:
                cleaned_str = ' '
        else:
            if alpha_num_re.match(word) and len(word) > 1:
                cleaned_str = cleaned_str + ' ' + word 
            else:
                cleaned_str += ' '
        list_pos += 1
    
    return " ".join(cleaned_str.split())

#### 3. Perform Lemmatization

In [8]:
def lemmatize(data_str):
    # expects a string
    list_pos = 0
    cleaned_str = ''
    lmtzr = WordNetLemmatizer() 
    text = data_str.split() 
    tagged_words = pos_tag(text) 
    for word in tagged_words:
        if 'v' in word[1].lower():
            lemma = lmtzr.lemmatize(word[0], pos='v')
        else:
            lemma = lmtzr.lemmatize(word[0], pos='n')
        if list_pos == 0: 
            cleaned_str = lemma
        else:
            cleaned_str = cleaned_str + ' ' + lemma
        list_pos += 1 
    return cleaned_str

#### Clean train and test data by applying the fuctions created above

In [17]:
# Cleaning Train Data
train_clean = []
for i in range(len(newsgroups_train.data)):
    res = remove_features(newsgroups_train.data[i])
    res1 = strip_non_ascii(res)
    res2 = lemmatize(res1)
    train_clean.append(res2)

In [18]:
# Cleaning Test Data
test_clean = []
for i in range(len(newsgroups_test.data)):
    res = remove_features(newsgroups_test.data[i])
    res1 = strip_non_ascii(res)
    res2 = lemmatize(res1)
    test_clean.append(res2)

### Data Transformation

In [21]:
#Creating a TFIDF vectorizer
tv = TfidfVectorizer(stop_words="english",min_df = 0.01, max_df = 0.85)
train_tv = tv.fit_transform(train_clean).todense()
test_tv = tv.transform(test_clean)
train_tv

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.08185201,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [23]:
# Converting the vector to dataframe
dt = pd.DataFrame(train_tv,columns=tv.get_feature_names())

In [24]:
dt.head()

Unnamed: 0,aa,ab,ability,able,absolute,absolutely,abuse,ac,academic,accept,...,writer,writes,wrong,yeah,year,yes,yesterday,york,young,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.101277,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.42459,0.0,0.0,0.0,0.0,0.0


In [27]:
#Creating Document term matrix using Count Vectorizer
cv = CountVectorizer(stop_words="english", min_df = 0.01, max_df = 0.9)
train_cv = cv.fit_transform(train_clean)
test_cv = cv.transform(test_clean)

### Text Classification Modeling :
    1. Naive Bayes
    2. SGD Classifier
    3. Support Vector Machines
    4. Random Forest
    5. Logistic Regression
    6. Decision Tree

#### 1.1 Naive Bayes using tf-idf

In [32]:
# Naive Bayes using TFIDF
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB(alpha=0.1))])

text_clf.fit(train_clean, newsgroups_train.target)  

pred_test = text_clf.predict(test_clean)
pred_train = text_clf.predict(train_clean)

In [37]:
# # Performance Metrics for train data
print("train data: Naive Bayes Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: Naive Bayes precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Naive Bayes recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Naive Bayes f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: Naive Bayes Accuracy Score ->  98.18808555771611
train data: Naive Bayes precision Score ->  98.28757106016826
train data: Naive Bayes recall Score ->  97.78470301504868
train data: Naive Bayes f1 Score ->  97.93053518942601


In [38]:
# # Performance Metrics for test data
print("Test data: Naive Bayes Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("Test data: Naive Bayes precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Naive Bayes recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Naive Bayes f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

Test data: Naive Bayes Accuracy Score ->  82.58098778544876
Test data: Naive Bayes precision Score ->  83.7520034786151
Test data: Naive Bayes recall Score ->  81.36526403917198
Test data: Naive Bayes f1 Score ->  81.3218215415369


In [41]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.73      0.78       319
           comp.graphics       0.76      0.75      0.75       389
 comp.os.ms-windows.misc       0.77      0.64      0.70       394
comp.sys.ibm.pc.hardware       0.66      0.78      0.72       392
   comp.sys.mac.hardware       0.83      0.84      0.84       385
          comp.windows.x       0.86      0.79      0.83       395
            misc.forsale       0.90      0.72      0.80       390
               rec.autos       0.88      0.91      0.90       396
         rec.motorcycles       0.90      0.97      0.93       398
      rec.sport.baseball       0.94      0.95      0.95       397
        rec.sport.hockey       0.95      0.97      0.96       399
               sci.crypt       0.80      0.95      0.87       396
         sci.electronics       0.79      0.75      0.77       393
                 sci.med       0.90      0.85      0.88       396
         

#### 1.2 Naive Bayes using countvectorizer

In [42]:
# Model Fitting
clf2 = MultinomialNB(alpha=.01)
clf2.fit(train_cv, newsgroups_train.target)
pred_test = clf2.predict(test_cv)
pred_train = clf2.predict(train_cv)

In [43]:
# # Performance Metrics for train data
print("train data: Naive Bayes Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: Naive Bayes precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Naive Bayes recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Naive Bayes f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: Naive Bayes Accuracy Score ->  85.0715927169878
train data: Naive Bayes precision Score ->  85.5164254833315
train data: Naive Bayes recall Score ->  84.99201621489777
train data: Naive Bayes f1 Score ->  84.98016685058315


In [44]:
# # Performance Metrics for train data
print("Test data: Naive Bayes Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("Test data: Naive Bayes precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Naive Bayes recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Naive Bayes f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

Test data: Naive Bayes Accuracy Score ->  70.06107275624004
Test data: Naive Bayes precision Score ->  70.27762853143636
Test data: Naive Bayes recall Score ->  69.38217556149547
Test data: Naive Bayes f1 Score ->  69.07111685825271


In [45]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test))

                          precision    recall  f1-score   support

             alt.atheism       0.66      0.67      0.66       319
           comp.graphics       0.51      0.68      0.58       389
 comp.os.ms-windows.misc       0.67      0.27      0.38       394
comp.sys.ibm.pc.hardware       0.53      0.58      0.55       392
   comp.sys.mac.hardware       0.58      0.67      0.62       385
          comp.windows.x       0.71      0.65      0.68       395
            misc.forsale       0.71      0.79      0.75       390
               rec.autos       0.70      0.76      0.73       396
         rec.motorcycles       0.70      0.88      0.78       398
      rec.sport.baseball       0.86      0.83      0.85       397
        rec.sport.hockey       0.91      0.88      0.90       399
               sci.crypt       0.88      0.81      0.84       396
         sci.electronics       0.59      0.61      0.60       393
                 sci.med       0.76      0.68      0.72       396
         

#### 2.1 SGD classifier using tf-idf

In [47]:
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',
alpha=1e-3, random_state=42,max_iter=5, tol=0.0001))])

text_clf.fit(train_clean, newsgroups_train.target)  

pred_test = text_clf.predict(test_clean)
pred_train = text_clf.predict(train_clean)



In [48]:
print("train data: SGD Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: SGD precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SGD recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SGD f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: SGD Accuracy Score ->  95.62488951741206
train data: SGD precision Score ->  95.83029676163434
train data: SGD recall Score ->  95.04553312471998
train data: SGD f1 Score ->  95.22550933150444


In [49]:
print("test data: SGD Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("test data: SGD precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SGD recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SGD f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

test data: SGD Accuracy Score ->  81.82421667551779
test data: SGD precision Score ->  82.38162486647323
test data: SGD recall Score ->  80.66383171630622
test data: SGD f1 Score ->  80.50730381817907


In [50]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.71      0.69      0.70       319
           comp.graphics       0.79      0.73      0.76       389
 comp.os.ms-windows.misc       0.75      0.70      0.72       394
comp.sys.ibm.pc.hardware       0.74      0.69      0.71       392
   comp.sys.mac.hardware       0.80      0.83      0.82       385
          comp.windows.x       0.84      0.75      0.79       395
            misc.forsale       0.82      0.86      0.84       390
               rec.autos       0.89      0.88      0.88       396
         rec.motorcycles       0.91      0.95      0.93       398
      rec.sport.baseball       0.90      0.91      0.90       397
        rec.sport.hockey       0.88      0.99      0.93       399
               sci.crypt       0.82      0.96      0.88       396
         sci.electronics       0.83      0.65      0.73       393
                 sci.med       0.89      0.86      0.88       396
         

#### 2.2 SGD classifier using count vectorizer

In [51]:
clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=0.0001)
clf.fit(train_cv, newsgroups_train.target)
pred_train1 = clf.predict(train_cv)
pred_test = clf.predict(test_cv)

In [52]:
print("train data: SGD Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train1)*100)
print("train data: SGD precision Score -> ",precision_score(newsgroups_train.target, pred_train1,average='macro')*100)
print("train data: SGD recall Score -> ",recall_score(newsgroups_train.target, pred_train1,average='macro')*100)
print("train data: SGD f1 Score -> ",f1_score(newsgroups_train.target, pred_train1, average='macro')*100)

train data: SGD Accuracy Score ->  92.9998232278593
train data: SGD precision Score ->  93.32875462519206
train data: SGD recall Score ->  93.08295060265847
train data: SGD f1 Score ->  93.09501077340695


In [53]:
print("test data: SGD Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("test data: SGD precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SGD recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SGD f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

test data: SGD Accuracy Score ->  68.73340414232607
test data: SGD precision Score ->  69.1596266654847
test data: SGD recall Score ->  67.9838431570855
test data: SGD f1 Score ->  67.85005965681187


In [54]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.51      0.65      0.57       319
           comp.graphics       0.61      0.54      0.57       389
 comp.os.ms-windows.misc       0.65      0.49      0.56       394
comp.sys.ibm.pc.hardware       0.53      0.59      0.56       392
   comp.sys.mac.hardware       0.51      0.79      0.62       385
          comp.windows.x       0.62      0.70      0.66       395
            misc.forsale       0.73      0.75      0.74       390
               rec.autos       0.77      0.74      0.76       396
         rec.motorcycles       0.82      0.83      0.82       398
      rec.sport.baseball       0.83      0.76      0.80       397
        rec.sport.hockey       0.86      0.90      0.88       399
               sci.crypt       0.79      0.84      0.81       396
         sci.electronics       0.69      0.44      0.54       393
                 sci.med       0.80      0.63      0.70       396
         

#### 3.1 Support Vector Machines using TFIDF

In [55]:
#Creating a TFIDF vectorizer
tv = TfidfVectorizer(stop_words="english",min_df = 0.01, max_df = 0.95)
train_tv1 = tv.fit_transform(train_clean).todense()
test_tv1 = tv.transform(test_clean).todense()
train_tv1

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.08185201,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [57]:
# Running SVM for TFIDF
svclassifier = SVC(kernel='linear', C=0.1, gamma = 0.1)  
svclassifier.fit(train_tv1, newsgroups_train.target)
pred_train = svclassifier.predict(train_tv1)
pred_test = svclassifier.predict(test_tv1)

In [58]:
# # Performance Metrics for train data
print("train data: SVM Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: SVM precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SVM recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SVM f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: SVM Accuracy Score ->  80.91744741028813
train data: SVM precision Score ->  83.68017415239343
train data: SVM recall Score ->  79.9555309571556
train data: SVM f1 Score ->  80.42712364571578


In [59]:
# # Performance Metrics for test data
print("test data: SVM Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("test data: SVM precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SVM recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SVM f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

test data: SVM Accuracy Score ->  69.06532129580457
test data: SVM precision Score ->  72.8444291707015
test data: SVM recall Score ->  67.74937145779408
test data: SVM f1 Score ->  68.35824909958883


In [60]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.64      0.53      0.58       319
           comp.graphics       0.58      0.69      0.63       389
 comp.os.ms-windows.misc       0.69      0.63      0.66       394
comp.sys.ibm.pc.hardware       0.61      0.61      0.61       392
   comp.sys.mac.hardware       0.74      0.63      0.68       385
          comp.windows.x       0.74      0.62      0.67       395
            misc.forsale       0.79      0.73      0.76       390
               rec.autos       0.82      0.72      0.76       396
         rec.motorcycles       0.87      0.81      0.84       398
      rec.sport.baseball       0.78      0.86      0.82       397
        rec.sport.hockey       0.93      0.85      0.89       399
               sci.crypt       0.96      0.69      0.80       396
         sci.electronics       0.34      0.78      0.47       393
                 sci.med       0.67      0.67      0.67       396
         

#### 3.2 SVM using Count Vectorizer

In [61]:
# Running SVM for Count Vectorizer
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear', C=0.1, gamma = 0.1)  
svclassifier.fit(train_cv, newsgroups_train.target)
pred_train = svclassifier.predict(train_cv)
pred_test = svclassifier.predict(test_cv)

In [62]:
# # Performance Metrics for train data
print("train data: SVM Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: SVM precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SVM recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: SVM f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: SVM Accuracy Score ->  99.14265511755347
train data: SVM precision Score ->  99.18173502939864
train data: SVM recall Score ->  99.15841647412218
train data: SVM f1 Score ->  99.1682722202168


In [63]:
# # Performance Metrics for train data
print("test data: SVM Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("test data: SVM precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SVM recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: SVM f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

test data: SVM Accuracy Score ->  67.55177907594265
test data: SVM precision Score ->  67.38747865113191
test data: SVM recall Score ->  66.77518016376872
test data: SVM f1 Score ->  66.78307179022993


In [64]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.53      0.65      0.58       319
           comp.graphics       0.49      0.63      0.55       389
 comp.os.ms-windows.misc       0.63      0.57      0.60       394
comp.sys.ibm.pc.hardware       0.57      0.58      0.58       392
   comp.sys.mac.hardware       0.59      0.65      0.62       385
          comp.windows.x       0.69      0.62      0.65       395
            misc.forsale       0.73      0.78      0.76       390
               rec.autos       0.70      0.70      0.70       396
         rec.motorcycles       0.84      0.81      0.82       398
      rec.sport.baseball       0.72      0.77      0.75       397
        rec.sport.hockey       0.88      0.84      0.86       399
               sci.crypt       0.80      0.79      0.79       396
         sci.electronics       0.56      0.52      0.54       393
                 sci.med       0.71      0.58      0.64       396
         

#### 4.1 RandomForest Classifier using tf-idf

In [66]:
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', RandomForestClassifier(n_estimators=500,criterion='entropy',random_state=0,max_depth=20, min_impurity_decrease=0.001))])

text_clf.fit(train_clean, newsgroups_train.target)  

pred_test = text_clf.predict(test_clean)
pred_train = text_clf.predict(train_clean) 

In [67]:
# # Performance Metrics for train data
print("train data: Random Forest Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: Random Forest precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Random Forest recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Random Forest f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: Random Forest Accuracy Score ->  89.64999116139296
train data: Random Forest precision Score ->  91.9691029577696
train data: Random Forest recall Score ->  88.77855109333403
train data: Random Forest f1 Score ->  89.35801510035198


In [68]:
# # Performance Metrics for test data
print("Test data: Random Forest Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("Test data: Random Forest precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Random Forest recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("Test data: Random Forest f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

Test data: Random Forest Accuracy Score ->  69.87519915029209
Test data: Random Forest precision Score ->  74.9423068244963
Test data: Random Forest recall Score ->  68.1724575196023
Test data: Random Forest f1 Score ->  67.35790305785214


In [69]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.48      0.57       319
           comp.graphics       0.58      0.61      0.59       389
 comp.os.ms-windows.misc       0.62      0.68      0.65       394
comp.sys.ibm.pc.hardware       0.64      0.61      0.62       392
   comp.sys.mac.hardware       0.80      0.67      0.73       385
          comp.windows.x       0.70      0.67      0.68       395
            misc.forsale       0.45      0.94      0.61       390
               rec.autos       0.83      0.73      0.78       396
         rec.motorcycles       0.83      0.86      0.85       398
      rec.sport.baseball       0.83      0.81      0.82       397
        rec.sport.hockey       0.83      0.93      0.88       399
               sci.crypt       0.81      0.89      0.85       396
         sci.electronics       0.73      0.40      0.51       393
                 sci.med       0.85      0.56      0.67       396
         

#### 4.2 RandomfForest classifier using countvectorizer

In [70]:
clf2=RandomForestClassifier(n_estimators=500,criterion='entropy',random_state=0,max_depth=20, min_impurity_decrease=0.001)
clf2.fit(train_cv, newsgroups_train.target)
pred_train2 = clf2.predict(train_cv)
pred_test2 = clf2.predict(test_cv)

In [71]:
print("train data: Random Forest Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train2)*100)
print("train data: Random Forest precision Score -> ",precision_score(newsgroups_train.target, pred_train2,average='macro')*100)
print("train data: Random Forest recall Score -> ",recall_score(newsgroups_train.target, pred_train2,average='macro')*100)
print("train data: Random Forest f1 Score -> ",f1_score(newsgroups_train.target, pred_train2, average='macro')*100)

train data: Random Forest Accuracy Score ->  81.51847268870426
train data: Random Forest precision Score ->  83.2006089258025
train data: Random Forest recall Score ->  80.84915598285883
train data: Random Forest f1 Score ->  81.33508701152319


In [72]:
print("Test data: Random Forest Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test2)*100)
print("Test data: Random Forest precision Score -> ",precision_score(newsgroups_test.target, pred_test2,average='macro')*100)
print("Test data: Random Forest recall Score -> ",recall_score(newsgroups_test.target, pred_test2,average='macro')*100)
print("Test data: Random Forest f1 Score -> ",f1_score(newsgroups_test.target, pred_test2, average='macro')*100)

Test data: Random Forest Accuracy Score ->  65.06903876792353
Test data: Random Forest precision Score ->  66.44049573687066
Test data: Random Forest recall Score ->  63.75034501522342
Test data: Random Forest f1 Score ->  63.651407039617226


In [73]:
print(metrics.classification_report(newsgroups_test.target, pred_test2, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test2 ))

                          precision    recall  f1-score   support

             alt.atheism       0.66      0.49      0.56       319
           comp.graphics       0.52      0.58      0.55       389
 comp.os.ms-windows.misc       0.54      0.67      0.60       394
comp.sys.ibm.pc.hardware       0.54      0.55      0.54       392
   comp.sys.mac.hardware       0.72      0.61      0.66       385
          comp.windows.x       0.68      0.48      0.56       395
            misc.forsale       0.66      0.79      0.72       390
               rec.autos       0.76      0.66      0.71       396
         rec.motorcycles       0.88      0.81      0.84       398
      rec.sport.baseball       0.78      0.74      0.76       397
        rec.sport.hockey       0.83      0.86      0.85       399
               sci.crypt       0.84      0.83      0.84       396
         sci.electronics       0.34      0.39      0.36       393
                 sci.med       0.47      0.63      0.54       396
         

#### 5.1 Logistic Regression using tf-idf

In [75]:
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LogisticRegression())])

text_clf.fit(train_clean, newsgroups_train.target)  

pred_test = text_clf.predict(test_clean)
pred_train = text_clf.predict(train_clean)  



In [76]:
# Performance Metrics for train data
print("train data: Logistic Regression Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train)*100)
print("train data: Logistic Regression precision Score -> ",precision_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Logistic Regression recall Score -> ",recall_score(newsgroups_train.target, pred_train,average='macro')*100)
print("train data: Logistic Regression f1 Score -> ",f1_score(newsgroups_train.target, pred_train, average='macro')*100)

train data: Logistic Regression Accuracy Score ->  96.47339579282304
train data: Logistic Regression precision Score ->  96.60419387727973
train data: Logistic Regression recall Score ->  96.21266996821467
train data: Logistic Regression f1 Score ->  96.35813188687166


In [77]:
# Performance Metrics for train data
print("test data: Logistic Regression Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test)*100)
print("test data: Logistic Regression precision Score -> ",precision_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: Logistic Regression recall Score -> ",recall_score(newsgroups_test.target, pred_test,average='macro')*100)
print("test data: Logistic Regression f1 Score -> ",f1_score(newsgroups_test.target, pred_test, average='macro')*100)

test data: Logistic Regression Accuracy Score ->  82.14285714285714
test data: Logistic Regression precision Score ->  82.31316449560545
test data: Logistic Regression recall Score ->  81.13012352679375
test data: Logistic Regression f1 Score ->  81.24197622183736


In [78]:
print(metrics.classification_report(newsgroups_test.target, pred_test, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test ))

                          precision    recall  f1-score   support

             alt.atheism       0.77      0.71      0.74       319
           comp.graphics       0.70      0.78      0.74       389
 comp.os.ms-windows.misc       0.73      0.71      0.72       394
comp.sys.ibm.pc.hardware       0.71      0.72      0.72       392
   comp.sys.mac.hardware       0.82      0.82      0.82       385
          comp.windows.x       0.81      0.74      0.77       395
            misc.forsale       0.74      0.86      0.80       390
               rec.autos       0.90      0.87      0.89       396
         rec.motorcycles       0.95      0.94      0.94       398
      rec.sport.baseball       0.90      0.93      0.91       397
        rec.sport.hockey       0.95      0.96      0.95       399
               sci.crypt       0.92      0.89      0.91       396
         sci.electronics       0.74      0.78      0.76       393
                 sci.med       0.87      0.86      0.86       396
         

#### 5.2 Logistic Regression using countvectorizer

In [79]:
lr = LogisticRegression()

# Train the model
lr.fit(train_cv, newsgroups_train.target)

pred_train2= lr.predict(train_cv)
pred_test2 = lr.predict(test_cv)

In [80]:
print("train data: Logistic Regression Accuracy Score -> ",accuracy_score(newsgroups_train.target, pred_train2)*100)
print("train data: Logistic Regression precision Score -> ",precision_score(newsgroups_train.target, pred_train2,average='macro')*100)
print("train data: Logistic Regression recall Score -> ",recall_score(newsgroups_train.target, pred_train2,average='macro')*100)
print("train data: Logistic Regression f1 Score -> ",f1_score(newsgroups_train.target, pred_train2, average='macro')*100)

train data: Logistic Regression Accuracy Score ->  99.34594307937068
train data: Logistic Regression precision Score ->  99.37330439916387
train data: Logistic Regression recall Score ->  99.35920962531006
train data: Logistic Regression f1 Score ->  99.36513975002069


In [81]:
print("test data: Logistic Regression Accuracy Score -> ",accuracy_score(newsgroups_test.target, pred_test2)*100)
print("test data: Logistic Regression precision Score -> ",precision_score(newsgroups_test.target, pred_test2,average='macro')*100)
print("test data: Logistic Regression recall Score -> ",recall_score(newsgroups_test.target, pred_test2,average='macro')*100)
print("test data: Logistic Regression f1 Score -> ",f1_score(newsgroups_test.target, pred_test2, average='macro')*100)

test data: Logistic Regression Accuracy Score ->  70.06107275624004
test data: Logistic Regression precision Score ->  69.77587967499662
test data: Logistic Regression recall Score ->  69.21498840566878
test data: Logistic Regression f1 Score ->  69.3242005120084


In [82]:
print(metrics.classification_report(newsgroups_test.target, pred_test2, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, pred_test2 ))

                          precision    recall  f1-score   support

             alt.atheism       0.60      0.63      0.62       319
           comp.graphics       0.57      0.59      0.58       389
 comp.os.ms-windows.misc       0.62      0.59      0.61       394
comp.sys.ibm.pc.hardware       0.53      0.60      0.56       392
   comp.sys.mac.hardware       0.60      0.71      0.65       385
          comp.windows.x       0.67      0.64      0.65       395
            misc.forsale       0.76      0.78      0.77       390
               rec.autos       0.74      0.77      0.75       396
         rec.motorcycles       0.85      0.82      0.84       398
      rec.sport.baseball       0.78      0.81      0.79       397
        rec.sport.hockey       0.91      0.88      0.89       399
               sci.crypt       0.88      0.81      0.84       396
         sci.electronics       0.58      0.59      0.58       393
                 sci.med       0.76      0.68      0.72       396
         

#### 6.1 Decision Tree using TFIDF

In [84]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=50, splitter = 'random',min_samples_split=2, max_leaf_nodes = 80)
clf_entropy.fit(train_tv, newsgroups_train.target)
test_pred = clf_entropy.predict(test_tv)
train_pred = clf_entropy.predict(train_tv)

In [85]:
# Performance Metrics for train data
print("train data: Decision Tree Accuracy Score -> ",accuracy_score(newsgroups_train.target, train_pred)*100)
print("train data: Decision Tree precision Score -> ",precision_score(newsgroups_train.target, train_pred,average='macro')*100)
print("train data: Decision Tree recall Score -> ",recall_score(newsgroups_train.target, train_pred,average='macro')*100)
print("train data: Decision Tree f1 Score -> ",f1_score(newsgroups_train.target, train_pred, average='macro')*100)

train data: Decision Tree Accuracy Score ->  48.96588297684285
train data: Decision Tree precision Score ->  53.43252484761276
train data: Decision Tree recall Score ->  47.59761747978876
train data: Decision Tree f1 Score ->  47.96333061113469


In [86]:
# Performance Metrics for train data
print("test data: Decision Tree Accuracy Score -> ",accuracy_score(newsgroups_test.target, test_pred)*100)
print("test data: Decision Tree precision Score -> ",precision_score(newsgroups_test.target, test_pred,average='macro')*100)
print("test data: Decision Tree recall Score -> ",recall_score(newsgroups_test.target, test_pred,average='macro')*100)
print("test data: Decision Tree f1 Score -> ",f1_score(newsgroups_test.target, test_pred, average='macro')*100)

test data: Decision Tree Accuracy Score ->  42.84386617100372
test data: Decision Tree precision Score ->  46.997319918575485
test data: Decision Tree recall Score ->  41.430644216646115
test data: Decision Tree f1 Score ->  41.89742138923517


In [87]:
print(metrics.classification_report(newsgroups_test.target, test_pred, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, test_pred ))

                          precision    recall  f1-score   support

             alt.atheism       0.31      0.12      0.17       319
           comp.graphics       0.48      0.38      0.42       389
 comp.os.ms-windows.misc       0.56      0.56      0.56       394
comp.sys.ibm.pc.hardware       0.42      0.34      0.38       392
   comp.sys.mac.hardware       0.61      0.47      0.53       385
          comp.windows.x       0.15      0.28      0.20       395
            misc.forsale       0.73      0.48      0.58       390
               rec.autos       0.71      0.58      0.64       396
         rec.motorcycles       0.88      0.66      0.76       398
      rec.sport.baseball       0.27      0.37      0.31       397
        rec.sport.hockey       0.48      0.67      0.56       399
               sci.crypt       0.77      0.58      0.66       396
         sci.electronics       0.25      0.15      0.19       393
                 sci.med       0.16      0.33      0.22       396
         

#### 6.2 Decision Tree using Count Vectorizer

In [88]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth=50, splitter = 'random',min_samples_split=2, max_leaf_nodes = 80)
clf_entropy.fit(train_cv, newsgroups_train.target)
test_pred = clf_entropy.predict(test_cv)
train_pred = clf_entropy.predict(train_cv)

In [89]:
# Performance Metrics for train data
print("train data: Decision Tree Accuracy Score -> ",accuracy_score(newsgroups_train.target, train_pred)*100)
print("train data: Decision Tree precision Score -> ",precision_score(newsgroups_train.target, train_pred,average='macro')*100)
print("train data: Decision Tree recall Score -> ",recall_score(newsgroups_train.target, train_pred,average='macro')*100)
print("train data: Decision Tree f1 Score -> ",f1_score(newsgroups_train.target, train_pred, average='macro')*100)

train data: Decision Tree Accuracy Score ->  48.70072476577691
train data: Decision Tree precision Score ->  56.74709217035879
train data: Decision Tree recall Score ->  47.669401744842524
train data: Decision Tree f1 Score ->  48.68271828829475


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [90]:
# Performance Metrics for train data
print("test data: Decision Tree Accuracy Score -> ",accuracy_score(newsgroups_test.target, test_pred)*100)
print("test data: Decision Tree precision Score -> ",precision_score(newsgroups_test.target, test_pred,average='macro')*100)
print("test data: Decision Tree recall Score -> ",recall_score(newsgroups_test.target, test_pred,average='macro')*100)
print("test data: Decision Tree f1 Score -> ",f1_score(newsgroups_test.target, test_pred, average='macro')*100)

test data: Decision Tree Accuracy Score ->  42.43228890069039
test data: Decision Tree precision Score ->  50.99974857518886
test data: Decision Tree recall Score ->  41.275111269473335
test data: Decision Tree f1 Score ->  42.80237006891186


In [91]:
print(metrics.classification_report(newsgroups_test.target, test_pred, target_names=newsgroups_test.target_names))
print("Accuracy of the model=",metrics.accuracy_score(newsgroups_test.target, test_pred ))

                          precision    recall  f1-score   support

             alt.atheism       0.40      0.16      0.23       319
           comp.graphics       0.41      0.14      0.21       389
 comp.os.ms-windows.misc       0.45      0.61      0.52       394
comp.sys.ibm.pc.hardware       0.23      0.34      0.28       392
   comp.sys.mac.hardware       0.58      0.44      0.50       385
          comp.windows.x       0.77      0.12      0.21       395
            misc.forsale       0.72      0.59      0.65       390
               rec.autos       0.79      0.53      0.63       396
         rec.motorcycles       0.89      0.70      0.78       398
      rec.sport.baseball       0.47      0.51      0.49       397
        rec.sport.hockey       0.84      0.41      0.55       399
               sci.crypt       0.76      0.59      0.66       396
         sci.electronics       0.10      0.33      0.16       393
                 sci.med       0.17      0.23      0.20       396
         

  'precision', 'predicted', average, warn_for)


#### Conclusion:
Decision Tree does not perform well in this case as being a tree classification method it might need several nodes which can be hard to find for text classification problems. However, Decision trees perform well for structured data.

Among various machine learning models Naive Bayes Classifier using TFIDF performs best closely followed by SGD Classifier.