### Importing libraries

In [3]:
from pprint import pprint
from time import time
import logging
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import numpy as np
import gensim
import re
from sklearn.pipeline import Pipeline

### Loading 2 categories.

In [4]:
# Newsgroup categories to load from the training set
categories = ['alt.atheism', 'talk.religion.misc']

In [5]:
# Announce which newsgroup categories are about to be loaded
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetching and printing the data

In [6]:
# Fetch the training split, restricted to the selected categories
data = fetch_20newsgroups(categories=categories, subset='train')

# Report how many documents and categories were loaded (followed by a blank line)
print(f"{len(data.filenames)} documents\n{len(data.target_names)} categories\n")

# Display the loaded dataset object as the cell output
data

857 documents
2 categories



{'data': ['From: mangoe@cs.umd.edu (Charley Wingate)\nSubject: Benediktine Metaphysics\nLines: 24\n\nBenedikt Rosenau writes, with great authority:\n\n>     IF IT IS CONTRADICTORY IT CANNOT EXIST.\n\n"Contradictory" is a property of language.  If I correct this to\n\n\n      THINGS DEFINED BY CONTRADICTORY LANGUAGE DO NOT EXIST\n\nI will object to definitions as reality.  If you then amend it to\n\n      THINGS DESCRIBED BY CONTRADICTORY LANGUAGE DO NOT EXIST\n\nthen we\'ve come to something which is plainly false.  Failures in\ndescription are merely failures in description.\n\n(I\'m not an objectivist, remember.)\n\n\n-- \nC. Wingate        + "The peace of God, it is no peace,\n                  +    but strife closed in the sod.\nmangoe@cs.umd.edu +  Yet, brothers, pray for but one thing:\ntove!mangoe       +    the marv\'lous peace of God."\n',
  'Subject: Re: There must be a creator! (Maybe)\nFrom: halat@pooh.bears (Jim Halat)\nReply-To: halat@pooh.bears (Jim Halat)\nLines: 24\n\n

In [7]:
### Cleaning dataset.

def clean(data):
    """Normalise one raw document string.

    Every run of characters other than ASCII letters, digits, '@' and '.'
    is collapsed into a single space, and the result is lower-cased.

    Parameters
    ----------
    data : str
        The raw text of a single document.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    collapsed = re.sub(r'[^A-Za-z0-9@.]+', ' ', data)
    return collapsed.lower()

# Normalise every document in place in a single pass:
#   1. clean()                         -> strip unwanted characters and lower-case
#   2. gensim.utils.simple_preprocess  -> split the text into a list of tokens
#   3. ' '.join                        -> glue the tokens back into one string
for idx, raw_doc in enumerate(data.data):
    tokens = gensim.utils.simple_preprocess(clean(raw_doc))
    data.data[idx] = ' '.join(tokens)

In [8]:
# Display the dataset again so the cleaned documents can be inspected
data

{'data': ['from mangoe cs umd edu charley wingate subject benediktine metaphysics lines benedikt rosenau writes with great authority if it is contradictory it cannot exist contradictory is property of language if correct this to things defined by contradictory language do not exist will object to definitions as reality if you then amend it to things described by contradictory language do not exist then we ve come to something which is plainly false failures in description are merely failures in description not an objectivist remember wingate the peace of god it is no peace but strife closed in the sod mangoe cs umd edu yet brothers pray for but one thing tove mangoe the marv lous peace of god',
  'subject re there must be creator maybe from halat pooh bears jim halat reply to halat pooh bears jim halat lines in article ba drporter suvm syr edu drporter suvm syr edu brad porter writes science is wonderful at answering most of our questions not the type to question scientific findings ve

### Modelling

### Hyperparameter Tuning

In [26]:
# Search grids for GridSearchCV. Keys follow the Pipeline "<step>__<param>"
# convention: "vect__*" targets the vectorizer step, "clf__*" the classifier step.

# Logistic regression: the liblinear solver is used because it supports both
# the l1 and the l2 penalty.
par_lr = {'clf__penalty': ['l1', 'l2'], 'clf__C': [0.1, 1, 10], 'clf__solver': ['liblinear']}

# SVC. NOTE: gamma is only used by the 'poly' and 'rbf' kernels here, so the
# 'linear' candidates are refitted once per gamma value with identical results.
par_svc = {'clf__gamma': [1, 0.1, 0.01], 'clf__kernel': ['linear', 'poly', 'rbf']}

# Random forest. 'auto' is intentionally not listed for max_features: for
# RandomForestClassifier it was an alias of 'sqrt' (so it only duplicated
# candidates) and it is rejected by scikit-learn >= 1.3. 'log2' takes its place
# so the grid still compares two distinct feature-subsampling strategies.
par_rf = {'clf__n_estimators': [100, 300],
          'clf__max_features': ['sqrt', 'log2'],
          'clf__max_depth': [5, 10, None],
          'clf__min_samples_split': [2, 5, 10],
          'clf__min_samples_leaf': [1, 2, 4],
          'clf__bootstrap': [True, False]}

# Multinomial naive Bayes: additive smoothing strength.
par_nb = {"clf__alpha": (0.00001, 0.000001)}

# CountVectorizer options.
par_cv = {'vect__max_df': [0.5, 0.75, 1.0], 'vect__ngram_range': [(1, 1), (1, 2)], 'vect__stop_words': ['english'],
          'vect__analyzer': ['word']}

# TfidfVectorizer options (same search space as the CountVectorizer grid).
par_tfidv = {'vect__max_df': [0.5, 0.75, 1.0], 'vect__ngram_range': [(1, 1), (1, 2)], 'vect__stop_words': ['english'],
             'vect__analyzer': ['word']}

### Loading the models and vectorizers into a pipeline

In [27]:
# Vectorizers to try; par_vctr holds the matching search grid at the same index.
vctr = [
    CountVectorizer(),
    TfidfVectorizer(),
]
par_vctr = [
    par_cv,
    par_tfidv,
]

# Classifiers to try; par_clfr holds the matching search grid at the same index.
clfr = [
    MultinomialNB(),
    LogisticRegression(random_state=1),
    SVC(random_state=1),
    RandomForestClassifier(random_state=1),
]
par_clfr = [
    par_nb,
    par_lr,
    par_svc,
    par_rf,
]

### Applying GridSearch CV.

In [28]:
# Run a 5-fold grid search for every (vectorizer, classifier) pairing.
# Each estimator is walked together with its own parameter grid.
for vectorizer, vect_grid in zip(vctr, par_vctr):
    for classifier, clf_grid in zip(clfr, par_clfr):
        # Step names 'vect' and 'clf' must match the prefixes used in the grids
        pipeline = Pipeline([('vect', vectorizer), ('clf', classifier)])
        # Merge the vectorizer and classifier grids into one search space
        param_grid = {**vect_grid, **clf_grid}
        grid_search = GridSearchCV(pipeline, param_grid,
                                   cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(data.data, data.target)
        print(f'vectorizer: {vectorizer} \nmodel: {classifier}\n Best_parameters: {grid_search.best_params_}\nScore: {grid_search.best_score_}\n')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
vectorizer: CountVectorizer() 
model: MultinomialNB()
 Best_parameters: {'clf__alpha': 1e-05, 'vect__analyzer': 'word', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
Score: 0.9498232014143888

Fitting 5 folds for each of 36 candidates, totalling 180 fits
vectorizer: CountVectorizer() 
model: LogisticRegression(random_state=1)
 Best_parameters: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'vect__analyzer': 'word', 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
Score: 0.9544879640962872

Fitting 5 folds for each of 54 candidates, totalling 270 fits
vectorizer: CountVectorizer() 
model: SVC(random_state=1)
 Best_parameters: {'clf__gamma': 1, 'clf__kernel': 'linear', 'vect__analyzer': 'word', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}
Score: 0.9533387732898138

Fitting 5 folds for each of 1296 candidates

### We can observe that almost all combinations give very good results.

### The best result was given by the SVC model with the TF-IDF vectorizer, with a cross-validated accuracy score of 95.56%
