In [65]:
# Importing libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     learning_curve)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
np.random.seed(42)

In [66]:
# Creating a list of stopwords
stopwords_list = list(stopwords.words('english'))
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name):
    """Function to show to different evaluation score of the model passed
    on the test set.
    
    Parameters:
    -----------
    model: scikit-learn object
        The model whose scores are to be shown.
    test_set: pandas dataframe
        The dataset on which the score of the model is to be shown.
    model_name: string
        The name of the model.
    """
    y_pred = model.predict(test_set['news'])
    y_true = test_set['label']
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    
    print('Report for ---> {}'.format(model_name))
    print('Accuracy is: {}'.format(accuracy))
    print('F1 score is: {}'.format(f1))
    print('Precision score is: {}'.format(precision))
    print('Recall score is: {}'.format(recall))

In [4]:
# Importing the datasets
train_data = pd.read_csv('./datasets/train.csv')
valid_data = pd.read_csv('./datasets/valid.csv')
test_data = pd.read_csv('./datasets/test.csv')

Viewing random rows of all the datasets

In [5]:
train_data.sample(5)

Unnamed: 0,label,news
3842,True,Polling shows that nearly 74 percent of Nation...
6480,False,I left the city with $43 million in the bank.
4521,False,Says she couldn't take stimulus money because ...
4026,True,The United States is the only industrialized c...
10111,False,The Health Care and Education Reconciliation A...


In [6]:
valid_data.sample(5)

Unnamed: 0,label,news
824,True,Al-Qaida has grown fourfold in five years.
548,True,"Under the clear letter of the law, (Justice Cl..."
870,True,"For immigrants with visa overstays, we make no..."
1047,True,The governors budget proposal reduces the stat...
1155,True,Says the director of NASA says its main missio...


In [7]:
test_data.sample(5)

Unnamed: 0,label,news
38,True,"The Fed created $1.2 trillion out of nothing, ..."
734,True,Says Rick Scott stripped women of access to pu...
138,True,Says NFL Commissioner Roger Goodell interviewe...
128,True,The federal government reviewed and verified h...
700,True,"In 1981, Matagorda, Brazoria, and Galveston Co..."


In [8]:
print('Train dataset size: {}'.format(train_data.shape))
print('Valid dataset size: {}'.format(valid_data.shape))
print('Test dataset size: {}'.format(test_data.shape))

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


Combining train_data and valid_data into a single training set as GridSearchCV with 5 fold cross validation will be used for hyperparameter tuning the different models

In [9]:
training_set = pd.concat([train_data, valid_data], ignore_index=True)
print('Training set size: {}'.format(training_set.shape))
training_set.sample(5)

Training set size: (11524, 2)


Unnamed: 0,label,news
493,True,Says President Obama has cracked down on emplo...
9003,False,It is truethat we know that ISIS is present in...
9892,False,State budget cuts for local schools resulted i...
4184,True,"Under the presidents plan, he cuts Medicare by..."
8571,False,The last time there was a sustained surge of c...


Creating a CountVectorizer object and analyzing the training set

In [10]:
countV = CountVectorizer()
train_count = countV.fit_transform(training_set['news'].values)

In [11]:
countV.vocabulary_

{'says': 10222,
 'the': 11608,
 'annies': 1100,
 'list': 7016,
 'political': 8858,
 'group': 5398,
 'supports': 11314,
 'third': 11656,
 'trimester': 11927,
 'abortions': 648,
 'on': 8170,
 'demand': 3462,
 'when': 12602,
 'did': 3629,
 'decline': 3365,
 'of': 8112,
 'coal': 2614,
 'start': 11014,
 'it': 6393,
 'started': 11015,
 'natural': 7848,
 'gas': 5124,
 'took': 11762,
 'off': 8114,
 'that': 11596,
 'to': 11732,
 'begin': 1619,
 'in': 6000,
 'president': 9044,
 'george': 5178,
 'bushs': 2071,
 'administration': 788,
 'hillary': 5695,
 'clinton': 2580,
 'agrees': 895,
 'with': 12698,
 'john': 6482,
 'mccain': 7366,
 'by': 2096,
 'voting': 12411,
 'give': 5226,
 'bush': 2069,
 'benefit': 1658,
 'doubt': 3892,
 'iran': 6335,
 'health': 5600,
 'care': 2214,
 'reform': 9614,
 'legislation': 6874,
 'is': 6352,
 'likely': 6980,
 'mandate': 7221,
 'free': 4993,
 'sex': 10485,
 'change': 2382,
 'surgeries': 11326,
 'economic': 4058,
 'turnaround': 11987,
 'at': 1328,
 'end': 4212,
 'my':

In [12]:
len(countV.get_feature_names())

12872

#### Building and tuning Logistic Regression pipeline 

In [74]:
# lr_pipeline = Pipeline([
#     ('lrCV', CountVectorizer(stop_words=stopwords_list)),
#     ('lr_clf', LogisticRegression(random_state=42, n_jobs=-1))
# ])

In [75]:
# param_grid = [
#     {
#         'lrCV__lowercase': [True, False],
#         'lrCV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'lr_clf__C': [0.0001, 0.00005, 0.00001]
#     }
# ]

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# lr_gs.fit(training_set['news'], training_set['label'])

In [76]:
# lr_gs.best_params_

In [77]:
# lr_gs.best_score_

In [78]:
lr_pipeline = Pipeline([
    ('lrCV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 1))),
    ('lr_clf', LogisticRegression(C=0.0001,random_state=42, n_jobs=-1))
])

In [79]:
lr_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('lrCV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me',..., penalty='l2', random_state=42,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [80]:
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression Count Vectorizer')

Report for ---> Logistic Regression Count Vectorizer
Accuracy is: 0.56353591160221
F1 score is: 0.7208480565371024
Precision score is: 0.56353591160221
Recall score is: 1.0


#### Building and tuning Naive Bayes pipeline

In [92]:
# nb_pipeline = Pipeline([
#     ('nb_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('nb_clf', MultinomialNB())
# ])

In [93]:
# param_grid = {
#     'nb_clf__alpha': [i/10.0 for i in range(60, 71)],
#     'nb_CV__lowercase': [True, False],
#     'nb_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring = 'f1', cv=5, n_jobs=-1, verbose=1)
# nb_gs.fit(training_set['news'], training_set['label'])

In [94]:
# nb_gs.best_params_

In [95]:
# nb_gs.best_score_

In [96]:
nb_pipeline = Pipeline([
    ('nb_CV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 4))),
    ('nb_clf', MultinomialNB(alpha=6.8))
])

In [97]:
nb_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('nb_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), preprocessor=None,
        stop_words=['i', 'me'...zer=None, vocabulary=None)), ('nb_clf', MultinomialNB(alpha=6.8, class_prior=None, fit_prior=True))])

In [98]:
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes Count Vectorizer')

Report for ---> Naive Bayes Count Vectorizer
Accuracy is: 0.6203630623520127
F1 score is: 0.7326292384658143
Precision score is: 0.6073732718894009
Recall score is: 0.9229691876750701


#### Building and tuning SVM classifier pipeline

In [103]:
# svm_pipeline = Pipeline([
#     ('svm_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('svm_clf', SVC(random_state=42))
# ])

In [104]:
# param_grid = [
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['poly'],
#         'svm_clf__degree': [1, 2, 3]
#     },
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['rbf'],
#         'svm_clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
#     }
# ]

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set['news'], training_set['label'])

In [105]:
# svm_gs.best_params_

In [106]:
# svm_gs.best_score_

In [107]:
svm_pipeline = Pipeline([
    ('svm_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('svm_clf', SVC(random_state=42, gamma=1.0, kernel='rbf'))
])

In [108]:
svm_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('svm_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'm...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

In [109]:
show_eval_scores(svm_pipeline, test_data, 'SVM Classifier Count Vectorizer')

Report for ---> SVM Classifier Count Vectorizer
Accuracy is: 0.5666929755327546
F1 score is: 0.7211782630777045
Precision score is: 0.5657370517928287
Recall score is: 0.9943977591036415


#### Building and Tuning Random Forest Classifier pipeline 

In [117]:
# rf_pipeline = Pipeline([
#     ('rf_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('rf_clf', RandomForestClassifier(n_jobs=-1, random_state=42))
# ])

In [118]:
# param_grid = {
#     'rf_CV__lowercase': [True, False],
#     'rf_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'rf_clf__n_estimators': [200, 300, 400, 500],
#     'rf_clf__max_depth': [i for i in range(8, 13)],
#     'rf_clf__max_features': ['auto', 'sqrt', 'log2']
# }
# rf_gs = GridSearchCV(rf_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# rf_gs.fit(training_set['news'], training_set['label'])

In [119]:
# rf_gs.best_params_

In [120]:
# rf_gs.best_score_

In [114]:
rf_pipeline = Pipeline([
    ('rf_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('rf_clf', RandomForestClassifier(max_depth=12, n_estimators=300, n_jobs=-1, random_state=42))
])

In [115]:
rf_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('rf_CV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me...imators=300, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [116]:
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier Count Vectorizer')

Report for ---> Random Forest Classifier Count Vectorizer
Accuracy is: 0.5651144435674822
F1 score is: 0.7215765538150581
Precision score is: 0.5644268774703557
Recall score is: 1.0


#### Building a Voting Classifier using the above created models 

In [121]:
rf_voting_pipeline = Pipeline([
    ('rf_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('rf_clf', RandomForestClassifier(max_depth=12, n_estimators=300, n_jobs=-1, random_state=42))
])

In [128]:
svm_voting_pipeline = Pipeline([
    ('svm_CV', CountVectorizer(stop_words=stopwords_list, lowercase=False, ngram_range=(1, 1))),
    ('svm_clf', SVC(random_state=42, gamma=1.0, kernel='rbf', probability=True))
])

In [123]:
nb_voting_pipeline = Pipeline([
    ('nb_CV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 4))),
    ('nb_clf', MultinomialNB(alpha=6.8))
])

In [124]:
lr_voting_pipeline = Pipeline([
    ('lrCV', CountVectorizer(stop_words=stopwords_list, lowercase=True, ngram_range=(1, 1))),
    ('lr_clf', LogisticRegression(C=0.0001,random_state=42, n_jobs=-1))
])

In [129]:
voting_classifier = VotingClassifier(estimators=[
    ('lr', lr_voting_pipeline), ('nb', nb_voting_pipeline),
    ('svm', svm_voting_pipeline), ('rf', rf_voting_pipeline)], voting='soft', n_jobs=-1)

In [130]:
voting_classifier.fit(training_set['news'], training_set['label'])

VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('lrCV', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), pre...tors=300, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [135]:
show_eval_scores(voting_classifier, test_data, 'Voting Classifier(soft) Count Vectorizer')

Report for ---> Voting Classifier(soft) Count Vectorizer
Accuracy is: 0.6045777426992897
F1 score is: 0.7293354943273906
Precision score is: 0.5936675461741425
Recall score is: 0.9453781512605042


#### Saving the voting classifier model for future use

In [138]:
pickle.dump(voting_classifier, open(os.path.join('./models', 'voting_classifier_count_vectorizer.pkl'), 'wb'))