In [1]:
# Importing libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     learning_curve)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
np.random.seed(42)

In [3]:
# Creating a list of stopwords
# The NLTK stopwords corpus is not bundled with the nltk package: on a machine
# where it has never been downloaded, stopwords.words() raises LookupError.
# Fetch it once on demand so the notebook also runs in a fresh environment.
try:
    stopwords_list = list(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopwords_list = list(stopwords.words('english'))
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name):
    """Print accuracy, F1, precision and recall of a model on a dataset.

    Parameters:
    -----------
    model: scikit-learn object
        The model to evaluate; its ``predict`` method is called on the
        ``news`` column of ``test_set``.
    test_set: pandas dataframe
        The evaluation data. The ``news`` column is passed to the model and
        the ``label`` column is used as the ground truth.
    model_name: string
        The name of the model, printed in the report header.
    """
    predictions = model.predict(test_set['news'])
    targets = test_set['label']

    # Compute every score before printing anything, so a failing metric
    # never leaves a half-written report behind.
    report_lines = [
        template.format(scorer(targets, predictions))
        for template, scorer in (
            ('Accuracy is: {}', accuracy_score),
            ('F1 score is: {}', f1_score),
            ('Precision score is: {}', precision_score),
            ('Recall score is: {}', recall_score),
        )
    ]

    print('Report for ---> {}'.format(model_name))
    for line in report_lines:
        print(line)

In [5]:
# Read the train / validation / test splits from their CSV files
train_data, valid_data, test_data = map(
    pd.read_csv,
    ('./datasets/train.csv', './datasets/valid.csv', './datasets/test.csv'),
)

Viewing random rows of all the datasets

In [6]:
train_data.sample(5)

Unnamed: 0,label,news
3842,True,Polling shows that nearly 74 percent of Nation...
6480,False,I left the city with $43 million in the bank.
4521,False,Says she couldn't take stimulus money because ...
4026,True,The United States is the only industrialized c...
10111,False,The Health Care and Education Reconciliation A...


In [7]:
valid_data.sample(5)

Unnamed: 0,label,news
824,True,Al-Qaida has grown fourfold in five years.
548,True,"Under the clear letter of the law, (Justice Cl..."
870,True,"For immigrants with visa overstays, we make no..."
1047,True,The governors budget proposal reduces the stat...
1155,True,Says the director of NASA says its main missio...


In [8]:
test_data.sample(5)

Unnamed: 0,label,news
38,True,"The Fed created $1.2 trillion out of nothing, ..."
734,True,Says Rick Scott stripped women of access to pu...
138,True,Says NFL Commissioner Roger Goodell interviewe...
128,True,The federal government reviewed and verified h...
700,True,"In 1981, Matagorda, Brazoria, and Galveston Co..."


In [9]:
# One line per split: (rows, columns) of each dataframe
print(
    'Train dataset size: {}'.format(train_data.shape),
    'Valid dataset size: {}'.format(valid_data.shape),
    'Test dataset size: {}'.format(test_data.shape),
    sep='\n',
)

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


Combining train_data and valid_data into a single training set: the hyperparameters of the different models are tuned with GridSearchCV using 5-fold cross-validation, so a separate validation split is not needed.

In [10]:
# ignore_index=True discards the original row labels of both frames and
# renumbers the combined rows, so the two splits cannot share an index value.
training_set = pd.concat([train_data, valid_data], ignore_index=True)
print('Training set size: {}'.format(training_set.shape))
# Last expression of the cell: displays 5 randomly drawn rows as a spot check.
training_set.sample(5)

Training set size: (11524, 2)


Unnamed: 0,label,news
493,True,Says President Obama has cracked down on emplo...
9003,False,It is truethat we know that ISIS is present in...
9892,False,State budget cuts for local schools resulted i...
4184,True,"Under the presidents plan, he cuts Medicare by..."
8571,False,The last time there was a sustained surge of c...


Creating a TfidfVectorizer object and analyzing the training set

In [11]:
# Exploratory vectorizer, fitted once on the training text so its vocabulary
# can be inspected; the model pipelines further down build their own
# TfidfVectorizer instances and do not reuse this one.
tfidf_V = TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)
# fit_transform learns the vocabulary and returns the transformed documents.
# NOTE(review): despite its name, train_count comes from a TfidfVectorizer,
# so it holds TF-IDF weights rather than raw term counts.
train_count = tfidf_V.fit_transform(training_set['news'].values)

In [12]:
tfidf_V.vocabulary_

{'says': 10132,
 'annies': 1091,
 'list': 6952,
 'political': 8770,
 'group': 5358,
 'supports': 11218,
 'third': 11550,
 'trimester': 11816,
 'abortions': 648,
 'demand': 3436,
 'decline': 3339,
 'coal': 2589,
 'start': 10919,
 'started': 10920,
 'natural': 7777,
 'gas': 5084,
 'took': 11651,
 'begin': 1601,
 'president': 8956,
 'george': 5138,
 'bushs': 2049,
 'administration': 786,
 'hillary': 5644,
 'clinton': 2555,
 'agrees': 890,
 'john': 6419,
 'mccain': 7300,
 'voting': 12295,
 'give': 5186,
 'bush': 2047,
 'benefit': 1638,
 'doubt': 3859,
 'iran': 6277,
 'health': 5553,
 'care': 2189,
 'reform': 9525,
 'legislation': 6810,
 'likely': 6916,
 'mandate': 7155,
 'free': 4955,
 'sex': 10395,
 'change': 2357,
 'surgeries': 11230,
 'economic': 4022,
 'turnaround': 11876,
 'end': 4176,
 'term': 11455,
 'chicago': 2416,
 'bears': 1575,
 'starting': 10922,
 'quarterbacks': 9256,
 'last': 6718,
 '10': 21,
 'years': 12678,
 'total': 11672,
 'number': 7959,
 'tenured': 11454,
 'uw': 12130,

In [13]:
# Number of features learned by the vectorizer. vocabulary_ maps every learned
# term to its column index, so its length is the feature count. Unlike
# get_feature_names(), which newer scikit-learn releases removed in favour of
# get_feature_names_out(), this attribute is available in every release.
len(tfidf_V.vocabulary_)

12735

#### Building and tuning Logistic Regression pipeline 

In [21]:
lr_pipeline = Pipeline([
    ('lr_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('lr_clf', LogisticRegression(random_state=42, n_jobs=-1))
])

In [27]:
# param_grid = {
#     'lr_TF__lowercase': [True, False],
#     'lr_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'lr_clf__C': [i/10.0 for i in range(10, 21)]
# }

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# lr_gs.fit(training_set['news'], training_set['label'])

In [28]:
# lr_gs.best_score_

In [29]:
# lr_gs.best_params_

In [26]:
# Final Logistic Regression pipeline. This re-binds lr_pipeline, replacing the
# default-parameter pipeline defined in the earlier cell.
# NOTE(review): lowercase, ngram_range and C presumably come from the
# commented-out grid search above; its best_params_ output is not saved in
# the notebook, so confirm. ngram_range=(1, 5) and C=1.0 both sit on the edge
# of that search grid, which may mean the optimum lies outside it.
# NOTE(review): with lowercase=False, capitalised tokens such as 'The'
# presumably no longer match the lower-case stop-word list -- confirm this is
# intended.
lr_pipeline = Pipeline([
    ('lr_TF', TfidfVectorizer(lowercase=False, ngram_range=(1, 5), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('lr_clf', LogisticRegression(C=1.0, random_state=42, n_jobs=-1))
])

In [30]:
lr_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('lr_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True...  penalty='l2', random_state=42, solver='warn', tol=0.0001,
          verbose=0, warm_start=False))])

In [31]:
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression TFIDF Vectorizer')

Report for ---> Logistic Regression TFIDF Vectorizer
Accuracy is: 0.6314127861089187
F1 score is: 0.7143730886850153
Precision score is: 0.6340933767643865
Recall score is: 0.8179271708683473


#### Building and tuning Naive Bayes pipeline 

In [52]:
# nb_pipeline = Pipeline([
#     ('nb_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('nb_clf', MultinomialNB())
# ])

In [53]:
# param_grid = {
#     'nb_TF__lowercase': [True, False],
#     'nb_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'nb_clf__alpha': [i/10.0 for i in range(20, 31)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# nb_gs.fit(training_set['news'], training_set['label'])

In [54]:
# nb_gs.best_score_

In [55]:
# nb_gs.best_params_

In [46]:
# Final Multinomial Naive Bayes pipeline: unigram + bigram TF-IDF features
# feeding MultinomialNB with smoothing parameter alpha=2.0.
# NOTE(review): the parameters presumably come from the commented-out grid
# search above (its best_params_ output is not saved -- confirm). alpha=2.0 is
# the smallest value that grid tried (2.0 .. 3.0), so lower values were never
# evaluated.
nb_pipeline = Pipeline([
    ('nb_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('nb_clf', MultinomialNB(alpha=2.0))
])

In [47]:
nb_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('nb_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...,
        vocabulary=None)), ('nb_clf', MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True))])

In [48]:
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes TFIDF Vectorizer')

Report for ---> Naive Bayes TFIDF Vectorizer
Accuracy is: 0.6053670086819258
F1 score is: 0.732905982905983
Precision score is: 0.5924006908462867
Recall score is: 0.9607843137254902


#### Building and tuning SVM classifier pipeline

In [63]:
# svm_pipeline = Pipeline([
#     ('svm_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('svm_clf', SVC(random_state=42))
# ])

In [64]:
# param_grid = [
#     {
#         'svm_TF__lowercase': [True, False],
#         'svm_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#         'svm_clf__kernel': ['poly'],
#         'svm_clf__degree': [1, 2, 3]
#     },
#     {
#         'svm_TF__lowercase': [True, False],
#         'svm_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#         'svm_clf__kernel': ['rbf'],
#         'svm_clf__gamma': [i/100.0 for i in range(10, 21)]
#     }
# ]

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set['news'], training_set['label'])

In [65]:
# svm_gs.best_score_

In [66]:
# svm_gs.best_params_

In [67]:
# Final SVM pipeline: unigram + bigram TF-IDF features feeding an RBF-kernel
# SVC.
# NOTE(review): the parameters presumably come from the commented-out grid
# search above (its best_params_ output is not saved -- confirm). gamma=0.2 is
# the largest value that grid tried (0.10 .. 0.20), so larger values were
# never evaluated.
svm_pipeline = Pipeline([
    ('svm_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('svm_clf', SVC(gamma=0.2, kernel='rbf', random_state=42))
])

In [68]:
svm_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('svm_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))])

In [76]:
show_eval_scores(svm_pipeline, test_data, 'SVM Classifier TFIDF Vectorizer')

Report for ---> SVM Classifier TFIDF Vectorizer
Accuracy is: 0.6006314127861089
F1 score is: 0.7201327433628317
Precision score is: 0.5950639853747715
Recall score is: 0.9117647058823529


#### Building and tuning Random Forest Classifier pipeline 

In [84]:
# rf_pipeline = Pipeline([
#     ('rf_TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
#     ('rf_clf', RandomForestClassifier(random_state=42, n_jobs=-1))
# ])

In [85]:
# param_grid = {
#     'rf_TF__lowercase': [True, False],
#     'rf_TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'rf_clf__n_estimators': [100, 200, 300, 400, 500],
#     'rf_clf__max_depth': [i for i in range(8, 16)],
#     'rf_clf__max_features': ['auto', 'sqrt', 'log2']
# }

# rf_gs = GridSearchCV(rf_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# rf_gs.fit(training_set['news'], training_set['label'])

In [86]:
# rf_gs.best_score_

In [87]:
# rf_gs.best_params_

In [88]:
# Final Random Forest pipeline: unigram + bigram TF-IDF features feeding a
# 200-tree forest limited to depth 15. random_state pins the forest's
# randomness; n_jobs=-1 uses all available cores.
# NOTE(review): the parameters presumably come from the commented-out grid
# search above (its best_params_ output is not saved -- confirm). max_depth=15
# is the deepest value that grid tried (8 .. 15), and max_features, although
# searched, is left at its default here.
rf_pipeline = Pipeline([
    ('rf_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('rf_clf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1))
])

In [89]:
rf_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(memory=None,
     steps=[('rf_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...imators=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))])

In [90]:
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier TFIDF Vectorizer')

Report for ---> Random Forest Classifier TFIDF Vectorizer
Accuracy is: 0.5722178374112076
F1 score is: 0.7248730964467006
Precision score is: 0.5684713375796179
Recall score is: 1.0


#### Building a Voting Classifier using the above created models 

In [91]:
lr_voting_pipeline = Pipeline([
    ('lr_TF', TfidfVectorizer(lowercase=False, ngram_range=(1, 5), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('lr_clf', LogisticRegression(C=1.0, random_state=42, n_jobs=-1))
])

In [92]:
nb_voting_pipeline = Pipeline([
    ('nb_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('nb_clf', MultinomialNB(alpha=2.0))
])

In [93]:
# Same configuration as the stand-alone svm_pipeline, except probability=True.
# The ensemble below uses voting='soft', which combines predicted class
# probabilities, so the SVC must be able to produce probability estimates.
svm_voting_pipeline = Pipeline([
    ('svm_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('svm_clf', SVC(gamma=0.2, kernel='rbf', random_state=42, probability=True))
])

In [94]:
rf_voting_pipeline = Pipeline([
    ('rf_TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 2), stop_words=stopwords_list, use_idf=True, smooth_idf=True)),
    ('rf_clf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1))
])

In [95]:
# Ensemble of the four tuned pipelines. Each member is a complete
# vectorizer + classifier pipeline, so the ensemble takes raw text as input.
# voting='soft' combines the members' predicted class probabilities; no
# `weights` argument is passed, so all four members count equally.
voting_classifier = VotingClassifier(estimators=[
    ('lr', lr_voting_pipeline), ('nb', nb_voting_pipeline),
    ('svm', svm_voting_pipeline), ('rf', rf_voting_pipeline)], voting='soft', n_jobs=-1)

In [96]:
voting_classifier.fit(training_set['news'], training_set['label'])

VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('lr_TF', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5),...tors=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None)

In [97]:
show_eval_scores(voting_classifier, test_data, 'Voting Classifier(soft) TFIDF Vectorizer')

Report for ---> Voting Classifier(soft) TFIDF Vectorizer
Accuracy is: 0.6227308602999211
F1 score is: 0.723699421965318
Precision score is: 0.6161417322834646
Recall score is: 0.876750700280112


#### Saving the voting classifier for future use

In [98]:
# Persist the fitted ensemble so it can be reloaded later without retraining.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
os.makedirs('./models', exist_ok=True)  # open() does not create missing directories
# The context manager closes (and flushes) the file even if pickling fails;
# the original left the handle returned by open() unclosed.
with open(os.path.join('./models', 'voting_classifier_tfidf_vectorizer.pkl'), 'wb') as model_file:
    pickle.dump(voting_classifier, model_file)