In [66]:
# Importing the libraries
import os
import pickle
import re
import string
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
warnings.filterwarnings('ignore')

In [67]:
np.random.seed(42)

In [68]:
# Creating list of english stopwords
stopwords_list = list(stopwords.words('english'))
stopwords_list

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [69]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name):
    """Function to show to different evaluation score of the model passed
    on the test set.
    
    Parameters:
    -----------
    model: scikit-learn object
        The model whose scores are to be shown.
    test_set: pandas dataframe
        The dataset on which the score of the model is to be shown.
    model_name: string
        The name of the model.
    """
    y_pred = model.predict(test_set)
    y_true = test_set['label']
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print('Report for ---> {}'.format(model_name))
    print('Accuracy is: {}'.format(accuracy))
    print('F1 score is: {}'.format(f1))
    print('Precision score is: {}'.format(precision))
    print('Recall score is: {}'.format(recall))

In [70]:
# Importing the datasets
train_data = pd.read_csv('../datasets/train.csv')
valid_data = pd.read_csv('../datasets/valid.csv')
test_data = pd.read_csv('../datasets/test.csv')

Viewing random samples of all datasets

In [71]:
train_data.sample(5)

Unnamed: 0,label,news
3842,True,Polling shows that nearly 74 percent of Nation...
6480,False,I left the city with $43 million in the bank.
4521,False,Says she couldn't take stimulus money because ...
4026,True,The United States is the only industrialized c...
10111,False,The Health Care and Education Reconciliation A...


In [72]:
valid_data.sample(5)

Unnamed: 0,label,news
824,True,Al-Qaida has grown fourfold in five years.
548,True,"Under the clear letter of the law, (Justice Cl..."
870,True,"For immigrants with visa overstays, we make no..."
1047,True,The governors budget proposal reduces the stat...
1155,True,Says the director of NASA says its main missio...


In [73]:
test_data.sample(5)

Unnamed: 0,label,news
38,True,"The Fed created $1.2 trillion out of nothing, ..."
734,True,Says Rick Scott stripped women of access to pu...
138,True,Says NFL Commissioner Roger Goodell interviewe...
128,True,The federal government reviewed and verified h...
700,True,"In 1981, Matagorda, Brazoria, and Galveston Co..."


In [74]:
print('Train dataset size: {}'.format(train_data.shape))
print('Valid dataset size: {}'.format(valid_data.shape))
print('Test dataset size: {}'.format(test_data.shape))

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


Combining train_data and valid_data into a single training set as GridSearchCV with 5 fold cross validation will be used for hyperparameter tuning the different models

In [75]:
training_set = pd.concat([train_data, valid_data], ignore_index=True)
print('Training set size: {}'.format(training_set.shape))
training_set.sample(5)

Training set size: (11524, 2)


Unnamed: 0,label,news
493,True,Says President Obama has cracked down on emplo...
9003,False,It is truethat we know that ISIS is present in...
9892,False,State budget cuts for local schools resulted i...
4184,True,"Under the presidents plan, he cuts Medicare by..."
8571,False,The last time there was a sustained surge of c...


Creating a TfidfVectorizer object and analyzing the training set

In [76]:
tfidf_V = TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True)
train_count = tfidf_V.fit_transform(training_set['news'].values)

In [77]:
tfidf_V.vocabulary_

{'says': 10132,
 'annies': 1091,
 'list': 6952,
 'political': 8770,
 'group': 5358,
 'supports': 11218,
 'third': 11550,
 'trimester': 11816,
 'abortions': 648,
 'demand': 3436,
 'decline': 3339,
 'coal': 2589,
 'start': 10919,
 'started': 10920,
 'natural': 7777,
 'gas': 5084,
 'took': 11651,
 'begin': 1601,
 'president': 8956,
 'george': 5138,
 'bushs': 2049,
 'administration': 786,
 'hillary': 5644,
 'clinton': 2555,
 'agrees': 890,
 'john': 6419,
 'mccain': 7300,
 'voting': 12295,
 'give': 5186,
 'bush': 2047,
 'benefit': 1638,
 'doubt': 3859,
 'iran': 6277,
 'health': 5553,
 'care': 2189,
 'reform': 9525,
 'legislation': 6810,
 'likely': 6916,
 'mandate': 7155,
 'free': 4955,
 'sex': 10395,
 'change': 2357,
 'surgeries': 11230,
 'economic': 4022,
 'turnaround': 11876,
 'end': 4176,
 'term': 11455,
 'chicago': 2416,
 'bears': 1575,
 'starting': 10922,
 'quarterbacks': 9256,
 'last': 6718,
 '10': 21,
 'years': 12678,
 'total': 11672,
 'number': 7959,
 'tenured': 11454,
 'uw': 12130,

In [78]:
len(tfidf_V.get_feature_names_out())


12735

#### Importing the dataset containing the polarity and subjectivity of the datasets 

In [79]:
train_pol_sub = pd.read_csv('../datasets/train_pol_sub.csv')
valid_pol_sub = pd.read_csv('../datasets/valid_pol_sub.csv')
test_pol_sub = pd.read_csv('../datasets/test_pol_sub.csv')

Viewing random samples

In [80]:
train_pol_sub.sample(5)

Unnamed: 0,polarity,subjectivity
8727,-0.1,0.433
6578,0.0,0.0
8904,0.417,0.583
2809,0.0,0.75
2705,0.0,0.0


In [81]:
valid_pol_sub.sample(5)

Unnamed: 0,polarity,subjectivity
1008,0.188,0.313
120,0.0,0.0
1211,0.0,0.0
484,-0.063,0.375
138,0.175,0.45


In [82]:
test_pol_sub.sample(5)

Unnamed: 0,polarity,subjectivity
879,0.15,0.05
658,0.0,0.0
1178,0.054,0.637
590,-0.125,0.375
1117,0.167,0.333


In [83]:
print('Train dataset for polarity and subjectivity size: {}'.format(train_pol_sub.shape))
print('Valid dataset for polarity and subjectivity size: {}'.format(valid_pol_sub.shape))
print('Test dataset for polarity and subjectivity size: {}'.format(test_pol_sub.shape))

Train dataset for polarity and subjectivity size: (10240, 2)
Valid dataset for polarity and subjectivity size: (1284, 2)
Test dataset for polarity and subjectivity size: (1267, 2)


Combining train_pol_sub and valid_pol_sub into a single training set as GridSearchCV with 5 fold cross validation will be used for hyperparameter tuning the different models

In [84]:
training_set_pol_sub = pd.concat([train_pol_sub, valid_pol_sub], ignore_index=True)
training_set = pd.concat([training_set, training_set_pol_sub], axis=1)
test_set = pd.concat([test_data, test_pol_sub], axis=1)

In [85]:
print('Training set size: {}'.format(training_set.shape))
training_set.head()

Training set size: (11524, 4)


Unnamed: 0,label,news,polarity,subjectivity
0,False,Says the Annies List political group supports ...,0.0,0.1
1,True,When did the decline of coal start? It started...,0.1,0.4
2,True,"Hillary Clinton agrees with John McCain ""by vo...",0.0,0.0
3,False,Health care reform legislation is likely to ma...,0.2,0.9
4,True,The economic turnaround started at the end of ...,0.2,0.2


In [86]:
print('Training set size: {}'.format(test_set.shape))
test_set.head()

Training set size: (1267, 4)


Unnamed: 0,label,news,polarity,subjectivity
0,True,Building a wall on the U.S.-Mexico border will...,0.0,0.0
1,False,Wisconsin is on pace to double the number of l...,0.0,0.0
2,False,Says John McCain has done nothing to help the ...,0.0,0.0
3,True,Suzanne Bonamici supports a plan that will cut...,0.0,0.0
4,False,When asked by a reporter whether hes at the ce...,-0.25,0.325


#### Creating custom transformer for creating meta data for the datasets 

In [87]:
class CreateMetaData(BaseEstimator, TransformerMixin):
    """Class to create meta data about the news. The following meta datas
    are created:
    1. polarity
    2. subjectivity
    3. number of capital letters
    4. number of words in news
    5. number of punctuations
    6. number of definite article (the)
    7. number of indefinite articles (a/an)
    8. ratio of capital letters to the number of words
    9. number of quotes in the sentence
    """
    
    def __init__(self, pol_sub=True):
        self.pol_sub = pol_sub
        
    def count_occurence_word(self, statement, words):
        """Function to count the occurence of the word in the statement.
        
        Parameters:
        -----------
        statement: string
            The statement in which the occurence of the word is to be
            counted.
            
        words: list
            The words whose occurence is to be counted.
            
        Returns:
        --------
        count: integer
            The number of occurences of word in the statement.
        """
        count = 0
        for word in words:
            count = count + re.sub(r'[%s]'%(string.punctuation), " ", statement).lower().split().count(word)
        return count
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):        
        number_capital_letter = X['news'].apply(lambda x: len(re.findall(r'[A-Z]', x)))
        number_of_words = X['news'].apply(lambda x: len(x.split()))
        number_of_punctuations = X['news'].apply(lambda x: len(re.findall('[%s]'%(string.punctuation), x)))
        number_of_definite = X['news'].apply(lambda x: self.count_occurence_word(x, ['the']))
        number_of_indefinite = X['news'].apply(lambda x: self.count_occurence_word(x, ['an', 'a']))
        ratio_capital_words = round(number_capital_letter / number_of_words, 3)
        number_of_quotes = (X['news'].str.count('"') + X['news'].str.count("'")) //2
        
        if self.pol_sub:
            return np.c_[number_capital_letter, number_of_words, number_of_punctuations, number_of_definite,
                     number_of_indefinite, ratio_capital_words, number_of_quotes, X['polarity'], 
                     X['subjectivity']]
        else:
            return np.c_[number_capital_letter, number_of_words, number_of_punctuations, number_of_definite,
                         number_of_indefinite, ratio_capital_words, number_of_quotes]            


In [88]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Class to select particular columns from the dataframe.
    """
    
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):   
        return X[self.keys]

In [89]:
class Converter(BaseEstimator, TransformerMixin):
    """Class to convert 2d output of ItemSelector to 1d
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.values.ravel()

#### Building and tuning Logistic Regression Pipeline 

In [90]:
# lr_pipeline = Pipeline([
#     ('lr_union', FeatureUnion([
#         ('meta_data', CreateMetaData()),
#         ('tfidf', Pipeline([
#             ('item_selector', ItemSelector(['news'])),
#             ('converter', Converter()),
#             ('TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True))
#         ]))
#     ])),
#     ('lr_clf', LogisticRegression(random_state=42, n_jobs=-1))
# ])

In [91]:
# param_grid = {
#     'lr_union__meta_data__pol_sub': [True, False],
#     'lr_union__tfidf__TF__lowercase': [True, False],
#     'lr_union__tfidf__TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'lr_clf__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# }

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# lr_gs.fit(training_set, training_set['label'])

In [92]:
# lr_gs.best_score_

In [93]:
# lr_gs.best_params_

In [94]:
lr_pipeline = Pipeline([
    ('lr_union', FeatureUnion([
        ('meta_data', CreateMetaData(pol_sub=False)),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('lr_clf', LogisticRegression(C=0.0001, random_state=42, n_jobs=-1))
])

In [95]:
lr_pipeline.fit(training_set, training_set['label'])

0,1,2
,steps,"[('lr_union', ...), ('lr_clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,False

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.0001
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [96]:
show_eval_scores(lr_pipeline, test_set, 'Logistic Regression TFIDF Vectorizer with meta data')

Report for ---> Logistic Regression TFIDF Vectorizer with meta data
Accuracy is: 0.5753749013417522
F1 score is: 0.7177334732423925
Precision score is: 0.5738255033557047
Recall score is: 0.957983193277311


#### Building and Tuning Naive Bayes pipeline 

In [97]:
# nb_pipeline = Pipeline([
#     ('nb_union', FeatureUnion([
#         ('meta_data', CreateMetaData(pol_sub=False)),
#         ('tfidf', Pipeline([
#             ('item_selector', ItemSelector(['news'])),
#             ('converter', Converter()),
#             ('TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True))
#         ]))
#     ])),
#     ('nb_clf', MultinomialNB())
# ])

In [98]:
# param_grid = {
#     'nb_union__tfidf__TF__lowercase': [True, False],
#     'nb_union__tfidf__TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'nb_clf__alpha': [i/10.0 for i in range(18, 32)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# nb_gs.fit(training_set, training_set['label'])

In [99]:
# nb_gs.best_score_

In [100]:
# nb_gs.best_params_

In [101]:
nb_pipeline = Pipeline([
    ('nb_union', FeatureUnion([
        ('meta_data', CreateMetaData(pol_sub=False)),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('nb_clf', MultinomialNB(alpha=1.9))
])

In [102]:
nb_pipeline.fit(training_set, training_set['label'])

0,1,2
,steps,"[('nb_union', ...), ('nb_clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,False

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.9
,force_alpha,True
,fit_prior,True
,class_prior,


In [103]:
show_eval_scores(nb_pipeline, test_set, 'Naive Bayes TFIDF Vectorizer with meta data')

Report for ---> Naive Bayes TFIDF Vectorizer with meta data
Accuracy is: 0.5951065509076559
F1 score is: 0.7249329758713137
Precision score is: 0.5873153779322329
Recall score is: 0.9467787114845938


#### Building and tuning SVM classifier 

In [104]:
# svm_pipeline = Pipeline([
#     ('svm_union', FeatureUnion([
#         ('meta_data', CreateMetaData()),
#         ('tfidf', Pipeline([
#             ('item_selector', ItemSelector(['news'])),
#             ('converter', Converter()),
#             ('TF', TfidfVectorizer(stop_words=stopwords_list, use_idf=True, smooth_idf=True))
#         ]))
#     ])),
#     ('svm_clf', SVC(kernel='rbf', random_state=42))
# ])

In [105]:
# param_grid = {
#     'svm_union__tfidf__TF__lowercase': [True, False],
#     'svm_union__meta_data__pol_sub': [True, False],
#     'svm_union__tfidf__TF__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'svm_clf__gamma': [i/10.0 for i in range(14, 18)]
# }

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set, training_set['label'])

In [106]:
# svm_gs.best_score_

In [107]:
# svm_gs.best_params_

In [108]:
svm_pipeline = Pipeline([
    ('svm_union', FeatureUnion([
        ('meta_data', CreateMetaData()),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 4), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('svm_clf', SVC(gamma=1.7, kernel='rbf', random_state=42))
])

In [109]:
svm_pipeline.fit(training_set, training_set['label'])

0,1,2
,steps,"[('svm_union', ...), ('svm_clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,True

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,1.7
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [110]:
show_eval_scores(svm_pipeline, test_set, 'SVM TFIDF Vectorizer with meta data')

Report for ---> SVM TFIDF Vectorizer with meta data
Accuracy is: 0.5666929755327546
F1 score is: 0.7203260315843097
Precision score is: 0.566052842273819
Recall score is: 0.9901960784313726


#### Building a Voting Classifier using the above created models

In [111]:
lr_voting_pipeline = Pipeline([
    ('lr_union', FeatureUnion([
        ('meta_data', CreateMetaData(pol_sub=False)),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('lr_clf', LogisticRegression(C=0.0001, random_state=42, n_jobs=-1))
])

In [112]:
nb_voting_pipeline = Pipeline([
    ('nb_union', FeatureUnion([
        ('meta_data', CreateMetaData(pol_sub=False)),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('nb_clf', MultinomialNB(alpha=1.9))
])

In [113]:
svm_voting_pipeline = Pipeline([
    ('svm_union', FeatureUnion([
        ('meta_data', CreateMetaData()),
        ('tfidf', Pipeline([
            ('item_selector', ItemSelector(['news'])),
            ('converter', Converter()),
            ('TF', TfidfVectorizer(lowercase=True, ngram_range=(1, 4), stop_words=stopwords_list, use_idf=True, smooth_idf=True))
        ]))
    ])),
    ('svm_clf', SVC(gamma=1.7, kernel='rbf', random_state=42, probability=True))
])

In [114]:
voting_classifier = VotingClassifier(estimators=[
    ('lr', lr_voting_pipeline), ('nb', nb_voting_pipeline),
    ('svm', svm_voting_pipeline)], voting='soft', n_jobs=-1)

In [115]:
voting_classifier.fit(training_set, training_set['label'])

0,1,2
,estimators,"[('lr', ...), ('nb', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,-1
,flatten_transform,True
,verbose,False

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,False

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.0001
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,False

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.9
,force_alpha,True
,fit_prior,True
,class_prior,

0,1,2
,transformer_list,"[('meta_data', ...), ('tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,pol_sub,True

0,1,2
,keys,['news']

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['a', 'about', ...]"
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,1.7
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [116]:
show_eval_scores(voting_classifier, test_set, 'Voting Classifier(soft) TFIDF Vectorizer')

Report for ---> Voting Classifier(soft) TFIDF Vectorizer
Accuracy is: 0.5832675611681136
F1 score is: 0.7232704402515723
Precision score is: 0.5778894472361809
Recall score is: 0.9663865546218487
