# Model Evaluation - Cross Validation

Create text classification models using logistic regression & Naive Bayes and evaluate the performance of those models through Cross Validation and Parameter Tuning to select the best model.

#### Data set

Download the data set and place it in the appropriate directory. The zip file of data set is available on university site - https://content.bellevue.edu/cst/dsc/550/id/data.zip.  The JSONL data files for this project are in reddit directory. Place the "controversial-comments.jsonl" and "categorized-comments.jsonl" in the directory.

#### Import libraries for dataset preparation, feature engineering, model training 

In [1]:
import pandas as pd
import string, numpy as np
import re

from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from  sklearn.linear_model import LogisticRegression
from sklearn import metrics

from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import FrenchStemmer

# for measuring accuracy, precision, recall, f1 and auc scores
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score

In [2]:
# load the dataset
df = pd.read_json('controversial-comments.jsonl', lines = 'True')
print(df.head())

   con                                                txt
0    0  Well it's great that he did something about th...
1    0                       You are right Mr. President.
2    0  You have given no input apart from saying I am...
3    0  I get the frustration but the reason they want...
4    0  I am far from an expert on TPP and I would ten...


In [3]:
print(len(df))
print("count of 0 contro :", len(df[df.con == 0]))
print("count of 1 contro :", len(df[df.con == 1]))

950000
count of 0 contro : 908145
count of 1 contro : 41855


In [4]:
#Cleaning text
def textcleaning(text):
    
    text = text.lower()
    #removing \n
    text = re.sub(r'\n', '', text)
    text = re.sub(r'@\w+', '', text)
    # removing urls
    text = re.sub(r'http.?://[^\s]+[\s]?', ' ', text)
    # removing symbols and numbers
    text = re.sub('[^a-zA-Z\s]', '', text)
    # removing 3 letter words
    text = re.sub(r'(\b\w{1,3}\b)', '', text) 
    
    return text

In [5]:
#Applying text cleaning on text field to clean it up
df['clndtxt'] = df['txt'].apply(textcleaning)

In [6]:
#from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = text.split()
    
    return [lemmatizer.lemmatize(w) for w in tokens]

In [7]:
df['lemmedtxt'] = df['clndtxt'].apply(lemmatize_text).apply(lambda x : " ".join(x))

In [8]:
df.head()

Unnamed: 0,con,txt,clndtxt,lemmedtxt
0,0,Well it's great that he did something about th...,well great that something about those belie...,well great that something about those belief w...
1,0,You are right Mr. President.,right president,right president
2,0,You have given no input apart from saying I am...,have given input apart from saying wrong ...,have given input apart from saying wrong have ...
3,0,I get the frustration but the reason they want...,frustration reason they want them that...,frustration reason they want them that because...
4,0,I am far from an expert on TPP and I would ten...,from expert would tend agree that ...,from expert would tend agree that problem unde...


In [9]:
# Replace the column to latest transformed values
df['clndtxt'] = df['lemmedtxt']
df.head()

Unnamed: 0,con,txt,clndtxt,lemmedtxt
0,0,Well it's great that he did something about th...,well great that something about those belief w...,well great that something about those belief w...
1,0,You are right Mr. President.,right president,right president
2,0,You have given no input apart from saying I am...,have given input apart from saying wrong have ...,have given input apart from saying wrong have ...
3,0,I get the frustration but the reason they want...,frustration reason they want them that because...,frustration reason they want them that because...
4,0,I am far from an expert on TPP and I would ten...,from expert would tend agree that problem unde...,from expert would tend agree that problem unde...


In [10]:
text, target = df['clndtxt'], df['con']

In [11]:
#split data into train and test
text_train, text_test, label_train, label_test = model_selection.train_test_split(df['clndtxt'], df['con'], 
                                                                                  test_size = 0.25)


In [12]:
# Checking lenghts of test and test
print("Number of observations in train", len(text_train))

print("Number of observations in train", len(text_test))


Number of observations in train 712500
Number of observations in train 237500


In [13]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', stop_words='english', lowercase=True, min_df = 2)

cvtrnsftexttrain = count_vect.fit_transform(text_train)
print(len(count_vect.get_feature_names())) 

79527


In [14]:
print(count_vect.fit_transform(text_train).shape)

(712500, 79527)


In [15]:
# converting testing text into matrix form for computation using transform
cvtrnsftexttest = count_vect.transform(text_test)
print(count_vect.transform(text_test).shape) 

(237500, 79527)


In [16]:
# Confirming shapes of test and train transformations
print(cvtrnsftexttrain.shape)
print(cvtrnsftexttest.shape)


(712500, 79527)
(237500, 79527)


# Logistic Regression

Logistict Regression with penalty = L1

In [17]:
# Creating instance for logistic regression with penalty L1
logregmodel = LogisticRegression(n_jobs=-1, penalty='l1', random_state=0) 


# Applying logistic regression to train data set
logregmodel.fit(cvtrnsftexttrain, label_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l1', random_state=0, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
# predict the labels on trained dataset
logregmodelpredtr = logregmodel.predict(cvtrnsftexttrain)

In [19]:
# predict the labels on test dataset
logregmodelpredtest = logregmodel.predict(cvtrnsftexttest)

In [20]:
# Testing total accurate predictions
print (np.sum(logregmodelpredtest == label_test)) 

226766


In [21]:
print (np.sum(logregmodelpredtest == label_test)/237500) 

0.9548042105263158


In [22]:
#Generate labelled performance metrics
print(metrics.classification_report(label_test, logregmodelpredtest)) 

              precision    recall  f1-score   support

           0       0.96      1.00      0.98    226968
           1       0.15      0.00      0.01     10532

   micro avg       0.95      0.95      0.95    237500
   macro avg       0.55      0.50      0.49    237500
weighted avg       0.92      0.95      0.93    237500



In [23]:
accuracy_train = metrics.accuracy_score(label_train, logregmodelpredtr)
accuracy_test = metrics.accuracy_score(label_test, logregmodelpredtest)
print(accuracy_train, accuracy_test) 

0.9564729824561403 0.9548042105263158


In [119]:
# create rows list for data manipulation
rows = []


In [120]:
def fn_metrics(actual_label, predicted_label):
    """
    function that takes acutal labels and predicted labels and returns
    accuracy, auc, precision, recall and f1 scores
    average = 'micro' - to return global metrics
    """
    accuracy = accuracy_score(actual_label, predicted_label)
    auc =  roc_auc_score(actual_label, predicted_label)
    precision = precision_score(actual_label, predicted_label, average = 'micro')
    recall = recall_score(actual_label, predicted_label,  average = 'micro')
    f1 = f1_score(actual_label, predicted_label, average = 'micro')
    
    return (accuracy, auc, precision, recall, f1)

In [121]:
#print(fn_metrics(label_test, logregmodelpredtest))

acc, auc, prec, recall, f1 = fn_metrics(label_test, logregmodelpredtest)

l1row= ['Logistic Regression (L1)', acc, auc, prec, recall, f1]
print(l1row)

rows.append(l1row)
                    
# ['Logistic Regression (L1)', 0.9548042105263158, 0.5015922168040325, 0.1541095890410959, 0.0042726927459172045, 0.008314855875831485]

['Logistic Regression (L1)', 0.9548042105263158, 0.5015922168040325, 0.9548042105263158, 0.9548042105263158, 0.9548042105263158]


# Logistic Regression for Penalty = L2

In [27]:
logregmodel2 = LogisticRegression(n_jobs=-1, penalty='l2', random_state=0, solver = 'sag')
logregmodel2.fit(cvtrnsftexttrain, label_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=0, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# predict the labels on trained dataset
logregmodelpredtr2 = logregmodel2.predict(cvtrnsftexttrain)

# predict the labels on test dataset
logregmodelpredtest2 = logregmodel2.predict(cvtrnsftexttest)

print(metrics.classification_report(label_test, logregmodelpredtest2)) #, target_names=docs_to_train.target_names))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98    226968
           1       0.24      0.00      0.00     10532

   micro avg       0.96      0.96      0.96    237500
   macro avg       0.60      0.50      0.49    237500
weighted avg       0.92      0.96      0.93    237500



In [29]:
accuracy_trainl2 = metrics.accuracy_score(label_train, logregmodelpredtr2)
accuracy_testl2 = metrics.accuracy_score(label_test, logregmodelpredtest2)
print(accuracy_trainl2, accuracy_testl2)

0.9559705263157895 0.9556


In [122]:
#print(fn_metrics(label_test, logregmodelpredtest))

acc, auc, prec, recall, f1 = fn_metrics(label_test, logregmodelpredtest2)

l2row= ['Logistic Regression (L2)', acc, auc, prec, recall, f1]
print(l2row)

rows.append(l2row)
# ['Logistic Regression (L2)', 0.9556, 0.5002429900623745, 0.24, 0.000569692366122294, 0.0011366865586814436]

['Logistic Regression (L2)', 0.9556, 0.5002429900623745, 0.9556, 0.9556, 0.9556]


# Using Naive Bayes Algorithm

In [31]:
#from sklearn.naive_bayes import MultinomialNB

Nb_cv = MultinomialNB()
Nb_cv.fit(cvtrnsftexttrain, label_train)

# predict the labels on trained dataset
NB_cv_predtr = Nb_cv.predict(cvtrnsftexttrain)

# predict the labels on test dataset
NB_cv_predtst = Nb_cv.predict(cvtrnsftexttest)

# Calculating Accuracy scores
Acc_NB_cv_predtr = metrics.accuracy_score(label_train, NB_cv_predtr)
Acc_NB_cv_predtst = metrics.accuracy_score(label_test, NB_cv_predtst)
print(Acc_NB_cv_predtr, Acc_NB_cv_predtst) 

# Testing total accurate predictions for test
print (np.sum(NB_cv_predtst == label_test)) 

0.9428533333333333 0.9420547368421053
223738


In [123]:
#print(fn_metrics(label_test, logregmodelpredtest))

acc, auc, prec, recall, f1 = fn_metrics(label_test, NB_cv_predtst)

nbrow= ['Naive Bayes', acc, auc, prec, recall, f1]
print(nbrow)

rows.append(nbrow)
# ['Naive Bayes', 0.9420547368421053, 0.5124417086564873, 0.10552027357107963, 0.041017850360805165, 0.05907288390537399]                 

['Naive Bayes', 0.9420547368421053, 0.5124417086564873, 0.9420547368421053, 0.9420547368421053, 0.9420547368421053]


# Creating data frame for reporting

In [124]:
# create data frame to hold results
modelperf_df = pd.DataFrame()
modelperf_df = pd.DataFrame(rows, columns = ['Model', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1'])
modelperf_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1
0,Logistic Regression (L1),0.954804,0.501592,0.954804,0.954804,0.954804
1,Logistic Regression (L2),0.9556,0.500243,0.9556,0.9556,0.9556
2,Naive Bayes,0.942055,0.512442,0.942055,0.942055,0.942055


# Observations:

Logisitc Regression penalty L2 has slightly better accuracy than other models

# b. Cross Validation

In [34]:
from sklearn.model_selection import KFold, cross_val_score

# Create k-Fold cross-validation 
kf3 = KFold(n_splits=3 # 3 fold cross validation
            , shuffle=True # to shuffle observations
            ,random_state=1)

#features = cvtrnsftexttrain #cvtrnsftexttest
#target = label_train #label_test

In [35]:

text, target = df['clndtxt'], df['con']

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', stop_words='english', lowercase=True, min_df = 2)

# Converting features into vectorized matrix for applying model
features = count_vect.fit_transform(text)
print(len(count_vect.get_feature_names())) 


96711


In [36]:
# Conduct k-fold cross-validation
cv_results = cross_val_score(logregmodel2, # Logistic Regression with penalty L2
                            features, # Feature matrix
                            target, # Target vector
                            cv=kf3, # Cross-validation technique
                            scoring="roc_auc", # Loss function
                            n_jobs=-1) # Use all CPU scores

In [37]:
print(cv_results)
# for test [0.5289055  0.53574204 0.60549728]
# for train [0.62508317 0.61615647 0.6207463 ]
# for entire data set [0.63102262 0.60787666 0.60858587]

[0.63102262 0.60787666 0.60858587]


In [38]:
print(cv_results.mean())

0.6158283839346127


# Observations

Cross validation improved AUC of Logistic Regression (L2) from  0.500668 to  0.615

# Multi Label Classification using Logistic Regression and  Naive Bayes

In [39]:
# load the dataset
catdf = pd.read_json('categorized-comments.jsonl', lines = 'True')
print(catdf.head())

      cat                                                txt
0  sports  Barely better than Gabbert? He was significant...
1  sports  Fuck the ducks and the Angels! But welcome to ...
2  sports  Should have drafted more WRs.\n\n- Matt Millen...
3  sports            [Done](https://i.imgur.com/2YZ90pm.jpg)
4  sports                                      No!! NOO!!!!!


In [40]:
catdf.count()

cat    2347476
txt    2347476
dtype: int64

In [41]:
# unique values for cat
catdf['cat'].unique()

array(['sports', 'science_and_technology', 'video_games', 'news'],
      dtype=object)

In [42]:
# Checking whether categorizations are balanced or not by checking counts for each categorization
catdf.groupby('cat').count()

Unnamed: 0_level_0,txt
cat,Unnamed: 1_level_1
news,408311
science_and_technology,158246
sports,775199
video_games,1005720


In [43]:
#Copying original data frame
catdf_orig = catdf

Looks like categorizations are imbalanced, so need to add logic to balance that by adding class_weight="balanced" parameter and as it is multi-classification add multi_class = 'multinomial' for logistic regression

In [44]:
# Converting "cat" column to numerical encoding
#from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

cat = catdf['cat']
catdf['cat'] = encoder.fit_transform(cat)
catdf.groupby('cat').count()


Unnamed: 0_level_0,txt
cat,Unnamed: 1_level_1
0,408311
1,158246
2,775199
3,1005720


In [45]:
catdf.head()

Unnamed: 0,cat,txt
0,2,Barely better than Gabbert? He was significant...
1,2,Fuck the ducks and the Angels! But welcome to ...
2,2,Should have drafted more WRs.\n\n- Matt Millen...
3,2,[Done](https://i.imgur.com/2YZ90pm.jpg)
4,2,No!! NOO!!!!!


Due to performance issues in handling 2.3M rows, using proportionate sampling to complete model fitting.

In [46]:
# Sampling huge data set and at the same time, keep the fraction of the category by using same samplng fraction across

catdf0 = catdf[catdf.cat == 0 ].sample(frac=0.05, random_state= 1)
#len(catdf0)
catdf1 = catdf[catdf.cat == 1 ].sample(frac=0.05, random_state= 1)
catdf2 = catdf[catdf.cat == 2 ].sample(frac=0.05, random_state= 1)
catdf3 = catdf[catdf.cat == 3 ].sample(frac=0.05, random_state= 1)


sampledcatdf = pd.concat([catdf0, catdf1, catdf2, catdf3])
len(sampledcatdf)


117374

In [47]:
sampledcatdf.groupby('cat').count()

Unnamed: 0_level_0,txt
cat,Unnamed: 1_level_1
0,20416
1,7912
2,38760
3,50286


In [48]:
#Cleaning text
def textcleaning(text):
    
    text = text.lower()
    #removing \n
    text = re.sub(r'\n', '', text)
    text = re.sub(r'@\w+', '', text)
    # removing urls
    text = re.sub(r'http.?://[^\s]+[\s]?', ' ', text)
    # removing symbols and numbers
    text = re.sub('[^a-zA-Z\s]', '', text)
    # removing 3 letter words
    text = re.sub(r'(\b\w{1,3}\b)', '', text) 
    
    return text

In [49]:
# Applying cleaning on sampled data set
sampledcatdf['clndtxt'] = sampledcatdf['txt'].apply(textcleaning)

In [125]:
sampledcatdf.head()

Unnamed: 0,cat,txt,clndtxt
1553277,0,"#""The Main Obstacle To A Stable And Just World...",main obstacle stable just world order un...
2070434,0,It might be you,might
1498674,0,His PR team is on it right now. Big time.,team right time
2244475,0,&gt; My point is that the West Bank is part of...,point that west bank part israel everyt...
2055758,0,Sadly someone who wishes to keep you in a stat...,sadly someone wishes keep state ignoranc...


In [50]:
#split data into train and test
cattext_train, cattext_test, catlabel_train, catlabel_test = model_selection.train_test_split(sampledcatdf['clndtxt']
                                                                                              ,sampledcatdf['cat'], 
                                                                                              test_size = 0.25, 
                                                                                              shuffle=True )

In [51]:
# Checking lenghts of test and test
print("Number of observations in train", len(cattext_train))

print("Number of observations in train", len(cattext_test))


Number of observations in train 88030
Number of observations in train 29344


In [52]:
# Creating count vectorizer objects
catcv_vect = CountVectorizer( analyzer='word', 
                                stop_words = 'english',  # removes english stop words
                                ngram_range=(2,3),       # ngrams - 2,3 
                                max_features=10000, # Had to restrict to 10000 features otherwise its running forever
                                lowercase = True,  
                                max_df = 0.5,  
                                min_df = 3)  

In [53]:
catcvtexttrain = catcv_vect.fit_transform(cattext_train) 
print(len(catcv_vect.get_feature_names()))

10000


In [54]:
print(catcv_vect.fit_transform(cattext_train).shape) #(176061, 34779) #(176061, 41095)

(88030, 10000)


In [55]:
# converting testing text into matric form for computation using transform
catcvtexttest = catcv_vect.transform(cattext_test)
print(catcv_vect.transform(cattext_test).shape) #(58687, 34779) # (11738, 10000)

(29344, 10000)


In [56]:
# Confirming shapes of test and train transformations
print(catcvtexttrain.shape)
print(catcvtexttest.shape)


(88030, 10000)
(29344, 10000)


# Logistric Refression for Categorization Data Set with Penalty L1

In [57]:
catlogregmodel = LogisticRegression(n_jobs=-1, penalty='l1', multi_class = 'multinomial', random_state=0,
                                    class_weight="balanced", solver = 'saga')  # solver="sag" for L2

# fitting logistic regression model to trained vectors
catlogregmodel.fit(catcvtexttrain, catlabel_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l1',
          random_state=0, solver='saga', tol=0.0001, verbose=0,
          warm_start=False)

In [149]:
# predict the labels on trained dataset
catlogregmodelpredtr = catlogregmodel.predict(catcvtexttrain)

In [150]:
# predict the labels on test dataset
catlogregmodelpredtest = catlogregmodel.predict(catcvtexttest)

In [151]:
# Testing total accurate predictions
print (np.sum(catlogregmodelpredtest == catlabel_test)) 

12194


In [152]:
#Generate labelled performance metrics

print(metrics.classification_report(catlabel_test, catlogregmodelpredtest)) 

              precision    recall  f1-score   support

           0       0.45      0.26      0.33      5197
           1       0.14      0.16      0.15      1941
           2       0.39      0.81      0.52      9654
           3       0.69      0.22      0.33     12552

   micro avg       0.42      0.42      0.42     29344
   macro avg       0.42      0.36      0.33     29344
weighted avg       0.51      0.42      0.38     29344



In [153]:
cataccuracy_train = metrics.accuracy_score(catlabel_train, catlogregmodelpredtr)
cataccuracy_test = metrics.accuracy_score(catlabel_test, catlogregmodelpredtest)
print(cataccuracy_train, cataccuracy_test) 
# 0.48422368009996875 0.3903561083659908

0.4643644212200386 0.4155534351145038


In [154]:
# creating a dummylist to hold model performance metrics
modelperfrows2 = []

In [155]:
def fn_multiclass_metrics(actual_label, predicted_label):
    """
    function that takes acutal labels and predicted labels and returns
    accuracy, auc, precision, recall and f1 scores
    average = 'weighted' for multi class classification
    """
    accuracy = accuracy_score(actual_label, predicted_label)
    precision = precision_score(actual_label, predicted_label, average = 'weighted')
    recall = recall_score(actual_label, predicted_label, average = 'weighted')
    f1 = f1_score(actual_label, predicted_label, average = 'weighted')

    return (accuracy, precision, recall, f1)

In [156]:
fn_multiclass_metrics(catlabel_test, catlogregmodelpredtest)


(0.4155534351145038,
 0.5132254291964905,
 0.4155534351145038,
 0.3814522093962927)

In [157]:
#print(fn_metrics(label_test, logregmodelpredtest))

acc, prec, recall, f1 = fn_multiclass_metrics(catlabel_test, catlogregmodelpredtest)

acc, prec, recall, f1



(0.4155534351145038,
 0.5132254291964905,
 0.4155534351145038,
 0.3814522093962927)

In [158]:
l1row= ['Logistic Regression (L1)', acc, prec, recall, f1]
print(l1row)

modelperfrows2.append(l1row)
                    

['Logistic Regression (L1)', 0.4155534351145038, 0.5132254291964905, 0.4155534351145038, 0.3814522093962927]


# Logistric Regression with L2 Pentality

In [63]:
# Creating logistic regression with penalty L2, multiclass & imbalanced classification 
catlogregmodel2 = LogisticRegression(n_jobs=-1, penalty='l2', multi_class = 'multinomial', 
                                     random_state=0, class_weight="balanced", solver = 'sag')



In [159]:
# fitting trained data to logistic regression with penalty L2, multiclass & imbalanced classification
catlogregmodel2.fit(catcvtexttrain, catlabel_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=-1, penalty='l2',
          random_state=0, solver='sag', tol=0.0001, verbose=0,
          warm_start=False)

In [160]:
# predict the labels on trained dataset
catlogregmodelpredtr2 = catlogregmodel2.predict(catcvtexttrain)

In [161]:
# predict the labels on test dataset
catlogregmodelpredtest2 = catlogregmodel2.predict(catcvtexttest)

In [162]:
# Testing total accurate predictions
print (np.sum(catlogregmodelpredtest2 == catlabel_test)) 

12353


In [163]:
#Generate labelled performance metrics

print(metrics.classification_report(catlabel_test, catlogregmodelpredtest2)) 

              precision    recall  f1-score   support

           0       0.45      0.26      0.33      5197
           1       0.14      0.17      0.15      1941
           2       0.39      0.80      0.52      9654
           3       0.69      0.24      0.35     12552

   micro avg       0.42      0.42      0.42     29344
   macro avg       0.42      0.37      0.34     29344
weighted avg       0.51      0.42      0.39     29344



In [164]:
cataccuracy_train2 = metrics.accuracy_score(catlabel_train, catlogregmodelpredtr2)
cataccuracy_test2 = metrics.accuracy_score(catlabel_test, catlogregmodelpredtest2)
print(cataccuracy_train2, cataccuracy_test2) 

0.4867090764512098 0.42097191930207195


In [165]:
acc, prec, recall, f1 = fn_multiclass_metrics(catlabel_test, catlogregmodelpredtest2)

acc, prec, recall, f1

(0.42097191930207195,
 0.5120788817372477,
 0.42097191930207195,
 0.39222068844269153)

In [166]:
# creating a list of accuracy, precision, recall, f1
l2row= ['Logistic Regression (L2)', acc, prec, recall, f1]
print(l2row)

# Appending logistic regression penalty 2 model metrics
modelperfrows2.append(l2row)

['Logistic Regression (L2)', 0.42097191930207195, 0.5120788817372477, 0.42097191930207195, 0.39222068844269153]


# Navie Bayes Algorithm for Multi Classification

In [99]:
catNb_cv = MultinomialNB(alpha = 1) # added alpha = 1 for laplace smoothing for multi-classification

catNb_cv.fit(catcvtexttrain, catlabel_train)

# predict the labels on trained dataset
catNb_cv_predtr = catNb_cv.predict(catcvtexttrain)

# predict the labels on test dataset
catNb_cv_predtst = catNb_cv.predict(catcvtexttest)

# Calculating Accuracy scores
Acc_catNb_cv_predtr = metrics.accuracy_score(catlabel_train, catNb_cv_predtr)
Acc_catNb_cv_predtst = metrics.accuracy_score(catlabel_test, catNb_cv_predtst)

print(Acc_catNb_cv_predtr, Acc_catNb_cv_predtst) 

# Testing total accurate predictions for test
print (np.sum(catNb_cv_predtst == catlabel_test)) 


0.5064523457912076 0.4685114503816794
13748


In [100]:
# Classification Report for test data set using Naive Bayes and CV (for vectorization) algorithms
print(metrics.classification_report(catlabel_test, catNb_cv_predtst)) 

              precision    recall  f1-score   support

           0       0.62      0.21      0.31      5197
           1       0.72      0.02      0.04      1941
           2       0.43      0.26      0.32      9654
           3       0.47      0.81      0.59     12552

   micro avg       0.47      0.47      0.47     29344
   macro avg       0.56      0.32      0.32     29344
weighted avg       0.50      0.47      0.42     29344



In [168]:
acc, prec, recall, f1 = fn_multiclass_metrics(catlabel_test, catNb_cv_predtst)

acc, prec, recall, f1


(0.4685114503816794,
 0.4985819173945918,
 0.4685114503816794,
 0.4166923428768707)

In [169]:
nbrow= ['Naive Bayes', acc, prec, recall, f1]
print(nbrow)

modelperfrows2.append(nbrow)
                    

['Naive Bayes', 0.4685114503816794, 0.4985819173945918, 0.4685114503816794, 0.4166923428768707]


In [170]:
modelperfrows2

[['Logistic Regression (L1)',
  0.4155534351145038,
  0.5132254291964905,
  0.4155534351145038,
  0.3814522093962927],
 ['Logistic Regression (L2)',
  0.42097191930207195,
  0.5120788817372477,
  0.42097191930207195,
  0.39222068844269153],
 ['Naive Bayes',
  0.4685114503816794,
  0.4985819173945918,
  0.4685114503816794,
  0.4166923428768707]]

# Creating data Frame for reporting

In [171]:
# create data frame to hold results
multiclass_modelperf_df = pd.DataFrame()
multiclass_modelperf_df = pd.DataFrame(modelperfrows2, columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])
multiclass_modelperf_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression (L1),0.415553,0.513225,0.415553,0.381452
1,Logistic Regression (L2),0.420972,0.512079,0.420972,0.392221
2,Naive Bayes,0.468511,0.498582,0.468511,0.416692


# Observations:

The lower accuracies for Multi-Classificaiton might have something to do 
    with sampling data set and restricting the maxfeatures to 10000.
    
Naive Bayes performed better than Logistic Regression in terms of both accuracy, recall and F1, but has lesser precision.

Logistic Regression with Penalty L2 has slightly better accuracy than Penalty L1

# b. Cross Validation

In [126]:
from sklearn.model_selection import KFold, cross_val_score

# Create k-Fold cross-validation 
kf3 = KFold(n_splits=3 # 3 fold cross validation
            , shuffle=True # to shuffle observations
            ,random_state=1)



In [128]:
text, target = sampledcatdf['clndtxt'], sampledcatdf['cat']

In [129]:

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', 
                                stop_words = 'english',  # removes english stop words
                                ngram_range=(2,3),       # ngrams - 2,3 
                                max_features=10000, 
                                lowercase = True,  
                                max_df = 0.5,  
                                min_df = 3)

# Converting features into vectorized matrix for applying model
features = count_vect.fit_transform(text)
print(len(count_vect.get_feature_names())) 

10000


In [130]:

# Conduct k-fold cross-validation
cv_results = cross_val_score(logregmodel2, # Logistic Regression with penalty L2
                            features, # Feature matrix
                            target, # Target vector
                            cv=kf3, # Cross-validation technique
                            scoring="f1_weighted", # for imbalanced multi-class returns average of all labels
                            n_jobs=-1) # Use all CPU scores

In [131]:
print(cv_results)

[0.39113744 0.39859671 0.39619027]


In [132]:
# the cross-validation score of the model’s F1 score using three-fold cross validation
print(cv_results.mean())

0.39530813887574134


# 3. Hyperparameter Selection

In [133]:
# Use scikit-learn’s GridSearchCV
from sklearn.model_selection import GridSearchCV

In [136]:
# Create logistic regression
logistic = LogisticRegression()

# Create range of candidate penalty hyperparameter values
penalty = ['l1', 'l2']

# Create range of candidate regularization hyperparameter values
C = np.logspace(0, 4, 10)

# Create dictionary hyperparameter candidates
hyperparameters = dict(C=C, penalty=penalty)

In [137]:
# Create grid search
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=0)

In [138]:
text, target = df['clndtxt'], df['con']

# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', stop_words='english', lowercase=True, min_df = 2)

# Converting features into vectorized matrix for applying model
features = count_vect.fit_transform(text)
print(len(count_vect.get_feature_names())) 

96711


In [139]:
# Fit grid search
best_model = gridsearch.fit(features, target)



In [140]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l1
Best C: 1.0


In [141]:
# Conduct nested cross-validation and outut the average score
cross_val_score(gridsearch, features, target).mean()



0.9547884215976842

In [142]:
# Creating data frame from cross validation of grid search with various parameters and results
results = pd.DataFrame(gridsearch.cv_results_)
results.head(10)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,94.417649,8.344085,0.128166,0.018739,1.0,l1,"{'C': 1.0, 'penalty': 'l1'}",0.9543,0.954742,0.954921,...,0.954766,0.000343,1,0.956432,0.956404,0.956399,0.956443,0.956389,0.956413,2.1e-05
1,375.220455,13.539467,0.105151,0.00863,1.0,l2,"{'C': 1.0, 'penalty': 'l2'}",0.954258,0.954568,0.954742,...,0.954618,0.000395,2,0.956588,0.956533,0.956629,0.956759,0.956593,0.956621,7.6e-05
2,88.468141,11.406862,0.102076,0.012127,2.78256,l1,"{'C': 2.7825594022071245, 'penalty': 'l1'}",0.950889,0.951253,0.951858,...,0.951601,0.000686,12,0.95968,0.959639,0.95962,0.959666,0.959518,0.959625,5.7e-05
3,389.289163,37.168921,0.115767,0.007892,2.78256,l2,"{'C': 2.7825594022071245, 'penalty': 'l2'}",0.9538,0.954179,0.954311,...,0.954162,0.000436,3,0.95698,0.956929,0.957253,0.957395,0.957176,0.957147,0.000172
4,116.183664,13.235475,0.106541,0.011664,7.74264,l1,"{'C': 7.742636826811269, 'penalty': 'l1'}",0.946668,0.947116,0.947953,...,0.947642,0.001019,13,0.962087,0.962047,0.961933,0.962254,0.961868,0.962038,0.000133
5,430.987193,29.21152,0.120428,0.020043,7.74264,l2,"{'C': 7.742636826811269, 'penalty': 'l2'}",0.953605,0.953826,0.953726,...,0.953844,0.000455,4,0.957213,0.957345,0.958109,0.95772,0.957645,0.957606,0.000313
6,123.808456,18.734265,0.111004,0.009997,21.5443,l1,"{'C': 21.544346900318832, 'penalty': 'l1'}",0.943895,0.944395,0.945442,...,0.945082,0.001179,14,0.962662,0.962632,0.962611,0.962812,0.962488,0.962641,0.000104
7,468.162305,47.013286,0.144834,0.023164,21.5443,l2,"{'C': 21.544346900318832, 'penalty': 'l2'}",0.953447,0.953963,0.953642,...,0.953699,0.000495,7,0.957408,0.957193,0.95815,0.958178,0.958079,0.957802,0.000416
8,154.357125,28.032086,0.125785,0.012787,59.9484,l1,"{'C': 59.94842503189409, 'penalty': 'l1'}",0.943047,0.943411,0.944416,...,0.944076,0.001139,15,0.962825,0.962805,0.962807,0.962992,0.962705,0.962827,9.3e-05
9,479.928193,23.774045,0.132133,0.027747,59.9484,l2,"{'C': 59.94842503189409, 'penalty': 'l2'}",0.953447,0.953863,0.953584,...,0.953704,0.000476,6,0.957353,0.957266,0.958197,0.958064,0.957984,0.957773,0.000386
