In [58]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
import pickle
import scipy.sparse
import os
warnings.filterwarnings("ignore")

## Loading datasets from `analysis_no_features` notebook

In [59]:
# Read the pre-computed tf-idf matrices from disk (train first, then test)
tf_idf_vectors, X_test_tfidf = (
    scipy.sparse.load_npz(npz_path)
    for npz_path in (
        './datasets/train_sparse_matrix.npz',
        './datasets/test_sparse_matrix.npz',
    )
)

In [60]:
# Load the pickled list of tf-idf feature names, if it has been saved.
# NOTE: despite the ".txt" extension this file is read as a binary pickle.
# pickle.load can execute arbitrary code, so only load a file you produced
# yourself, never one from an untrusted source.
tfidf_feature_names = None  # default, so the name is always defined even when the file is absent
if os.path.isfile("tfidf_features.txt"):
    with open("tfidf_features.txt", 'rb') as f:
        tfidf_feature_names = pickle.load(f)
else:
    # Say so explicitly instead of failing later with a NameError far from the cause
    print("tfidf_features.txt not found; tfidf_feature_names set to None")

In [61]:
# Read the train/test tables (sentences plus their per-sentence columns) from CSV
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/wikihow_X_train.csv',
        './datasets/wikihow_X_test.csv',
    )
)

In [62]:
# Read the train/test labels; the CSV files are parsed without a header row
label_paths = ['./datasets/wikihow_y_train.csv', './datasets/wikihow_y_test.csv']
y_train, y_test = [pd.read_csv(label_path, header=None) for label_path in label_paths]

## Reconstructing Dataframes for analysis 
Let's add some features to see whether the model performs better. We start by adding the sentence-length feature to see how it affects performance.

In [63]:
# Keep only the sentence text and its length
X_train_ft = X_train.loc[:, ['sentence', 'sentence_len']]

In [64]:
# Copy the sentence lengths into a NumPy array and view them as a single column (n_samples, 1)
train_length_values = np.array(X_train_ft['sentence_len'].values)
sent_lengths = train_length_values[:, np.newaxis]

In [65]:
# Inspect the single-column array of sentence lengths
sent_lengths

array([[14],
       [37],
       [11],
       ...,
       [20],
       [16],
       [20]])

In [66]:
# Wrap the sentence-length column in a CSR matrix so it can be stacked next to the sparse tf-idf matrix
sparse_sent_lengths = scipy.sparse.csr_matrix(sent_lengths)

In [67]:
# Display the sparse matrix summary (shape, dtype, stored elements, format)
sparse_sent_lengths

<70000x1 sparse matrix of type '<class 'numpy.int64'>'
	with 70000 stored elements in Compressed Sparse Row format>

In [68]:
# Append the sentence-length column to the right of the tf-idf columns
train_blocks = [tf_idf_vectors, sparse_sent_lengths]
X_train_feats = scipy.sparse.hstack(train_blocks)

In [69]:
# Sanity check: the column count should be the tf-idf columns plus one sentence-length column
X_train_feats.shape

(70000, 95346)

In [71]:
# Repeat the column selection for the test set
X_test_ft = X_test.loc[:, ['sentence', 'sentence_len']]

In [72]:
# Copy the test sentence lengths into a NumPy array and view them as a single column (n_samples, 1)
test_length_values = np.array(X_test_ft['sentence_len'].values)
sent_lengths_test = test_length_values[:, np.newaxis]

In [73]:
# Wrap the test sentence-length column in a CSR matrix so it can be stacked next to the sparse test tf-idf matrix
sparse_sent_lengths_test = scipy.sparse.csr_matrix(sent_lengths_test)

In [74]:
# Append the sentence-length column to the right of the test tf-idf columns
test_blocks = [X_test_tfidf, sparse_sent_lengths_test]
X_test_feats = scipy.sparse.hstack(test_blocks)

In [75]:
# Sanity check: the test matrix must have the same number of columns as X_train_feats
X_test_feats.shape

(30000, 95346)

## Perform Cross-Validation with a Naive Bayes Classifier 

In [77]:
# Candidate smoothing strengths to compare during the grid search
param_grid = dict(alpha=(0.1, 1, 5, 10, 50))
# Base estimator; the search supplies the alpha values from the grid above
nb = MultinomialNB()

In [78]:
# 5-fold grid search over the alpha values in param_grid.
# No `scoring` is given, so candidates are ranked by the estimator's default score
# (accuracy for scikit-learn classifiers — worth keeping in mind given the class imbalance).
nb_CV = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5)

In [79]:
# Run the grid search on the tf-idf + sentence-length features.
# y_train was read with header=None and is therefore a one-column DataFrame;
# flatten it to the 1-D label array scikit-learn expects instead of relying on
# the implicit column-vector conversion (whose DataConversionWarning is hidden
# by the global warnings filter).
nb_CV.fit(X_train_feats, np.ravel(y_train))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None, param_grid={'alpha': (0.1, 1, 5, 10, 50)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [80]:
# Parameter setting that achieved the best mean cross-validated score
nb_CV.best_params_

{'alpha': 0.1}

## Classification using Naive Bayes with tf idf scores and sentence lengths

In [81]:
# alpha=0.1 is the value reported by nb_CV.best_params_ for these features;
# update it here if the grid search is re-run and selects a different value
nb_ft = MultinomialNB(alpha=0.1)

In [82]:
# Fit on the tf-idf + sentence-length features.
# y_train is a one-column DataFrame (read with header=None); pass it as a 1-D
# array so the fit does not depend on an implicit column-vector conversion.
nb_ft.fit(X_train_feats, np.ravel(y_train))

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [84]:
# Predict labels for the held-out test features
preds = nb_ft.predict(X_test_feats)

In [86]:
# Confusion matrix: rows are the true classes, columns the predicted classes
nb_ft_confusion = metrics.confusion_matrix(y_test, preds)
print(f'Confusion Matrix: \n {nb_ft_confusion}')

Confusion Matrix: 
 [[24669   291]
 [ 3159  1881]]


In [87]:
# Per-class precision / recall / F1 for the tf-idf + sentence-length model
nb_ft_report = metrics.classification_report(y_test, preds)
print(f'Classification Report: \n\n {nb_ft_report}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.89      0.99      0.93     24960
           1       0.87      0.37      0.52      5040

    accuracy                           0.89     30000
   macro avg       0.88      0.68      0.73     30000
weighted avg       0.88      0.89      0.87     30000



## Classification using Naive Bayes with BoW and sentence lengths

In [119]:
# Build a bag-of-words document-term matrix (unigram + bigram counts) from the
# training sentences. No cross-validation is performed in this cell.
# min_df=2 drops terms that occur in fewer than two sentences.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
X_train_dtm = vectorizer.fit_transform(X_train['sentence'])
# Report the number of documents straight from the sparse matrix.
# Densifying with .toarray() would allocate a 70000 x ~100k int64 array
# (tens of GB) just to take its length, and len() is not defined for sparse
# matrices, so read shape[0] instead.
X_train_dtm.shape[0]

70000

### Add features to X_train_dtm and X_test_dtm 

In [123]:
# Append the sentence-length column to the right of the BoW count columns
train_bow_blocks = [X_train_dtm, sparse_sent_lengths]
X_train_bow_feats = scipy.sparse.hstack(train_bow_blocks)
X_train_bow_feats

<70000x100223 sparse matrix of type '<class 'numpy.int64'>'
	with 1719852 stored elements in COOrdinate format>

In [121]:
# Transform the test sentences with the vocabulary already fitted on the training
# sentences (transform only, no re-fit), so train and test matrices share the same columns
X_test_dtm = vectorizer.transform(X_test['sentence'])
# Display the sparse matrix summary
X_test_dtm

<30000x100222 sparse matrix of type '<class 'numpy.int64'>'
	with 676598 stored elements in Compressed Sparse Row format>

In [125]:
# Append the sentence-length column to the right of the test BoW count columns
test_bow_blocks = [X_test_dtm, sparse_sent_lengths_test]
X_test_bow_feats = scipy.sparse.hstack(test_bow_blocks)
X_test_bow_feats

<30000x100223 sparse matrix of type '<class 'numpy.int64'>'
	with 706598 stored elements in COOrdinate format>

### Perform MultinomialNB Classification 

In [162]:
# alpha=0.1 is the value the grid search selected for the tf-idf + sentence-length
# features; it is reused here as-is (no separate search over the BoW features appears in this notebook)
nb_bow = MultinomialNB(alpha=0.1)

In [163]:
# Fit on the BoW + sentence-length features.
# y_train is a one-column DataFrame (read with header=None); pass it as a 1-D
# array so the fit does not depend on an implicit column-vector conversion.
nb_bow.fit(X_train_bow_feats, np.ravel(y_train))

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [164]:
# Predict labels for the held-out test set with the BoW + sentence-length model
preds_bow = nb_bow.predict(X_test_bow_feats)

In [165]:
# Confusion matrix: rows are the true classes, columns the predicted classes
bow_confusion = metrics.confusion_matrix(y_test, preds_bow)
print(f'Confusion Matrix: \n {bow_confusion}')

Confusion Matrix: 
 [[23426  1534]
 [ 2056  2984]]


In [166]:
# Per-class precision / recall / F1 for the BoW + sentence-length model
bow_report = metrics.classification_report(y_test, preds_bow)
print(f'Classification Report: \n\n {bow_report}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.92      0.94      0.93     24960
           1       0.66      0.59      0.62      5040

    accuracy                           0.88     30000
   macro avg       0.79      0.77      0.78     30000
weighted avg       0.88      0.88      0.88     30000



**Precision** - Both classes have fairly high precision, although for class 1 it is lower (0.66) than the precision for the same class with the MultinomialNB classifier with no added features (0.77).

**Recall (sensitivity)** - the ratio of correctly predicted positive observations to all observations in the actual class. After adding the sentence length feature, recall for class 1 (summary) rises to 0.59, compared with 0.41 for the same classifier without this feature.

**F1 score** - F1 is usually more useful than accuracy, especially since we have an uneven class distribution. In our case the weighted-average F1 score is 0.88, and the F1 score for the summary class has improved (0.62) compared to the model without the `sentence_len` feature (0.53).

Overall the recall for the summary class here is significantly better probably because the MultinomialNB classifier works best with integers (suitable for classification with discrete features).

## Adding the `tfidf_score` to the MultinomialNB model

In [152]:
# Keep the sentence length and the per-sentence tf-idf score
X_train_ft = X_train.loc[:, ['sentence_len', 'tfidf_score']]

In [153]:
# Copy the tf-idf scores into a NumPy array and view them as a single column (n_samples, 1)
train_tfidf_values = np.array(X_train_ft['tfidf_score'].values)
sent_tfidf = train_tfidf_values[:, np.newaxis]

In [154]:
# Wrap the tf-idf score column in a CSR matrix so it can be stacked onto the sparse BoW features
sparse_sent_tfidf = scipy.sparse.csr_matrix(sent_tfidf)

In [155]:
# Append the tf-idf score as one more column after the BoW + sentence-length features
train_blocks_1 = [X_train_bow_feats, sparse_sent_tfidf]
X_train_bow_feats_1 = scipy.sparse.hstack(train_blocks_1)
X_train_bow_feats_1

<70000x100224 sparse matrix of type '<class 'numpy.float64'>'
	with 1789831 stored elements in COOrdinate format>

In [156]:
# Mirror the training-set steps on the test set
X_test_ft = X_test.loc[:, ['sentence', 'tfidf_score']]
# Copy the tf-idf scores and view them as a single column (n_samples, 1)
test_tfidf_values = np.array(X_test_ft['tfidf_score'].values)
sent_tfidf_test = test_tfidf_values[:, np.newaxis]
# Sparse wrapper so the column can be stacked onto the sparse BoW features
sparse_sent_tfidf_test = scipy.sparse.csr_matrix(sent_tfidf_test)
# Append the tf-idf score as one more column after the BoW + sentence-length features
test_blocks_1 = [X_test_bow_feats, sparse_sent_tfidf_test]
X_test_bow_feats_1 = scipy.sparse.hstack(test_blocks_1)

In [157]:
# Display the sparse matrix summary; the column count should match X_train_bow_feats_1
X_test_bow_feats_1

<30000x100224 sparse matrix of type '<class 'numpy.float64'>'
	with 736591 stored elements in COOrdinate format>

In [169]:
# Train and evaluate on BoW + sentence length + tf-idf score.
# alpha=0.1 is carried over from the earlier grid search on the tf-idf features.
nb_bow_1 = MultinomialNB(alpha=0.1)
# y_train is a one-column DataFrame (read with header=None); pass it as a 1-D
# array so the fit does not depend on an implicit column-vector conversion.
nb_bow_1.fit(X_train_bow_feats_1, np.ravel(y_train))
preds_bow_1 = nb_bow_1.predict(X_test_bow_feats_1)

In [170]:
# Confusion matrix: rows are the true classes, columns the predicted classes
bow_1_confusion = metrics.confusion_matrix(y_test, preds_bow_1)
print(f'Confusion Matrix: \n {bow_1_confusion}')

Confusion Matrix: 
 [[23434  1526]
 [ 1903  3137]]


In [171]:
# Per-class precision / recall / F1 for the BoW + sentence-length + tf-idf-score model
bow_1_report = metrics.classification_report(y_test, preds_bow_1)
print(f'Classification Report: \n\n {bow_1_report}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.92      0.94      0.93     24960
           1       0.67      0.62      0.65      5040

    accuracy                           0.89     30000
   macro avg       0.80      0.78      0.79     30000
weighted avg       0.88      0.89      0.88     30000



This model performs even better: for the summary class, recall rises from 0.59 to 0.62 and F1 from 0.62 to 0.65.

## Adding the `title_similarity` feature to the MultinomialNB model

In [174]:
# Take an explicit copy: this frame is modified further down, and assigning into a
# selection of X_train triggers pandas' SettingWithCopyWarning (silenced in this
# notebook by the global warnings filter).
X_train_ft = X_train[['sentence_len', 'title_similarity']].copy()

In [183]:
# Count the rows whose title_similarity is negative (per-column non-null counts of the filtered rows)
X_train_ft[X_train_ft['title_similarity'] < 0].count()

sentence_len        3
title_similarity    3
dtype: int64

In [184]:
# We only have 3 negative values; clamp them to 0 and see if the model is better this way
# (MultinomialNB requires non-negative feature values when fitting).
# Only the `title_similarity` column is changed: assigning 0 through a row mask
# (`X_train_ft[mask] = 0`) would overwrite every column of those rows, including
# `sentence_len`. Building a new frame also keeps this cell idempotent.
X_train_ft = X_train_ft.assign(
    title_similarity=X_train_ft['title_similarity'].clip(lower=0)
)

In [185]:
# Re-count the negative title_similarity rows; both counts should now be 0
X_train_ft[X_train_ft['title_similarity'] < 0].count()

sentence_len        0
title_similarity    0
dtype: int64

In [186]:
# Copy the title-similarity values into a NumPy array and view them as a single column (n_samples, 1)
train_similarity_values = np.array(X_train_ft['title_similarity'].values)
sent_title_similarity = train_similarity_values[:, np.newaxis]

In [187]:
# Wrap the title-similarity column in a CSR matrix so it can be stacked onto the sparse features
sparse_sent_title_similarity = scipy.sparse.csr_matrix(sent_title_similarity)

In [188]:
# Append title similarity as one more column after the BoW + length + tf-idf-score features
train_blocks_2 = [X_train_bow_feats_1, sparse_sent_title_similarity]
X_train_bow_feats_2 = scipy.sparse.hstack(train_blocks_2)
X_train_bow_feats_2

<70000x100225 sparse matrix of type '<class 'numpy.float64'>'
	with 1859827 stored elements in COOrdinate format>

In [189]:
# Same for the test set
X_test_ft = X_test[['sentence', 'title_similarity']]
# Clamp negative similarities to 0, as is done for the training set, so that both
# splits go through the same preprocessing.
# NOTE(review): predict() is not expected to reject negative inputs the way fit()
# does, so unclamped negatives could pass through silently — confirm for the
# installed scikit-learn version.
sent_title_similarity_test = np.array(
    X_test_ft['title_similarity'].clip(lower=0).values
).reshape(-1, 1)
# Convert the title-similarity column to a sparse matrix
sparse_sent_title_similarity_test = scipy.sparse.csr_matrix(sent_title_similarity_test)
# Append it as one more column after the BoW + length + tf-idf-score features
X_test_bow_feats_2 = scipy.sparse.hstack([X_test_bow_feats_1, sparse_sent_title_similarity_test])

In [190]:
# Display the sparse matrix summary; the column count should match X_train_bow_feats_2
X_test_bow_feats_2

<30000x100225 sparse matrix of type '<class 'numpy.float64'>'
	with 766589 stored elements in COOrdinate format>

In [191]:
# Train and evaluate on BoW + sentence length + tf-idf score + title similarity.
# alpha=0.1 is carried over from the earlier grid search on the tf-idf features.
nb_bow_2 = MultinomialNB(alpha=0.1)
# y_train is a one-column DataFrame (read with header=None); pass it as a 1-D
# array so the fit does not depend on an implicit column-vector conversion.
nb_bow_2.fit(X_train_bow_feats_2, np.ravel(y_train))
preds_bow_2 = nb_bow_2.predict(X_test_bow_feats_2)

In [192]:
# Confusion matrix: rows are the true classes, columns the predicted classes
bow_2_confusion = metrics.confusion_matrix(y_test, preds_bow_2)
print(f'Confusion Matrix: \n {bow_2_confusion}')

Confusion Matrix: 
 [[23450  1510]
 [ 1799  3241]]


In [193]:
# Per-class precision / recall / F1 for the model with all four feature groups
bow_2_report = metrics.classification_report(y_test, preds_bow_2)
print(f'Classification Report: \n\n {bow_2_report}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.93      0.94      0.93     24960
           1       0.68      0.64      0.66      5040

    accuracy                           0.89     30000
   macro avg       0.81      0.79      0.80     30000
weighted avg       0.89      0.89      0.89     30000



This model gives slightly better results: for the summary class, recall rises from 0.62 to 0.64 and F1 from 0.65 to 0.66.