In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
import pickle
import scipy.sparse
import os
from eval_funcs import *
warnings.filterwarnings("ignore")

## Loading datasets from `analysis_no_features` notebook

In [2]:
# Load sparse matrices
tf_idf_vectors = scipy.sparse.load_npz('./datasets/train_sparse_matrix.npz')
X_test_tfidf = scipy.sparse.load_npz('./datasets/test_sparse_matrix.npz')

In [3]:
# Load tfidf feature names list
# if file exists we have already pickled a list
if os.path.isfile("tfidf_features.txt"):
    with open("tfidf_features.txt", 'rb') as f:
        tfidf_feature_names = pickle.load(f)

In [4]:
#Load X_train and X_test
X_train = pd.read_csv('./datasets/wikihow_X_train.csv') 
X_test = pd.read_csv('./datasets/wikihow_X_test.csv') 

In [5]:
#Load y_train and y_test 
y_train = pd.read_csv('./datasets/wikihow_y_train.csv', header = None) 
y_test = pd.read_csv('./datasets/wikihow_y_test.csv', header = None) 

## Reconstructing Dataframes for analysis 
Let's add some features to see if our model performs better. We start by adding the sentences length feture to see how it performs.

In [6]:
X_train_ft = X_train[['sentence','sentence_len']]

In [7]:
#Reshaping the sentence lengths array 
sent_lengths = np.array(X_train_ft['sentence_len'].values).reshape(-1, 1)

In [8]:
sent_lengths

array([[14],
       [ 9],
       [11],
       ...,
       [ 7],
       [ 9],
       [14]])

In [9]:
#Converting sentence lengths array to sparse matrix
sparse_sent_lengths = scipy.sparse.csr_matrix(sent_lengths)

In [10]:
sparse_sent_lengths

<70000x1 sparse matrix of type '<class 'numpy.int64'>'
	with 70000 stored elements in Compressed Sparse Row format>

In [11]:
#Concatenating the two sparse arrays  
X_train_feats = scipy.sparse.hstack([tf_idf_vectors,sparse_sent_lengths ])

In [12]:
#Check the shape of the arrays  
X_train_feats.shape

(70000, 94995)

In [13]:
#Same with test set 
X_test_ft = X_test[['sentence','sentence_len']]

In [14]:
#Reshaping the sentence lengths array 
sent_lengths_test = np.array(X_test_ft['sentence_len'].values).reshape(-1, 1)

In [15]:
#Converting sentence lengths array to sparse matrix
sparse_sent_lengths_test = scipy.sparse.csr_matrix(sent_lengths_test)

In [16]:
#Concatenating the two sparse arrays  
X_test_feats = scipy.sparse.hstack([X_test_tfidf,sparse_sent_lengths_test ])

In [17]:
X_test_feats.shape

(30000, 94995)

## Perform Cross-Validation with a Naive Bayes Classifier 

In [18]:
nb = MultinomialNB()
param_grid = {'alpha': (0.1, 1, 5, 10, 50)}

In [19]:
nb_CV = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5)

In [20]:
nb_CV.fit(X_train_feats, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': (0.1, 1, 5, 10, 50)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [21]:
nb_CV.best_params_

{'alpha': 0.1}

## Classification using Naive Bayes with tf idf scores and sentence lengths

In [22]:
preds = nb_CV.predict(X_test_feats)

In [23]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, preds)}')

Confusion Matrix: 
 [[24809   258]
 [ 3114  1819]]


In [24]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, preds)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.89      0.99      0.94     25067
           1       0.88      0.37      0.52      4933

    accuracy                           0.89     30000
   macro avg       0.88      0.68      0.73     30000
weighted avg       0.89      0.89      0.87     30000



## Classification using Naive Bayes with BoW and sentence lengths

In [25]:
#Performing Cross Validation for the best combination of parameters
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
X_train_dtm = vectorizer.fit_transform(X_train['sentence'])
X_train_dtm_dense = X_train_dtm.toarray() 
len(X_train_dtm_dense)

70000

### Add features to X_train_dtm and X_test_dtm 

In [26]:
X_train_bow_feats = scipy.sparse.hstack([X_train_dtm ,sparse_sent_lengths ])
X_train_bow_feats

<70000x99757 sparse matrix of type '<class 'numpy.int64'>'
	with 1718399 stored elements in COOrdinate format>

In [27]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vectorizer.transform(X_test['sentence'])
X_test_dtm

<30000x99756 sparse matrix of type '<class 'numpy.int64'>'
	with 675443 stored elements in Compressed Sparse Row format>

In [28]:
X_test_bow_feats =scipy.sparse.hstack([X_test_dtm ,sparse_sent_lengths_test ])
X_test_bow_feats

<30000x99757 sparse matrix of type '<class 'numpy.int64'>'
	with 705443 stored elements in COOrdinate format>

### Perform MultinomialNB Classification 

In [29]:
nb_bow = MultinomialNB(alpha=0.1)

In [30]:
nb_bow.fit(X_train_bow_feats, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [31]:
preds_bow= nb_bow.predict(X_test_bow_feats)

In [32]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, preds_bow)}')

Confusion Matrix: 
 [[23551  1516]
 [ 2022  2911]]


In [33]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, preds_bow)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.92      0.94      0.93     25067
           1       0.66      0.59      0.62      4933

    accuracy                           0.88     30000
   macro avg       0.79      0.76      0.78     30000
weighted avg       0.88      0.88      0.88     30000



**Precision** - For both classes we have pretty high precision although for class 1 it is lower than the precision for the same class with th multinomialNB classifier with no features (0.77)

**Recall (sensitivity)** - is the ratio of correctly predicted positive observations to the all observations in actual class, with adding the sentence length feature, we get a higher recall for class 1 (summary) as opposed to the same classifier without this feature (0.41).

**F1 score** - F1 is usually more useful than accuracy, especially since we have an uneven class distribution. In our case, F1 score is 0.88, but we notice the the F1 score for the summary class has improved (0.62) compared to the model without the `sentence_len` feature (0.53).

Overall the recall for the summary class here is significantly better probably because the MultinomialNB classifier works best with integers (suitable for classification with discrete features).

## Adding the `tfidf_score` to the MultinomialNB model

In [34]:
X_train_ft = X_train[['sentence_len', 'tfidf_score']]

In [35]:
sent_tfidf = np.array(X_train_ft['tfidf_score'].values).reshape(-1, 1)

In [36]:
#Converting sentence tfidf array to sparse matrix
sparse_sent_tfidf = scipy.sparse.csr_matrix(sent_tfidf)

In [37]:
X_train_bow_feats_1 = scipy.sparse.hstack([X_train_bow_feats ,sparse_sent_tfidf])
X_train_bow_feats_1

<70000x99758 sparse matrix of type '<class 'numpy.float64'>'
	with 1788390 stored elements in COOrdinate format>

In [38]:
#Same for test set
X_test_ft = X_test[['sentence','tfidf_score']]
sent_tfidf_test = np.array(X_test_ft['tfidf_score'].values).reshape(-1, 1)
#Converting sentence tfidf array to sparse matrix
sparse_sent_tfidf_test = scipy.sparse.csr_matrix(sent_tfidf_test)
#Concatenating the two sparse arrays  
X_test_bow_feats_1 = scipy.sparse.hstack([X_test_bow_feats,sparse_sent_tfidf_test])

In [39]:
X_test_bow_feats_1

<30000x99758 sparse matrix of type '<class 'numpy.float64'>'
	with 735440 stored elements in COOrdinate format>

In [40]:
nb_bow_1 = MultinomialNB(alpha=0.1)
nb_bow_1.fit(X_train_bow_feats_1, y_train)
preds_bow_1= nb_bow_1.predict(X_test_bow_feats_1)

In [41]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, preds_bow_1)}')

Confusion Matrix: 
 [[23508  1559]
 [ 1472  3461]]


In [42]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, preds_bow_1)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.94      0.94      0.94     25067
           1       0.69      0.70      0.70      4933

    accuracy                           0.90     30000
   macro avg       0.82      0.82      0.82     30000
weighted avg       0.90      0.90      0.90     30000



This model performs even better 

## Adding the `title_similarity` feature to the MultinomialNB model

In [43]:
X_train_ft = X_train[['sentence_len', 'title_similarity']]

In [44]:
X_train_ft[X_train_ft['title_similarity'] < 0].count()

sentence_len        4
title_similarity    4
dtype: int64

In [45]:
#We only have 3 negative values, let's try making them 0 and see if the model is better this way
X_train_ft[X_train_ft['title_similarity'] < 0] = 0

In [46]:
X_train_ft[X_train_ft['title_similarity'] < 0].count()

sentence_len        0
title_similarity    0
dtype: int64

In [47]:
sent_title_similarity = np.array(X_train_ft['title_similarity'].values).reshape(-1, 1)

In [48]:
#Converting sentence tfidf array to sparse matrix
sparse_sent_title_similarity = scipy.sparse.csr_matrix(sent_title_similarity)

In [49]:
X_train_bow_feats_2 = scipy.sparse.hstack([X_train_bow_feats_1 ,sparse_sent_title_similarity])
X_train_bow_feats_2

<70000x99759 sparse matrix of type '<class 'numpy.float64'>'
	with 1858386 stored elements in COOrdinate format>

In [50]:
#Same for test set
X_test_ft = X_test[['sentence','title_similarity']]
sent_title_similarity_test = np.array(X_test_ft['title_similarity'].values).reshape(-1, 1)
#Converting sentence tfidf array to sparse matrix
sparse_sent_title_similarity_test = scipy.sparse.csr_matrix(sent_title_similarity_test)
#Concatenating the two sparse arrays  
X_test_bow_feats_2 = scipy.sparse.hstack([X_test_bow_feats_1,sparse_sent_title_similarity_test])

In [51]:
X_test_bow_feats_2

<30000x99759 sparse matrix of type '<class 'numpy.float64'>'
	with 765438 stored elements in COOrdinate format>

In [52]:
nb_bow_2 = MultinomialNB(alpha=0.1)
nb_bow_2.fit(X_train_bow_feats_2, y_train)
preds_bow_2= nb_bow_2.predict(X_test_bow_feats_2)

In [53]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, preds_bow_2)}')

Confusion Matrix: 
 [[23491  1576]
 [ 1403  3530]]


In [54]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, preds_bow_2)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.94      0.94      0.94     25067
           1       0.69      0.72      0.70      4933

    accuracy                           0.90     30000
   macro avg       0.82      0.83      0.82     30000
weighted avg       0.90      0.90      0.90     30000



This model give slighlty better results.

In [55]:
from eval_funcs import *
#New text never before seen and make predictions on it 
text_1_df = pd.read_csv('./datasets/text_1.csv')
text_1_vectors = vectorizer.transform(text_1_df['sentence'])
text_1_df['title_similarity'] = text_1_df.apply(lambda x: get_similarity(x['sentence'], x['title']), axis=1)
text_1_feat = prediction_preprocessing(text_1_df)
text_1_full_feats = scipy.sparse.hstack([text_1_vectors,text_1_feat])
pred_text_1 = nb_bow_2.predict(text_1_full_feats)
results_text_1 = text_1_df[['sentence','is_summary']]
results_text_1['predictions'] = pred_text_1
results_text_1.head(15)

Unnamed: 0,sentence,is_summary,predictions
0,Stop what you re doing,1,0
1,One of the best ways to calm down if you re al...,0,0
2,"Sometimes, even taking a few seconds before yo...",0,0
3,"Try counting to ten, or taking deep breaths, b...",0,0
4,Take a break,1,1
5,"For example, if an argument with your spouse i...",0,0
6,I need to take a minute break before we contin...,0,0
7,"Go to a different place, focus on breathing de...",0,0
8,I can do this,0,0
9,Focus on your senses,1,1


In [56]:
#Calculate recall for text 1
len(results_text_1[(results_text_1['is_summary'] == results_text_1['predictions'])&(results_text_1['predictions']==1)])
len(results_text_1[results_text_1['is_summary']==1])

recall_text_1 = len(results_text_1[(results_text_1['is_summary'] == results_text_1['predictions'])&(results_text_1['predictions']==1)])/len(results_text_1[results_text_1['is_summary']==1])
recall_text_1


0.7142857142857143

In [57]:
#New text never before seen and make predictions on it 
text_2_df = pd.read_csv('./datasets/text_2.csv')
text_2_vectors = vectorizer.transform(text_2_df['sentence'])
text_2_df['title_similarity'] = text_2_df.apply(lambda x: get_similarity(x['sentence'], x['title']), axis=1)
text_2_feat = prediction_preprocessing(text_2_df)
text_2_full_feats = scipy.sparse.hstack([text_2_vectors,text_2_feat])
pred_text_2 = nb_bow_2.predict(text_2_full_feats)
results_text_2 = text_2_df[['sentence','is_summary']]
results_text_2['predictions'] = pred_text_2
results_text_2.head(15)

Unnamed: 0,sentence,is_summary,predictions
0,Know that everyone has their own unique experi...,1,0
1,No one else will grieve in exactly the same wa...,0,0
2,If you feel like you are reacting differently ...,0,0
3,Allow yourself to feel your own unique emotion...,0,0
4,"There is no typical loss, which means there is...",0,0
5,"Sudden losses, such as those due to trauma, ac...",0,0
6,Acknowledge that there are many kinds of loss,1,0
7,Death is a loss we must all deal with at some ...,0,0
8,"However, it is not the only type of loss",0,0
9,You can mourn the ending of a relationship or ...,0,0


In [58]:
#Calculate recall for text 2
len(results_text_2[(results_text_2['is_summary'] == results_text_2['predictions'])&(results_text_2['predictions']==1)])
len(results_text_2[results_text_2['is_summary']==1])

recall_text_2 = len(results_text_2[(results_text_2['is_summary'] == results_text_2['predictions'])&(results_text_2['predictions']==1)])/len(results_text_2[results_text_2['is_summary']==1])
recall_text_2

0.5161290322580645

In [59]:
#New text never before seen and make predictions on it 
text_3_df = pd.read_csv('./datasets/text_3.csv')
text_3_vectors = vectorizer.transform(text_3_df['sentence'])
text_3_df['title_similarity'] = text_3_df.apply(lambda x: get_similarity(x['sentence'], x['title']), axis=1)
text_3_feat = prediction_preprocessing(text_3_df)
text_3_full_feats = scipy.sparse.hstack([text_3_vectors,text_3_feat])
pred_text_3 = nb_bow_2.predict(text_3_full_feats)
results_text_3 = text_3_df[['sentence','is_summary']]
results_text_3['predictions'] = pred_text_3
results_text_3.head(15)

Unnamed: 0,sentence,is_summary,predictions
0,Think about the time commitment of ferret proo...,1,1
1,"Before you get a ferret, you will need to spen...",0,0
2,You will have to run many errands gathering su...,0,0
3,"In the event you move, you will have to repeat...",0,0
4,Be honest with yourself about whether you have...,0,1
5,You will have to fill in any holes or gaps in ...,0,0
6,"You will also have to hide heavier objects, li...",0,0
7,"Ferrets love to climb, and can knock such obje...",0,0
8,"Staircases need to be blocked off, and househo...",0,0
9,Ferrets will chew such objects,0,0


In [60]:
#Calculate recall for text 3
len(results_text_3[(results_text_3['is_summary'] == results_text_3['predictions'])&(results_text_3['predictions']==1)])
len(results_text_3[results_text_3['is_summary']==1])

recall_text_3 = len(results_text_3[(results_text_3['is_summary'] == results_text_3['predictions'])&(results_text_3['predictions']==1)])/len(results_text_3[results_text_3['is_summary']==1])
recall_text_3

0.4

## Calculate the recalls accross articles and evaluate the recall distribution 

In [61]:
recalls = calculate_recall_distribution(X_test, preds_bow_2, y_test)

In [62]:
ids = [item[0] for item in recalls ]
recalls = [item[1] for item in recalls ]

In [63]:
recalls_df = pd.DataFrame(ids , columns=['id'])

In [64]:
recalls_df['recalls']= recalls

In [65]:
recalls_df.head(20)

Unnamed: 0,id,recalls
0,4354,1.0
1,4355,1.0
2,4356,
3,4357,1.0
4,4358,1.0
5,5009,1.0
6,5010,1.0
7,5011,0.0
8,5094,1.0
9,6245,0.0


In [66]:
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))

plt.hist(recalls_df['recalls'])
plt.show()

<Figure size 2000x1000 with 1 Axes>

In [67]:
recalls_df[recalls_df['recalls'].isnull() == True].count()

id         471
recalls      0
dtype: int64

In [68]:
recalls_df[recalls_df['recalls'].isnull() == False].count()

id         2048
recalls    2048
dtype: int64

## Using LDA for classification

In [89]:
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [69]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,random_state=42)
#fit model to the training data
#X_train_dtm.toarray().shape
lda_dtf = lda.fit_transform(X_train_dtm)

## Using KMeans on these modelled topics 

In [90]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(lda_dtf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [92]:
unsupervised_preds = model.predict(lda_dtf)

In [93]:
unsupervised_preds.reshape(-1,1)

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [0]], dtype=int32)

In [94]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_train, unsupervised_preds.reshape(-1,1))}')

Confusion Matrix: 
 [[14603 43747]
 [ 2049  9601]]


In [95]:
print(f'Classification Report: \n\n {metrics.classification_report(y_train, unsupervised_preds.reshape(-1,1))}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.88      0.25      0.39     58350
           1       0.18      0.82      0.30     11650

    accuracy                           0.35     70000
   macro avg       0.53      0.54      0.34     70000
weighted avg       0.76      0.35      0.37     70000



Precision here is very low

## Use LDA with Logistic Regression for classification 