# TL;DR In-depth Analysis

## Preprocessing

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
wikihow_subset = pd.read_csv('./datasets/wikihow_sep_subset.csv')

In [3]:
wikihow_subset.head(3)

Unnamed: 0,text_id,sentence,is_summary,words,title,sentence_len,tfidf_score,title_similarity,title_sim_categories,help,...,want,many,ask,good,find,work,go,include,important,know
0,0,Sell yourself first,yes,"['Sell', 'yourself', 'first']",How to Sell Fine Art Online,3,1.725841,0.82891,Strong,0,...,0,0,0,0,0,0,0,0,0,0
1,0,"Before doing anything else, stop and sum up yo...",no,"['Before', 'doing', 'anything', 'else,', 'stop...",How to Sell Fine Art Online,12,3.421057,0.806864,Strong,0,...,0,0,0,0,0,0,0,0,0,0
2,0,"Now, think about how to translate that to an o...",no,"['Now,', 'think', 'about', 'how', 'to', 'trans...",How to Sell Fine Art Online,11,3.064425,0.857768,Strong,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Factorize `is_summary`
wikihow_subset['summary_id'] = wikihow_subset['is_summary'].apply(lambda x: 1 if x == 'yes' else 0)

In [5]:
# Features to keep
features = wikihow_subset[['text_id','sentence', 'summary_id','sentence_len','tfidf_score', 'title_similarity']]

In [6]:
# Value counts of the classes to get an idea of the null accuracy
features.summary_id.value_counts()

0    83438
1    16562
Name: summary_id, dtype: int64

In [7]:
#Computing null accuracy
null_accuracy = 83438/(16562+83438)
null_accuracy

0.83438

## Performing Cross-Validation on CountVectorizer and MultinomialNB

In [8]:
# Splitting the dataset 
from sklearn.model_selection import train_test_split
X = features[['sentence', 'sentence_len', 'tfidf_score','title_similarity']]
y = features['summary_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
X_train_no_feat = X_train['sentence']
X_test_no_feat = X_test['sentence']

In [10]:
#Performing Cross Validation for the best combination of parameters
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])


parameters = {
    'vect__min_df': (2, 5, 10, 15, 20, 30, 40, 50),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'nb__alpha': (0.1, 1, 5, 10, 50),
}

In [11]:
# Find the best parameters for both the feature extraction and the classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [12]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
grid_search.fit(X_train_no_feat, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'nb']
parameters:
{'vect__min_df': (2, 5, 10, 15, 20, 30, 40, 50), 'vect__ngram_range': ((1, 1), (1, 2)), 'nb__alpha': (0.1, 1, 5, 10, 50)}
Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.2min finished


Best score: 0.845
Best parameters set:
	nb__alpha: 1
	vect__min_df: 2
	vect__ngram_range: (1, 2)


## Creating a BOW model with the best parameters 

In [13]:
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
Xtrain_dtm = vectorizer.fit_transform(X_train_no_feat)
Xtrain_dtm_dense = Xtrain_dtm.toarray() 
len(Xtrain_dtm_dense)

70000

In [14]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vectorizer.transform(X_test_no_feat)
X_test_dtm

<30000x100222 sparse matrix of type '<class 'numpy.int64'>'
	with 676598 stored elements in Compressed Sparse Row format>

## Using Multinomial Naive Bayes with the best parameters

In [15]:
best_nb = MultinomialNB()

In [16]:
# train the model using X_train_dtm
best_nb.fit(Xtrain_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# make class predictions for X_test_dtm
y_pred_class = best_nb.predict(X_test_dtm)

In [18]:
# calculate performance metrics
from sklearn import metrics
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, y_pred_class)}')

Confusion Matrix: 
 [[24356   604]
 [ 2987  2053]]


In [19]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, y_pred_class)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.89      0.98      0.93     24960
           1       0.77      0.41      0.53      5040

    accuracy                           0.88     30000
   macro avg       0.83      0.69      0.73     30000
weighted avg       0.87      0.88      0.86     30000



Low recall, high precision -> classifier is very picky and misses a lot of actual summary sentences.

In [20]:
print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred_class)}')

Accuracy: 0.8803


Our model performs slightly better than the null accuracy, we will need to do some parameter tuning and add some features in order to improve it.

In [21]:
# false positive summaries (non summaries incorrectly classified as summaries)
X_test[y_pred_class > y_test].head()

Unnamed: 0,sentence,sentence_len,tfidf_score,title_similarity
79156,"For flat scars, choose a color that neutralize...",10,2.920918,0.782336
31631,Don t forget to dab the alum block in cold wat...,13,3.386033,0.830021
26303,Watch how much salt you use while cooking don ...,18,3.807531,0.817822
5223,"Roast cauliflower with olive oil, turmeric, cu...",12,3.392271,0.486347
866,If you learn about more than 1 casino in your ...,25,4.44526,0.879589


In [22]:
# false negative summaries (summaries incorrectly classified as non summaries)
X_test[y_pred_class < y_test].head()

Unnamed: 0,sentence,sentence_len,tfidf_score,title_similarity
35795,Drink more green tea,4,1.940366,0.465952
575,Choose the same last name as your parent s sta...,12,2.959494,0.823162
99089,Cut up carrots,3,1.732051,0.595426
87070,Brush your teeth,3,1.704851,0.562389
82585,Clean your tongue once per day,6,2.341083,0.760526


In [23]:
# Left: proba the class is 0, Right: proba the class is 1 
best_nb.predict_proba(X_test_dtm)

array([[9.99999991e-01, 8.71325601e-09],
       [1.00000000e+00, 8.61438939e-21],
       [1.04333470e-01, 8.95666530e-01],
       ...,
       [9.88583994e-01, 1.14160060e-02],
       [1.00000000e+00, 1.08789904e-11],
       [4.69466275e-01, 5.30533725e-01]])

In [24]:
y_pred_prob = best_nb.predict_proba(X_test_dtm)[:, 1]
metrics.roc_auc_score(y_test, y_pred_prob)

0.9342752332303115

### Which words are more prone to be in a summary ?

In [25]:
# store the vocabulary of X_train
X_train_tokens = vectorizer.get_feature_names()
len(X_train_tokens)

100222

In [26]:
# Naive Bayes counts the number of times each token appears in each class
best_nb.feature_count_

array([[ 6.,  2., 84., ...,  2.,  6.,  2.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [27]:
# rows represent classes, columns represent tokens
best_nb.feature_count_.shape

(2, 100222)

In [28]:
# number of times each token appears across all summary sentences
non_summary_token_count = best_nb.feature_count_[0, :]
non_summary_token_count

array([ 6.,  2., 84., ...,  2.,  6.,  2.])

In [29]:
# number of times each token appears across all non-summary sentences
summary_token_count = best_nb.feature_count_[1, :]
summary_token_count

array([0., 0., 0., ..., 0., 0., 0.])

In [30]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'summary':summary_token_count, 'non_summary':non_summary_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,summary,non_summary
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,6.0
00 year,0.0,2.0
000,0.0,84.0
000 000,0.0,4.0
000 americans,0.0,2.0


In [31]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=42)

Unnamed: 0_level_0,summary,non_summary
token,Unnamed: 1_level_1,Unnamed: 2_level_1
the team,0.0,2.0
use acupressure,1.0,2.0
foods including,0.0,8.0
move upwards,0.0,2.0
personal,21.0,130.0


In [32]:
# Naive Bayes counts the number of observations in each class
best_nb.class_count_

array([58478., 11522.])

## Random Forest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier
#Performing Cross Validation for the best combination of parameters
# Define a pipeline combining a text feature extractor with a simple classifier
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 10, 12]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

rfc = RandomForestClassifier(random_state=42,n_jobs=-1)
param_grid = {
    #try with 500
    'n_estimators': [200, 500],
    'max_depth':[4,5,6,7,8]
}

In [34]:
# Find the best parameters for both the feature extraction and the classifier
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(Xtrain_dtm, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'ma

In [35]:
CV_rfc.best_params_

{'max_depth': 4, 'n_estimators': 200}

In [36]:
rf=RandomForestClassifier(random_state=42, n_estimators= 200, max_depth=4, n_jobs=-1)

In [37]:
rf.fit(Xtrain_dtm, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [38]:
# make class predictions for X_test_dtm
y_pred_class_rf = rf.predict(X_test_dtm)
y_pred_class_rf

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, y_pred_class_rf)}')

Confusion Matrix: 
 [[24960     0]
 [ 5040     0]]


In [40]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, y_pred_class_rf)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.83      1.00      0.91     24960
           1       0.00      0.00      0.00      5040

    accuracy                           0.83     30000
   macro avg       0.42      0.50      0.45     30000
weighted avg       0.69      0.83      0.76     30000



In [41]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class_rf)

0.832

The Naive Bayes Classifier seems to to an overall better job than Random Forest

## Using TF IDF

In [42]:
#Divide X_train by groups of summary sentences and non summary sentences
X_train.head()
X_y_train_df= pd.DataFrame(data=X_train_no_feat, columns= ['sentence'])
X_y_train_df['summary_id'] = y_train
X_train_sum = X_y_train_df[X_y_train_df['summary_id']==1]['sentence']
X_train_no_sum = X_y_train_df[X_y_train_df['summary_id']==0]['sentence']

In [43]:
#Performing Cross Validation for the best combination of parameters
from sklearn.feature_extraction.text import TfidfVectorizer
# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('tfidf_nb', MultinomialNB()),
])


parameters = {
    'tfidf__min_df': (2, 5, 10, 15, 20, 30, 40, 50),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf_nb__alpha': (0.1, 1, 5, 10, 50),
}

In [44]:
# Find the best parameters for both the feature extraction and the classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [45]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
grid_search.fit(X_train_no_feat, y_train)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['tfidf', 'tfidf_nb']
parameters:
{'tfidf__min_df': (2, 5, 10, 15, 20, 30, 40, 50), 'tfidf__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'tfidf_nb__alpha': (0.1, 1, 5, 10, 50)}
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:  8.0min finished


Best score: 0.849
Best parameters set:
	tfidf__min_df: 2
	tfidf__ngram_range: (1, 2)
	tfidf__use_idf: True
	tfidf_nb__alpha: 0.1


In [46]:
#Representing text as tfidf score of different unigrams and bigrams 
from sklearn.feature_extraction.text import TfidfVectorizer 
tfidf=TfidfVectorizer(min_df =2, ngram_range=(1,2), use_idf = True)

#Fitting the vectorizer to summary and non summary sentences separately
tfidf.fit(X_train_no_sum, X_train_sum)
tf_idf_vectors = tfidf.transform(X_train_no_feat)

#Converting the sparse matrix into a dense matrix
tf_idf_vectors.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
X_test_tfidf = tfidf.transform(X_test_no_feat)

## Use MultinomialNB with tf-idf 

Should we be using Naive Bayes with tf-idf ? naive bayes works better with integers...

In [48]:
nb_tfidf = MultinomialNB(alpha = 0.1)

In [49]:
nb_tfidf.fit(tf_idf_vectors, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [50]:
nb_tfidf_pred = nb_tfidf.predict(X_test_tfidf)

In [51]:
#Accuracy
metrics.accuracy_score(y_test, nb_tfidf_pred)

0.8762333333333333

In [52]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, nb_tfidf_pred)}')

Confusion Matrix: 
 [[24604   356]
 [ 3357  1683]]


In [53]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, nb_tfidf_pred)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.88      0.99      0.93     24960
           1       0.83      0.33      0.48      5040

    accuracy                           0.88     30000
   macro avg       0.85      0.66      0.70     30000
weighted avg       0.87      0.88      0.85     30000



## Using Tf-idf with Random Forest

In [54]:
#Performing Cross Validation for the best combination of parameters

tfidf_rfc = RandomForestClassifier(random_state=42,n_jobs=-1)
param_grid = {
    
    'n_estimators': [200, 500],
    'max_depth':[4,5,6,7,8]
}

In [55]:
# Find the best parameters for both the feature extraction and the classifier
tfidf_CV_rfc = GridSearchCV(estimator=tfidf_rfc, param_grid=param_grid, cv=5)
tfidf_CV_rfc.fit(tf_idf_vectors, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'ma

In [56]:
tfidf_CV_rfc.best_params_

{'max_depth': 4, 'n_estimators': 200}

In [57]:
# Now use best params for rf

In [58]:
rf_tfidf = RandomForestClassifier(random_state=42,n_jobs=-1,max_depth= 4, n_estimators= 200)

In [59]:
rf_tfidf.fit(tf_idf_vectors, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [60]:
rf_tfidf_pred = rf_tfidf.predict(X_test_tfidf)

In [61]:
#Accuracy
metrics.accuracy_score(y_test, rf_tfidf_pred)

0.832

In [62]:
#Confusion matrix
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, rf_tfidf_pred)}')

Confusion Matrix: 
 [[24960     0]
 [ 5040     0]]


In [63]:
#Classification report
print(f'Classification Report: \n\n {metrics.classification_report(y_test, rf_tfidf_pred)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.83      1.00      0.91     24960
           1       0.00      0.00      0.00      5040

    accuracy                           0.83     30000
   macro avg       0.42      0.50      0.45     30000
weighted avg       0.69      0.83      0.76     30000



## Saving Dataframes to add more features later


In [64]:
type(tf_idf_vectors)

scipy.sparse.csr.csr_matrix

In [65]:
#Save tfidf sparse matrices  
import scipy.sparse
scipy.sparse.save_npz('./datasets/train_sparse_matrix.npz', tf_idf_vectors)
scipy.sparse.save_npz('./datasets/test_sparse_matrix.npz', X_test_tfidf)

In [66]:
#Save feature names 
import pickle
with open("tfidf_features.txt", 'wb') as f:
    pickle.dump(tfidf.get_feature_names(), f) 

In [67]:
#Save training and testing dataframes (without tfidf)
X_train.to_csv('./datasets/wikihow_X_train.csv', index=False)
X_test.to_csv('./datasets/wikihow_X_test.csv', index=False)

In [68]:
#Save labels  
y_train.to_csv('./datasets/wikihow_y_train.csv', index=False)
y_test.to_csv('./datasets/wikihow_y_test.csv', index=False)