In [3]:
import numpy as np
import pandas as pd
import warnings
from sklearn import metrics
import pickle
import scipy.sparse
import os
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
warnings.filterwarnings("ignore")

## Loading the datasets 

In [4]:
# Load sparse matrices
tf_idf_vectors = scipy.sparse.load_npz('./datasets/train_sparse_matrix.npz')
X_test_tfidf = scipy.sparse.load_npz('./datasets/test_sparse_matrix.npz')

In [5]:
# Load tfidf feature names list
# if file exists we have already pickled a list
if os.path.isfile("tfidf_features.txt"):
    with open("tfidf_features.txt", 'rb') as f:
        tfidf_feature_names = pickle.load(f)

In [6]:
#Load X_train and X_test
X_train = pd.read_csv('./datasets/wikihow_X_train.csv') 
X_test = pd.read_csv('./datasets/wikihow_X_test.csv') 

In [7]:
#Load y_train and y_test 
y_train = pd.read_csv('./datasets/wikihow_y_train.csv', header = None) 
y_test = pd.read_csv('./datasets/wikihow_y_test.csv', header = None)

## Reconstructing Dataframes for analysis 

In [8]:
X_train_ft = X_train[['sentence','sentence_len']]

In [9]:
#Reshaping the sentence lengths array 
sent_lengths = np.array(X_train_ft['sentence_len'].values).reshape(-1, 1)

In [10]:
#Converting sentence lengths array to sparse matrix
sparse_sent_lengths = scipy.sparse.csr_matrix(sent_lengths)

In [11]:
#Concatenating the two sparse arrays  
X_train_feats = scipy.sparse.hstack([tf_idf_vectors,sparse_sent_lengths ])

In [12]:
#Same with test set 
X_test_ft = X_test[['sentence','sentence_len']]

In [13]:
#Reshaping the sentence lengths array 
sent_lengths_test = np.array(X_test_ft['sentence_len'].values).reshape(-1, 1)

In [14]:
#Converting sentence lengths array to sparse matrix
sparse_sent_lengths_test = scipy.sparse.csr_matrix(sent_lengths_test)

In [15]:
#Concatenating the two sparse arrays  
X_test_feats = scipy.sparse.hstack([X_test_tfidf,sparse_sent_lengths_test ])

## Using KMeans for text clustering

In [18]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X_train_feats)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [20]:
predictions = model.predict(X_test_feats)

## Measuring performance 

In [23]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, predictions)}')

Confusion Matrix: 
 [[12477 12483]
 [  113  4927]]


In [25]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, predictions)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.99      0.50      0.66     24960
           1       0.28      0.98      0.44      5040

    accuracy                           0.58     30000
   macro avg       0.64      0.74      0.55     30000
weighted avg       0.87      0.58      0.63     30000



Compared to the supervised techniques, this method gives overall lower performance scores.

## Adding the sentence tf-idf score to see if the model performs better 

In [39]:
X_train_ft = X_train[['sentence_len', 'tfidf_score']]

In [40]:
sent_tfidf = np.array(X_train_ft['tfidf_score'].values).reshape(-1, 1)

In [41]:
#Converting sentence tfidf array to sparse matrix
sparse_sent_tfidf = scipy.sparse.csr_matrix(sent_tfidf)

In [42]:
X_train_feats_1 = scipy.sparse.hstack([X_train_feats ,sparse_sent_tfidf])
X_train_feats_1

<70000x95347 sparse matrix of type '<class 'numpy.float64'>'
	with 1779327 stored elements in COOrdinate format>

In [43]:
#Same for test set
X_test_ft = X_test[['sentence','tfidf_score']]
sent_tfidf_score_test = np.array(X_test_ft['tfidf_score'].values).reshape(-1, 1)
#Converting sentence tfidf array to sparse matrix
sparse_sent_tfidf_score_test = scipy.sparse.csr_matrix(sent_tfidf_score_test)
#Concatenating the two sparse arrays  
X_test_feats_1 = scipy.sparse.hstack([X_test_feats,sparse_sent_tfidf_score_test])

In [44]:
model_2 = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model_2.fit(X_train_feats_1)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [46]:
predictions_2 = model_2.predict(X_test_feats_1)

In [47]:
print(f'Confusion Matrix: \n {metrics.confusion_matrix(y_test, predictions_2)}')

Confusion Matrix: 
 [[12483 12477]
 [ 4927   113]]


In [48]:
print(f'Classification Report: \n\n {metrics.classification_report(y_test, predictions_2)}')

Classification Report: 

               precision    recall  f1-score   support

           0       0.72      0.50      0.59     24960
           1       0.01      0.02      0.01      5040

    accuracy                           0.42     30000
   macro avg       0.36      0.26      0.30     30000
weighted avg       0.60      0.42      0.49     30000

