In [None]:
# Perform clustering 

import os
import re
import time

import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

import clustools as ct

In [None]:
# Gather files

# Directory holding one plain-text file per site.
CONTENTS_DIR = './site_contents/'

# Keep regular files only: `os.listdir` also returns sub-directories (for
# example Jupyter's hidden `.ipynb_checkpoints`), and `open()` fails on those.
# Sorting makes the row order reproducible, because `os.listdir` returns
# entries in arbitrary order.
file_names = sorted(
    name for name in os.listdir(CONTENTS_DIR)
    if os.path.isfile(os.path.join(CONTENTS_DIR, name))
)

# Read the text of each file.
# NOTE(review): files are still decoded with the platform default encoding;
# pass an explicit `encoding=` once the encoding used when the files were
# written is confirmed.
file_contents = []

for name in file_names:
    with open(os.path.join(CONTENTS_DIR, name), 'r') as content:
        file_contents.append(content.read())

# Site name = file name without its '.txt' extension. Only a trailing '.txt'
# is removed, so a '.txt' occurring in the middle of a name is preserved.
site_names = [name[:-len('.txt')] if name.endswith('.txt') else name
              for name in file_names]

# Store contents in a dataframe (plain lists rather than a lazy `map` object).
websites_df = pd.DataFrame({'site': site_names,
                            'raw_text': file_contents})

# Add a column with the number of whitespace-separated words in each text.
websites_df['wordcount'] = websites_df['raw_text'].apply(lambda text: len(text.split()))

# Drop short or empty texts.
min_words = 100

# Build a new frame instead of dropping in place, so re-running the cell is
# idempotent. `.copy()` avoids SettingWithCopyWarning when columns are added
# later. The original index is kept on purpose (it is written to the CSV).
websites_df = websites_df.loc[websites_df['wordcount'] >= min_words].copy()

In [None]:
# TF-IDF vectorization

# Stop-word list supplied by the project helper module.
stop_words = ct.get_stopwords()

# Preprocessor from the helper module (per its name: removes numbers and
# lowercases the text).
my_preprocessor = ct.remove_numbers_lower

# Document-frequency bounds.
min_freq = 2    # drop very uncommon words (integer: minimum number of documents)
max_freq = 0.4  # drop too common words (float: maximum fraction of documents)

# Token pattern keeping only words with `min_letters` or more characters:
# (min_letters - 1) word characters followed by at least one more.
min_letters = 3
my_tokens = f'(?u)\\b\\w{{{min_letters - 1}}}\\w+\\b'

# Create the vectorizer (unigrams only).
tfidf_options = {
    'min_df': min_freq,
    'max_df': max_freq,
    'preprocessor': my_preprocessor,
    'stop_words': stop_words,
    'token_pattern': my_tokens,
    'ngram_range': (1, 1),
}

vectorizer = TfidfVectorizer(**tfidf_options)


In [None]:
# Learn the vocabulary and produce the document-term matrix (dtm) in one pass.
# A warning related to some stop words appears here; known issue, to be fixed.
dtm = vectorizer.fit_transform(websites_df['raw_text'])

# Inverse vocabulary (id -> word), so words can be looked up easily by id.
vocabulary = vectorizer.vocabulary_
inv_vocab = dict(zip(vocabulary.values(), vocabulary.keys()))

In [None]:
# LSA.

# Fit an exploratory SVD with 20 components; its singular values are used in
# the next step to pick how many dimensions to keep (scree-plot elbow method).
# We could go up to n_docs components, but that seems too many here.
# With a larger dataset, at least 100 components should be checked.

# Truncated SVD is required because the document-term matrix is sparse.
# `random_state` is fixed because the default solver is randomized: without it
# the singular values (and hence the elbow) can change from run to run.
decomposer = TruncatedSVD(20, n_iter=10, random_state=0)

# Fit only: the transformed document vectors are not needed at this stage.
decomposer.fit(dtm)

In [None]:
# Singular values of the exploratory SVD: the y-axis of the scree plot.
singular_values = decomposer.singular_values_

# Matching x-axis: 0-based index of each component.
dimension_number = np.arange(len(singular_values))

# Get the recommended number of dimensions to keep.
elbow_idx = ct.ClusterDecissionHelper.elbow_finder(
    dimension_number,
    singular_values,
    plot=False,
)

# Indices start at 0: if we keep up to and including dimension n (where the
# elbow is), we are keeping n + 1 dimensions in total.
dim_number = elbow_idx + 1

print(f'Number of dimensions to keep according to scree plot: {dim_number} ')

In [None]:
# Now get the reduced document vectors for the number of dimensions found.
# `random_state` is fixed because the default solver is randomized; this
# reducer is saved and reused later, so its projection must be reproducible.
dim_reducer = TruncatedSVD(n_components=dim_number, n_iter=10, random_state=0)

reduced_dtm = dim_reducer.fit_transform(dtm)

In [None]:
# Use the project's decision helper to choose K from silhouette scores.
# It is configured with max_clusters=20 and repeat=10 (presumably: try 2 to 20
# clusters, 10 repetitions each -- confirm against clustools).

decider = ct.ClusterDecissionHelper(max_clusters=20, repeat=10)

# Compute the scores on the reduced document vectors, with silhouette scoring enabled.
decider.compute_scores(reduced_dtm, sil_score =True)

In [None]:
# Ask the helper for the K selected from the silhouette scores computed above.
best_K = decider.get_K_from_silhouettes()

# Report the chosen K.
print(f'The best value of K found was K = {best_K} .')

In [None]:
# Perform clustering

# Create a K-means model for the best K found: 20 random initialisations,
# at most 100 iterations each.
# `random_state` is fixed because K-means initialisation is random; without it
# the cluster labels (and everything saved from them) differ between runs.
KMeans_model = KMeans(n_clusters=best_K, n_init=20, max_iter=100, random_state=0)

In [None]:
# Fit K-means on the reduced vectors and write each document's label into the
# dataframe. `fit` returns the estimator itself, so `labels_` can be read
# directly from the call.
websites_df['cluster_label'] = KMeans_model.fit(reduced_dtm).labels_

# How many sites ended up in each cluster? (displayed as the cell output)
sites_by_cluster = websites_df.groupby('cluster_label')
sites_by_cluster['site'].count()

In [None]:
# Get the words, and their weights, that are representative of each cluster.
# The helper receives the fitted K-means model, the id -> word dictionary and
# the fitted SVD reducer.

cluster_df = ct.interpret_clusters(KMeans_model, inv_vocab, dim_reducer)

In [None]:
# Save results

results_dir = './cluster_results/'

# Make sure the output directory exists before anything is written to it.
os.makedirs(results_dir, exist_ok=True)

# Local-time stamp (minute resolution) shared by every file of this run.
timestr = time.strftime("%Y%m%d_%H%M", time.localtime())

# File names
model_file = os.path.join(results_dir, timestr + '_fitted_kmeans_model.pickle')
vectorizer_file = os.path.join(results_dir, timestr + '_fitted_tfidf.pickle')
reducer_file = os.path.join(results_dir, timestr + '_fitted_TSVD.pickle')

clustered_sites_file = os.path.join(results_dir, timestr + '_clustered_sites.csv')
cluster_description_file = os.path.join(results_dir, timestr + '_cluster_descriptions.csv')

# Save clustered sites (sorted by label) and cluster descriptions.
websites_df.sort_values('cluster_label').to_csv(clustered_sites_file, columns=['site', 'cluster_label'])
cluster_df.to_csv(cluster_description_file)

# Save the KMeans model, vectorizer and dim_reducer, which are needed later to
# process new data and classify it. Each file is opened in a `with` block so
# the handle is flushed and closed even if pickling raises.
fitted_objects = (
    (KMeans_model, model_file),
    (vectorizer, vectorizer_file),
    (dim_reducer, reducer_file),
)

for fitted_object, pickle_path in fitted_objects:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_object, pickle_file)