In [None]:
# MO444-A 2s/2017 - Third assignment
#
#         Group 05
#
# - Anderson Rossanez (124136)
# - Bruno Branta Lopes (31470)
#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale

def loadIDs(path='documents/ids'):
    """Read the document IDs, one per line, into a NumPy array.

    Parameters
    ----------
    path : str, optional
        Location of the IDs file. Defaults to ``'documents/ids'``, the
        location this notebook has always used.

    Returns
    -------
    numpy.ndarray
        One entry per line of the file, with line terminators stripped.
    """
    with open(path, "r") as f:
        return np.asarray(f.read().splitlines())

def loadFeatures(path='documents/data.csv'):
    """Load the feature matrix from a header-less, comma-separated file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'documents/data.csv'``, the
        location this notebook has always used.

    Returns
    -------
    numpy.ndarray
        The file contents as a 2-D array (one row per CSV line).
    """
    return pd.read_csv(path, sep=',', header=None).values

def loadDocument(docID, docs_dir='documents/docs/'):
    """Return the text of one document as a single line.

    Parameters
    ----------
    docID : str
        File name of the document inside ``docs_dir``.
    docs_dir : str, optional
        Directory holding the documents. Defaults to ``'documents/docs/'``,
        the location this notebook has always used.

    Returns
    -------
    str
        The file contents with every newline character removed. Newlines
        are deleted, not replaced by spaces, so text on adjacent lines is
        joined directly.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    with open(os.path.join(docs_dir, docID), 'r') as docfile:
        return docfile.read().replace('\n', '')

def computeKMeansMetrics(model, data, sample_size=None, random_state=None):
    """Fit a clustering model and report its labels, cost and silhouette.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit_predict(data)`` and, after fitting, an
        ``inertia_`` attribute (e.g. ``MiniBatchKMeans``). It is fitted in
        place by this call.
    data : array-like
        Samples to cluster, one row per sample.
    sample_size : int or None, optional
        Forwarded to ``silhouette_score``. ``None`` (the default) scores
        every sample, which is the original behaviour; pass an integer to
        score a random subset when the full computation is too slow.
    random_state : int or None, optional
        Forwarded to ``silhouette_score``; only relevant when
        ``sample_size`` is set.

    Returns
    -------
    tuple
        ``(labels, cost, sil_avg)``: the cluster label of each sample, the
        model's ``inertia_`` and the mean silhouette coefficient.
    """
    labels = model.fit_predict(data)
    cost = model.inertia_
    sil_avg = silhouette_score(data, labels,
                               sample_size=sample_size,
                               random_state=random_state)
    return labels, cost, sil_avg

In [None]:
# Load the document identifiers and report the array dimensions.
ids = loadIDs()
print('IDs shape: {}'.format(np.shape(ids)))

# Load the feature matrix and report its dimensions.
data = loadFeatures()
print('Data shape: {}'.format(np.shape(data)))

In [None]:
# Peek at the first 5 document IDs.
print(ids[:5])

In [None]:
# Preview the first 5 rows of the data (bag-of-words feature vectors).
# `head()` shows 5 rows by default.
data_frame = pd.DataFrame(data)
data_frame.head()

In [None]:
# Inspect the value range of the data before transforming it.
print('Min. value: {}'.format(data.min()))
print('Max. value: {}'.format(data.max()))

# Scale the data with sklearn's `scale`, then pass it through a fitted
# `Normalizer` (both with their default settings).
# NOTE: this cell overwrites `data`, so re-running it without reloading the
# features first transforms already-transformed values.
data = scale(data)
normalizer = Normalizer().fit(data)
data = normalizer.transform(data)

In [None]:
# Fit Mini-Batch K-means once per candidate cluster count, keeping every
# fitted model together with its labels, cost and average silhouette.
kmeans, labels, costs, sil_avgs = [], [], [], []
num_centroids = [2, 4, 8, 16, 32, 64, 128, 256]
for n_clusters in num_centroids:
    model = MiniBatchKMeans(
        n_clusters=n_clusters,
        init='k-means++',
        batch_size=1000,
        random_state=1,  # fixed seed so repeated runs give the same clustering
    )
    cluster_labels, cost, sil_avg = computeKMeansMetrics(model, data)

    # The four lists stay index-aligned with `num_centroids`.
    kmeans.append(model)
    labels.append(cluster_labels)
    costs.append(cost)
    sil_avgs.append(sil_avg)

    print('%d clusters: Cost %5.2f ; Silhouette avg %5.5f' % (n_clusters, cost, sil_avg))

In [None]:
# Plot the cost and the average silhouette against the number of centroids,
# sharing the x axis and using one y axis per metric.
#
# Each cluster count is fitted exactly once in the sweep, so there is no
# standard deviation to draw. The previous version passed undefined
# `costs_sd` / `sil_avgs_sd` arrays to `errorbar`, which raised a NameError
# on a fresh kernel; plain line plots are used instead.
fig, ax1 = plt.subplots()
ax1.set_title('Mini-Batch K-means: cost and silhouette per number of centroids')

# Plot costs curve (left axis, blue)
ax1.set_xlabel('centroids')
ax1.plot(num_centroids, costs, '-o', color='b')
ax1.set_ylabel('cost', color='b')
ax1.tick_params('y', colors='b')

# Plot the silhouette averages (right axis, green)
ax2 = ax1.twinx()
ax2.plot(num_centroids, sil_avgs, '-o', color='g')
ax2.set_ylabel('silhouettes', color='g')
ax2.tick_params('y', colors='g')

fig.tight_layout()
plt.show()

In [None]:
# Chosen number of centroids: 64
# Look the model up by its cluster count instead of a hard-coded list
# position, so the selection stays correct if `num_centroids` is edited.
# `list.index` raises ValueError if the count was not part of the sweep.
chosen_num_centroids = 64
chosen_index = num_centroids.index(chosen_num_centroids)
chosen_model = kmeans[chosen_index]
chosen_model_labels = labels[chosen_index]

In [None]:
# Let's check three sample groups: for each one, print its medoid document
# and the two documents closest to that medoid.

def _medoid_and_neighbours(points, n_neighbours=2):
    """Find the medoid of `points` and the points nearest to it.

    The medoid is the point whose summed Euclidean distance to every point
    in the group is smallest. The neighbours are ranked by their distance
    to the medoid itself, not by their own summed distance to the group.

    Parameters
    ----------
    points : array-like
        Non-empty collection of samples, one row per sample.
    n_neighbours : int, optional
        Maximum number of neighbours to return.

    Returns
    -------
    tuple
        ``(medoid_index, neighbour_indexes)``. ``neighbour_indexes`` holds
        at most ``n_neighbours`` row indexes, nearest first, and fewer when
        the group is too small.
    """
    distances = pairwise_distances(points, metric='euclidean')
    medoid = int(np.argmin(distances.sum(axis=0)))
    by_distance_to_medoid = np.argsort(distances[medoid])
    # Drop the medoid explicitly: an exact duplicate also sits at distance 0,
    # so the medoid is not guaranteed to be first in the ordering.
    neighbours = by_distance_to_medoid[by_distance_to_medoid != medoid]
    return medoid, neighbours[:n_neighbours]


# (header, footer) printed around each document, in ranking order.
report_templates = [
    (' - Medoid doc: {}', ' - '),
    (' -- Closest to medoid doc: {}', ' -- '),
    (' --- Second closest to medoid doc: {}', ' --- '),
]

for i in [26, 32, 58]:
    print('Group: %d' % i)
    in_group = chosen_model_labels == i
    group_points = data[in_group]
    group_doc_ids = ids[in_group]

    if len(group_doc_ids) == 0:
        # Nothing to rank; avoid calling pairwise_distances on an empty array.
        print(' (no documents assigned to this group)')
        continue

    medoid_index, neighbour_indexes = _medoid_and_neighbours(group_points)
    ranked_indexes = [medoid_index] + list(neighbour_indexes)

    # zip() stops early for groups with fewer than three documents, so only
    # the documents that exist are printed.
    for (header, footer), doc_index in zip(report_templates, ranked_indexes):
        doc_id = group_doc_ids[doc_index]
        print(header.format(doc_id))
        print(loadDocument(doc_id))
        print(footer)