## Summary
* Clustering function
* Load corpus
* Load dataframe of the corpus
* ELBOW method
* KMEANS training
* Bench kmeans
* Save cluster IDs to file
* Cluster analysis
* Join labels as strings and save them to a file

## K-MEANS CLUSTERING

In [None]:
from collections import defaultdict, Counter
from time import time
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import pandas as pd
import itertools

from ipynb.fs.full import paths
from ipynb.fs.full.utils_functions import load_corpus

###  Constants

In [None]:
# Number of clusters passed to `clustering` when training the final model.
K_CLUSTERS = 12
# Iteration cap passed to `clustering`, which forwards it to KMeans as `max_iter`.
K_ITERS = 300

### Clustering function

In [None]:
def clustering(vector, n_clusters, n_iter):
    """Fit a K-Means model on `vector` and label every sample.

    Args:
        vector: Feature data handed to `KMeans.fit` and `KMeans.predict`.
        n_clusters: Number of clusters to form.
        n_iter: Forwarded to `KMeans` as `max_iter`.

    Returns:
        A `(model, labels)` tuple: the fitted `KMeans` instance and the
        cluster index predicted for each sample of `vector`.
    """
    # The seed is fixed so that the centroid initialisation is deterministic.
    model = KMeans(
        random_state=120,
        max_iter=n_iter,
        n_clusters=n_clusters,
    ).fit(vector)
    return model, model.predict(vector)

In [None]:
# Fetch the corpus. `processed_data` is consumed below as an iterable of
# token sequences; `data` is the second value returned by `load_corpus`.
processed_data, data = load_corpus()
# Corpus-wide frequency of every token in `processed_data` (repeats counted).
distribution_words = Counter(itertools.chain.from_iterable(processed_data))

### Load DataFrame of the corpus

In [None]:
# One space-separated string per entry of `processed_data`.
data_str = list(map(' '.join, processed_data))
# Object unpickled from `paths.DF_PATH` (the corpus DataFrame, per the
# section heading); it is the input fed to K-Means below.
data_feature = pd.read_pickle(paths.DF_PATH)

### Elbow method

In [None]:
# Elbow method: fit K-Means for a range of k and record each model's inertia
# (`KMeans.inertia_`) so the bend in the curve can guide the choice of k.
sse = []
list_k = list(range(5, 50))

for k in list_k:
    # Use the same seed and iteration cap as `clustering` so the curve is
    # reproducible across runs and comparable with the model trained later.
    km = KMeans(n_clusters=k, max_iter=K_ITERS, random_state=120)
    km.fit(data_feature)
    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
plt.xlabel(r'Number of clusters *k*')
# The trailing semicolon suppresses the cell's text output in the notebook.
plt.ylabel('Sum of squared distance');

In [None]:
# Train the final model with the configured number of clusters / iterations.
kmeans, cluster_ids = clustering(data_feature, K_CLUSTERS, K_ITERS)

# Pair every document string with the cluster id assigned to it.
c = list(zip(cluster_ids, data_str))

# Group the documents by cluster id; using a set keeps each distinct
# document string only once per cluster.
cluster = defaultdict(set)
for cluster_id, document in c:
    cluster[cluster_id].add(document)

### Save cluster_ids to file

In [None]:
# Wrap the cluster assignments in a DataFrame and pickle it to
# `paths.IDS_CLUSTER_PATH`.
df_cluster_ids = pd.DataFrame(cluster_ids)
df_cluster_ids.to_pickle(paths.IDS_CLUSTER_PATH)

### Cluster Analysis

In [None]:
from gensim.models import FastText
from collections import Counter

# Number of top-scoring words kept (and printed) per cluster.
MOST_COMMON = 15
# Number of words kept per cluster as label candidates.
MOST_COMMON_FOR_LABELS = 5
# `topn` used when asking the FastText model for similar words.
MOST_SIMILAR = 5

In [None]:
# Load the saved gensim FastText model from `paths.FASTTEXT_MODEL_PATH`;
# its `wv.most_similar` is queried during the cluster analysis.
model = FastText.load(paths.FASTTEXT_MODEL_PATH)

In [None]:
# Derive candidate label words for every cluster.
labels = []
for cluster_id, sentences in cluster.items():
    print("ID cluster: ", cluster_id)

    # Count tokens over all sentences assigned to this cluster.
    scores = Counter(
        token for sentence in sentences for token in sentence.split(' ')
    )

    # Divide each count by the token's corpus-wide frequency. Tokens that
    # are absent from `distribution_words` keep their raw count.
    for token in list(scores):
        corpus_total = distribution_words[token]
        if corpus_total != 0:
            scores[token] /= corpus_total
    # Splitting on a single space can produce empty tokens; discard them.
    scores.pop('', None)

    top_terms = [term for term, _ in scores.most_common(MOST_COMMON)]
    print(' '.join(top_terms), '\n')

    # Extend the top terms with their FastText neighbours, then keep the
    # terms that occur most often in the extended list.
    expanded = list(top_terms)
    for term in top_terms:
        neighbours = model.wv.most_similar(term, topn=MOST_SIMILAR)
        expanded.extend(neighbour for neighbour, _ in neighbours)
    ranked = Counter(expanded).most_common(MOST_COMMON_FOR_LABELS)
    labels.append([term for term, _ in ranked])

### Join labels as strings and save them to file

In [None]:
# Hand-written names for the clusters: 12 entries, matching K_CLUSTERS.
# NOTE(review): presumably ordered by cluster ID — confirm against the
# analysis output before relying on positions.
cluster_labels = [
    'Dinero | Finanzas',
    'Dormir',
    'Cumpleaños | Navidad',
    'Juntada | Comida',
    'Feliciones por logro',  # NOTE(review): possibly a typo for 'Felicitaciones' — confirm
    'Familia | Villa Maria',
    'Personas del CMU',
    'Tramites | Viajes',
    'Responsabilidades | CMU',
    'Bardo | Otros',
    'Saludo | Despedida',
    'Archivos | Tramites pasantia'
]

In [None]:
# Write one label per line to `paths.CLUSTER_LABELS_PATH`. The labels contain
# non-ASCII characters (e.g. 'ñ'), so the encoding is pinned to UTF-8 instead
# of depending on the platform's default text encoding.
with open(paths.CLUSTER_LABELS_PATH, "w", encoding="utf-8") as f:
    f.write('\n'.join(cluster_labels))