## K-MEANS CLUSTERING

In [None]:
import pandas as pd
# Toy sample: a single-column DataFrame of labels used to try out pandas counting.
data = ['perro', 'gato', 'gato', 'jabu', 'sol']
df = pd.DataFrame({'food': data})
import seaborn as sbn

In [None]:
#df['food'].value_counts()['perro']
#df['food'].unique()
# Count how many times each distinct value occurs in the 'food' column.
key = df['food'].value_counts()
# value_counts() returns a Series, which has no iterrows(); items() yields
# (value, count) pairs instead.
for i, r in key.items():
    # Print only the first (most frequent) entry as "count | value".
    print(r,' | ',i)
    break

In [None]:
from collections import defaultdict, Counter
from time import time
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import pandas as pd


from ipynb.fs.full.utils_functions import load_corpus

### Constants

In [None]:
# Number of clusters passed to clustering() as n_clusters.
K_CLUSTERS = 15
# Iteration cap passed to clustering() as n_iter (forwarded to KMeans max_iter).
K_ITERS = 250

# Output text file: one line of comma-separated label words per cluster.
LABELS_PATH = "data/labels_kmeans"
# Saved FastText model loaded for the cluster analysis.
FASTTEXT_MODEL_PATH = 'models/fasttext/fasttext_5_4_50.pkl'
# Pickled feature table read into `data_feature` and fed to KMeans.
DF_PATH = 'data/fasttext/df_from_mean_fasttext_5_4_50.pkl'
# Output pickle holding the cluster id assigned to each sample.
KMEANS_CLUSTER_IDS = 'data/cluster_ids.pkl'

### Clustering function

In [None]:
def clustering(vector, n_clusters, n_iter):
    """Fit a seeded KMeans model and return it with the predicted cluster ids.

    Args:
        vector: Samples handed to ``KMeans.fit`` and ``KMeans.predict``.
        n_clusters: Number of clusters to form.
        n_iter: Forwarded to KMeans as ``max_iter``.

    Returns:
        Tuple ``(fitted_model, cluster_ids)`` where ``cluster_ids`` is the
        result of ``predict`` on the same ``vector`` used for fitting.
    """
    # random_state is pinned so repeated runs give the same partition.
    model = KMeans(n_clusters=n_clusters, max_iter=n_iter, random_state=120)
    model.fit(vector)
    return model, model.predict(vector)

In [None]:
# Load the corpus; `data` is iterated below as a collection of token sequences.
data, original_data = load_corpus()

# Flatten every token sequence into one list and count occurrences per token
# across the whole corpus.
st = [token for lst in data for token in lst]
distribution_words = Counter(st)

### Load DataFrame of the corpus

In [None]:
# One space-joined string per corpus entry (used later to name cluster members).
data_str = list(map(' '.join, data))
# Feature table stored at DF_PATH.
# NOTE(review): rows are presumably aligned with `data` / `data_str` - confirm.
data_feature = pd.read_pickle(DF_PATH)

### Scaled vectors?

# Exploratory scaling experiment.
# NOTE(review): this snippet is not preceded by an `In [...]` marker, so it may
# be a raw/markdown cell that is never executed - confirm before relying on it.
from sklearn import preprocessing
import pandas as pd

print(data_feature.shape)
# NOTE(review): if data_feature is a DataFrame, `[0]` selects the column
# labelled 0 rather than the first row - confirm which one was intended.
row=data_feature[0]
print(data_feature)
# Standardise the selected values with sklearn's preprocessing.scale.
X_scaled = preprocessing.scale(row)
# `xpd` is not referenced anywhere else in the visible notebook.
xpd = pd.DataFrame(data=X_scaled)
print(row)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Min-max scaled copy of the feature table, keeping the original column names.
# The KMeans fits in the visible cells use `data_feature`, not `df_scaled`.
df_scaled = pd.DataFrame(scaler.fit_transform(data_feature), columns=data_feature.columns)


### Elbow method

In [None]:
# Elbow method: fit KMeans once per candidate k and record the inertia
# (sum of squared distances of samples to their closest centre).
sse = []
list_k = list(range(5, 40))

for k in list_k:
    # Seeded with the same value clustering() uses; without a seed every run
    # of this cell draws a different curve.
    km = KMeans(n_clusters=k, random_state=120)
    km.fit(data_feature)
    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse, '-o')
# matplotlib does not interpret markdown asterisks; mathtext ($...$) gives the italic k.
plt.xlabel(r'Number of clusters $k$')
# Trailing semicolon keeps the notebook from echoing the returned Text object.
plt.ylabel('Sum of squared distance');

In [None]:
# Fit the final model and get one cluster id per row of data_feature.
kmeans, cluster_ids = clustering(vector=data_feature,
                                 n_clusters=K_CLUSTERS,
                                 n_iter=K_ITERS)

# cluster id -> set of distinct corpus strings assigned to that cluster.
cluster = defaultdict(set)

# Pair every cluster id with the string at the same position in data_str.
c = list(zip(cluster_ids, data_str))
for idc, word in c:
    cluster[idc].add(word)

In [None]:
def bench_k_means(estimator, name, data, labels=None, sample_size=None):
    """Fit ``estimator`` on ``data`` and print one tab-separated line of scores.

    The printed columns are: name, fit time, inertia, homogeneity,
    completeness, V-measure, adjusted Rand index, adjusted mutual information
    and silhouette score.

    Args:
        estimator: Clustering estimator exposing ``fit``, ``inertia_`` and
            ``labels_`` (e.g. a KMeans instance).
        name: Short tag printed in the first column.
        data: Samples to cluster; also used for the silhouette score.
        labels: Optional ground-truth class per sample. When omitted, the five
            metrics that need ground truth are printed as ``nan``.
        sample_size: Optional number of samples used for the silhouette score;
            ``None`` scores every sample.
    """
    # Imported here because the notebook never imports sklearn.metrics, so the
    # original body failed with NameError on the first call.
    from sklearn import metrics

    t0 = time()
    estimator.fit(data)
    elapsed = time() - t0  # time spent fitting only, not scoring

    predicted = estimator.labels_
    supervised_metrics = (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    )
    if labels is None:
        # Keep the column layout stable even without ground truth.
        scores = [float('nan')] * len(supervised_metrics)
    else:
        scores = [score(labels, predicted) for score in supervised_metrics]
    scores.append(metrics.silhouette_score(data, predicted,
                                           metric='euclidean',
                                           sample_size=sample_size))

    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, elapsed, estimator.inertia_, *scores))

### Save cluster_ids to file

In [None]:
# Wrap the per-sample cluster ids in a DataFrame and pickle it to KMEANS_CLUSTER_IDS.
df_cluster_ids = pd.DataFrame(cluster_ids)
df_cluster_ids.to_pickle(path=KMEANS_CLUSTER_IDS)

___

### Cluster Analysis

In [None]:
from gensim.models import FastText
from collections import Counter

# How many top-ranked words are kept per cluster before expansion.
MOST_COMMON = 10
# How many words make up the final label of each cluster.
MOST_COMMON_FOR_LABELS = 5
# How many FastText neighbours (topn) are requested for each top-ranked word.
MOST_SIMILAR = 5

In [None]:
# Load the saved gensim FastText model from FASTTEXT_MODEL_PATH; its
# `wv.most_similar` lookup is used when building the cluster labels.
model = FastText.load(FASTTEXT_MODEL_PATH)

In [None]:
# Build a short list of representative words (a "label") for each cluster.
labels = []
# Visit clusters in ascending id order so the i-th entry of `labels` belongs to
# the i-th smallest cluster id; plain dict order is the order in which ids
# first appeared in the data.
for clus in sorted(cluster):
    # Sort the member strings: set iteration order depends on the hash seed
    # and would otherwise decide how ties between equally ranked words break.
    words = [word
             for sentence in sorted(cluster[clus])
             for word in sentence.split(' ')]
    counts = Counter(words)
    # Rank each word by (count inside the cluster) / (count in the corpus).
    # Empty tokens and words with no corpus count are dropped: they cannot be
    # normalised, and a raw count left in place would outrank every ratio.
    relevance = Counter({
        word: count / distribution_words[word]
        for word, count in counts.items()
        if word and distribution_words[word]
    })
    most_common = relevance.most_common(MOST_COMMON)
    print(most_common,'\n\n')

    # Candidate pool: the top words plus each one's nearest FastText neighbours.
    candidates = [word for word, _ in most_common]
    for word, _ in most_common:
        neighbours = model.wv.most_similar(word, topn=MOST_SIMILAR)
        candidates.extend(similar for similar, _ in neighbours)

    # The label is the set of candidates that occur most often in the pool.
    top_candidates = Counter(candidates).most_common(MOST_COMMON_FOR_LABELS)
    labels.append([word for word, _ in top_candidates])

In [None]:
# Display the strings assigned to cluster id 8. `cluster` is a defaultdict,
# so looking up an id that is absent would insert an empty set.
cluster[8]

### Join labels as string and save it to file

In [None]:
# Serialise the labels: one line per entry, words separated by ", ",
# every line terminated by a newline.
st = ''.join(', '.join(x) + '\n' for x in labels)

In [None]:
# Write the label lines to LABELS_PATH. The encoding is set explicitly so that
# non-ASCII words are stored the same way on every platform instead of
# depending on the locale's default encoding.
with open(LABELS_PATH, "w", encoding="utf-8") as f:
    f.write(st)