# Import of libraries

In [1]:
import pandas as pd

In [2]:
from sklearn import cluster as ct

In [3]:
from matplotlib import pyplot as plt

In [4]:
from sklearn import metrics as mt

# Function definition

In [5]:
def show_table(list_names_algorithm, list_names_metrics, list_values_metrics):

    """
    Build a comparison table with one row per algorithm and one column per metric.

    Args:
        list_names_algorithm (list): Names of the algorithms, one per table row.
        list_names_metrics (list): Names of the metrics, one per table column.
        list_values_metrics (list): List of lists with the resulting metric values.
                           It must hold one inner list per entry of list_names_metrics
                           (in the same order), and every inner list must hold one
                           value per entry of list_names_algorithm.

    Returns:
        A pandas DataFrame with an 'Algorithm Name' column followed by one column
        per metric, or None (after printing a message) when the sizes of the
        arguments do not match.

    Example:
        algorithm = ['A', 'B', 'C']
        metrics = ['aa', 'bb']
        values = [[1, 2, 3], [4, 5, 6]]
        show_table(algorithm, metrics, values)
    """

    # The first column identifies the row; the metric columns are added after it
    tab = {'Algorithm Name': list_names_algorithm}

    try:
        # Pair each metric name with its values by position. enumerate() replaces
        # the repeated list.index() scan, which was quadratic and mapped a
        # repeated metric name to the first value list only.
        for position, metric_name in enumerate(list_names_metrics):
            tab[metric_name] = list_values_metrics[position]

        # pandas raises ValueError when the columns have different lengths
        df_tab = pd.DataFrame(tab)

    except (ValueError, IndexError):
        # IndexError: fewer value lists than metric names were supplied
        print('Unable to execute this command! Check the size and type of the third parameter.')
        return None

    return df_tab

# Loading data

In [6]:
# Read the dataset used by every clustering cell below into a DataFrame
x = pd.read_csv(filepath_or_buffer='../Datasets/ensaio_clusterizacao/X_dataset.csv')

In [None]:
# Scatter plot of the first column of the data against the second column
plt.scatter(x=x.iloc[:, 0], y=x.iloc[:, 1])

# Training unsupervised learning algorithms

### K-Means

In [17]:
# K-Means estimator configured for 6 clusters (not fitted yet)
model_kmeans = ct.KMeans(
    n_init='auto',
    n_clusters=6,
)

In [18]:
# Fit the estimator on the data; the result of fit() is kept as `kmeans`
kmeans = model_kmeans.fit(X=x)

#### Obtendo o melhor parâmetro K

In [None]:
# Silhouette score for each candidate number of clusters

# Candidate numbers of clusters
n_clusters = [*range(2, 100)]

# Silhouette score obtained for each candidate, in the same order
list_ss = []

# NOTE: `model_kmeans` and `labels` are rebound on every iteration, so after
# the loop they refer to the last candidate evaluated.
for n in n_clusters:

    # Train a model with n clusters
    model_kmeans = ct.KMeans(n_init='auto', n_clusters=n)
    model_kmeans.fit(x)

    # Cluster assigned to every observation
    labels = model_kmeans.predict(x)

    # Score the assignment and record it
    ss = mt.silhouette_score(x, labels)
    list_ss += [ss]

In [None]:
# Silhouette score as a function of the number of clusters
plt.plot(n_clusters, list_ss, marker='o')
plt.xlabel(xlabel='Numero de Clusters')
plt.ylabel(ylabel='Silhouette Score')

In [None]:
# Number of clusters with the highest silhouette score
# (list_ss and n_clusters are aligned position by position)
best_position = list_ss.index(max(list_ss))
n = n_clusters[best_position]
n

In [None]:
# Plot the dataset coloured by cluster, together with the centroids, for the
# best-scoring K-Means configuration.
# After the sweep, `model_kmeans` and `labels` refer to its LAST candidate, not
# the best one. The best number of clusters is therefore looked up in the sweep
# results and a model is refitted for it under separate names.
best_n_clusters = n_clusters[list_ss.index(max(list_ss))]
best_kmeans = ct.KMeans(n_clusters=best_n_clusters, n_init='auto').fit(x)
best_kmeans_labels = best_kmeans.predict(x)

plt.scatter(x.iloc[:, 0], x.iloc[:, 1], c=best_kmeans_labels)

# All centroids are drawn with a single call instead of one call per centroid
plt.scatter(
    best_kmeans.cluster_centers_[:, 0],
    best_kmeans.cluster_centers_[:, 1],
    marker='*',
    c='orange',
    s=160
)

### Affinity Propagation

In [9]:
# Affinity Propagation estimator with a fixed preference of -55 (not fitted yet)
model_af = ct.AffinityPropagation(
    preference=-55,
)

In [10]:
# Fit the estimator on the data; the result of fit() is kept as `af`
af = model_af.fit(X=x)

#### Obtendo o melhor parâmetro Preference

In [None]:
# Silhouette score for each candidate preference value

# Candidate preference values
preference = [*range(-100, -1)]

# Silhouette score obtained for each candidate, in the same order
list_ss = []

# NOTE: `model_af` and `labels` are rebound on every iteration, so after the
# loop they refer to the last candidate evaluated.
for p in preference:

    # Train a model with this preference
    model_af = ct.AffinityPropagation(preference=p)
    model_af.fit(x)

    # Cluster assigned to every observation
    labels = model_af.predict(x)

    # Score the assignment and record it
    ss = mt.silhouette_score(x, labels)
    list_ss += [ss]

In [None]:
# Silhouette score as a function of the preference value
plt.plot(preference, list_ss, marker='o')
plt.xlabel(xlabel='Numero de Preference')
plt.ylabel(ylabel='Silhouette Score')

In [None]:
# Preference value with the highest silhouette score.
# The position in list_ss is mapped back through `preference` (the swept
# values, -100 to -2). Adding a fixed offset of 2 to the position would yield
# a positive number that was never evaluated.
p = preference[list_ss.index(max(list_ss))]
p

In [None]:
# Plot the dataset coloured by cluster, together with the cluster centres, for
# the best-scoring Affinity Propagation configuration.
# After the sweep, `model_af` and `labels` refer to its LAST candidate, not the
# best one. The best preference is therefore looked up in the sweep results and
# a model is refitted for it under separate names.
best_preference = preference[list_ss.index(max(list_ss))]
best_af = ct.AffinityPropagation(preference=best_preference).fit(x)
best_af_labels = best_af.predict(x)

plt.scatter(x.iloc[:, 0], x.iloc[:, 1], c=best_af_labels)

# All cluster centres are drawn with a single call instead of one call per centre
plt.scatter(
    best_af.cluster_centers_[:, 0],
    best_af.cluster_centers_[:, 1],
    marker='*',
    c='orange',
    s=160
)

# Clusterization of observations

### K-Means

In [19]:
# Cluster assigned to every observation by the fitted 6-cluster K-Means model
labels_kmeans = kmeans.predict(X=x)

### Affinity Propagation

In [12]:
# Predict with `af`, the model fitted with preference=-55.
# `model_af` is rebound by the preference sweep, so in a top-to-bottom run it
# would refer to the last sweep candidate instead of the intended model.
labels_af = af.predict(x)

# Performance

In [21]:
# Start an empty list for the final scores shown in the performance table
# (the same name is also used by the sweeps earlier in the notebook)
list_ss = list()

### K-Means

In [22]:
# Silhouette score of the K-Means labelling, recorded for the summary table
ss_kmeans = mt.silhouette_score(X=x, labels=labels_kmeans)
list_ss += [ss_kmeans]
ss_kmeans

0.180299894818958

### Affinity Propagation

In [23]:
# Silhouette score of the Affinity Propagation labelling, recorded for the summary table
ss_af = mt.silhouette_score(X=x, labels=labels_af)
list_ss += [ss_af]
ss_af

0.20118042030593164

## Tabela de Performance sobre os dados

In [24]:
# One row per algorithm, one column with the silhouette scores collected above
show_table(
    list_names_algorithm=['K-Means', 'Affinity Propagation'],
    list_names_metrics=['Silhouette Score'],
    list_values_metrics=[list_ss],
)

Unnamed: 0,Algorithm Name,Silhouette Score
0,K-Means,0.1803
1,Affinity Propagation,0.20118
