# In [11]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def mean_movs_by_hour(df):
    """Return the mean per-date total of inputs/outputs for each station and period.

    The aggregation runs in two steps:

    1. Sum ``Inputs`` and ``Outputs`` for every ``(Date, Period, Id_station)``
       combination.
    2. Average those sums over the dates, per ``(Id_station, Period)``.

    Args:
        df: DataFrame with the columns ``Date``, ``Period``, ``Id_station``,
            ``Inputs`` and ``Outputs``.

    Returns:
        DataFrame with the columns ``Id_station``, ``Period``, ``Inputs`` and
        ``Outputs``, one row per ``(Id_station, Period)`` pair.
    """
    value_cols = ['Inputs', 'Outputs']

    # Select the value columns with a list: indexing a groupby with a bare
    # tuple (``[...]['Inputs', 'Outputs']``) is rejected by pandas >= 2.0.
    totals = df.groupby(
        ['Date', 'Period', 'Id_station'], as_index=False)[value_cols].sum()

    return totals.groupby(
        ['Id_station', 'Period'], as_index=False)[value_cols].mean()

def scale_data(df):
    """Standardize the ``Inputs`` and ``Outputs`` columns, indexed by station.

    Args:
        df: DataFrame containing at least the columns ``Id_station``,
            ``Inputs`` and ``Outputs``.

    Returns:
        DataFrame indexed by ``Id_station`` whose ``Inputs`` and ``Outputs``
        columns have been centered and scaled column-wise by
        ``sklearn.preprocessing.scale``.
    """
    # Keep only the clustering features and key them by station id.
    features = df[['Id_station', 'Inputs', 'Outputs']].set_index('Id_station')

    # axis=0 standardizes each column (feature) independently.
    standardized = scale(X=features, axis=0, with_mean=True, with_std=True)

    # scale() returns a plain array; restore the labels.
    return pd.DataFrame(
        standardized,
        columns=features.columns,
        index=features.index,
    )

def n_clusters(df, max_clusters=14):
    """Plot the K-means inertia for 1..``max_clusters`` clusters (elbow plot).

    A KMeans model is fitted for every candidate cluster count and its
    ``inertia_`` is plotted against the count, so the number of clusters can
    be picked visually.

    Args:
        df: Feature matrix (e.g. the output of ``scale_data``) to cluster.
        max_clusters: Largest number of clusters to try. Defaults to 14, the
            upper bound that was previously hard-coded.

    Raises:
        ValueError: If ``df`` has no rows or ``max_clusters`` is below 1.
    """
    n_samples = len(df)
    if n_samples == 0:
        raise ValueError("n_clusters() needs at least one sample")
    if max_clusters < 1:
        raise ValueError("max_clusters must be >= 1")

    # KMeans refuses n_clusters > n_samples, so cap the sweep; small subsets
    # (e.g. a single hour with few stations) would otherwise crash mid-loop.
    candidate_counts = range(1, min(max_clusters, n_samples) + 1)

    inertias = [
        KMeans(n_clusters=k, n_init=20, random_state=123).fit(df).inertia_
        for k in candidate_counts
    ]

    _, ax = plt.subplots(1, 1, figsize=(6, 3.84))
    ax.plot(candidate_counts, inertias, marker='o')
    ax.set_title("Evolución de la varianza intra-cluster total")
    ax.set_xlabel('Número clusters')
    ax.set_ylabel('Intra-cluster (inertia)')
    
def agrupar(n, df):
    """Fit K-means with ``n`` clusters on ``df`` and return the fitted labels.

    Args:
        n: Number of clusters.
        df: Feature matrix to cluster.

    Returns:
        The ``labels_`` attribute of the fitted KMeans model.
    """
    # fit() returns the estimator itself, so the call can be chained.
    fitted = KMeans(n_clusters=n, n_init=25, random_state=123).fit(X=df)
    return fitted.labels_

def see_groups(n, data, df):
    """Cluster ``data`` into ``n`` groups and plot mean inputs/outputs per group.

    Args:
        n: Number of clusters passed to ``agrupar``.
        data: Feature matrix used for clustering; must have one row per row
            of ``df``, in the same order, because the labels are assigned to
            ``df`` positionally.
        df: DataFrame with ``Inputs`` and ``Outputs`` columns. It is modified
            in place: a ``Group`` column holding the cluster labels is added.

    Returns:
        Tuple ``(df, hour_bygroup)`` where ``df`` is the labelled input frame
        and ``hour_bygroup`` holds the mean ``Inputs``/``Outputs`` per group.
    """
    df['Group'] = agrupar(n, data)

    # Select the value columns with a list: indexing a groupby with a bare
    # tuple (``[...]['Inputs', 'Outputs']``) is rejected by pandas >= 2.0.
    hour_bygroup = df.groupby(
        ['Group'], as_index=False)[['Inputs', 'Outputs']].mean()

    # Position the bars from the groups actually present, so the x positions
    # always match the bar heights even if fewer than n labels were produced.
    x = np.arange(len(hour_bygroup))
    width = 0.25

    plt.figure(figsize=(15, 8))
    plt.bar(x, hour_bygroup['Inputs'], width=width, label='Inputs')
    plt.bar(x + width, hour_bygroup['Outputs'], width=width, label='Outputs')

    plt.xticks(x, hour_bygroup['Group'])
    plt.legend()
    plt.title('Mean inputs/outputs by group')

    return (df, hour_bygroup)

def cluster_for_hour(hour, df):
    """Select the rows for one hour, scale them and draw the elbow plot.

    Args:
        hour: Value compared against the ``Hour`` column of ``df``.
        df: DataFrame with an ``Hour`` column plus the columns required by
            ``scale_data``.

    Returns:
        Tuple ``(rows, scaled)``: the rows of ``df`` whose ``Hour`` equals
        ``hour``, and the result of ``scale_data`` on those rows.
    """
    matches_hour = df['Hour'] == hour
    rows_for_hour = df[matches_hour]

    scaled = scale_data(rows_for_hour)

    # Called for its plot only; it returns nothing.
    n_clusters(scaled)

    return rows_for_hour, scaled