<a href="https://colab.research.google.com/github/rjnakano/manejo-inventarios-pdg/blob/main/Notebooks/Silueta_bootstrapV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [97]:
# pip install --upgrade kmodes
# !pip install plotnine
# !pip install google.colab

In [98]:
import pathlib
import pandas as pd
import numpy as np
from pylab import *

from sklearn.preprocessing import StandardScaler

from kmodes.kprototypes import KPrototypes

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

from plotnine import *
import plotnine

In [6]:
plot_kwds = {'alpha' : 0.5, 's' : 1, 'linewidths':0}

In [9]:
if str(pathlib.Path().absolute()).find('content') >= 0:
    from google.colab import drive
    drive.mount('/content/gdrive')
    sourcepath = '/content/gdrive/MyDrive/TDGdata'
else:
    sourcepath = '../Datos/Transformed'
print(sourcepath) 

../Datos/Transformed


In [99]:
# Dataset de inventarios sin dimensiones de los productos
data1 = pd.read_csv("{}/df042421.csv".format(sourcepath))

In [100]:
data1['AISLE']=data1['AISLE'].astype(str)

count    1.624400e+04
mean    -2.652209e-16
std      1.000031e+00
min     -1.438117e-01
25%     -8.904898e-02
50%     -3.557862e-02
75%     -2.920971e-02
max      8.095984e+01
Name: QTYVAR, dtype: float64

In [15]:
data1.dtypes

AISLE        object
IG           object
HTS          object
PLANNING     object
ABC          object
UOM          object
CAT          object
COSTX       float64
QTYVAR      float64
dtype: object

In [16]:
data1.head(5)

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
0,17,IG56,HTS119,1/1,B,PC,CAT17,-0.029671,-0.037167
1,17,IG56,HTS119,1/1,B,PC,CAT17,0.092655,-0.006905
2,17,IG56,HTS119,1/1,C,PC,CAT17,0.266138,0.020477
3,17,IG56,HTS119,1/1,B,PC,CAT17,-0.024812,-0.044242
4,15,IG56,HTS119,1/1,C,PC,CAT17,-0.077628,-0.046455


In [17]:
# scikit-learn bootstrap
from sklearn.utils import resample

In [21]:
# Matriz de distancias
dm = pd.read_csv("{}/distances.csv".format(sourcepath))
dm=dm.to_numpy()

In [77]:
import numba as nb
@nb.njit
def dist_num_njit(data):
    scores = np.zeros((len(data),len(data)), dtype=np.float64)
    valor = nb.float64(0)

    for i in range(len(data)):
        for j in range(i+1, len(data)):
            valor = nb.float64(np.sum((data[i] - data[j]) ** 2))
            scores[i][j] = valor
    return scores

def dist_cat(data, catWeight):
    scores = np.zeros((len(data),len(data)), dtype=np.float64)
    for i in range(len(data)):
        for j in range(i+1, len(data)):
            scores[i][j] = np.float64(np.sum(data[i]!=data[j])*catWeight)
    return scores

@nb.njit
def traspose_njit (data):
    scores = data
    for i in range(len(data)):
        for j in range(1+i,len(data)):
            scores[j][i] = scores[i][j]
    return scores 

first_run = True # Se utiliza para precompilar los algoritmos mejorados con numba

In [78]:
# Coeficiente silueta para K 2-4
kValues = [3, 4]
gamma = 0.5
nBootstrap = 20
# Inicializamos la matriz de coeficientes silueta con ceros
silueta = [ [ 0 for j in range(nBootstrap + 1) ] for i in range(len(kValues)) ]
catColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('object').columns)]
numColumnsPos = [data1.columns.get_loc(col) for col in list(data1.select_dtypes('float64').columns)]
N = data1.shape[0]

In [79]:
from sklearn.metrics import silhouette_score
import time
start_time = time.time()
for k in kValues:
  print('k = ', k)
  kprototype = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 0)
  kprototype.fit_predict(data1, categorical = catColumnsPos)
  print("--- %s seconds ---" % (time.time() - start_time))
  silueta[k-min(kValues)][0] = silhouette_score(dm, kprototype.labels_, metric='precomputed')
  print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
  print(silueta[k-min(kValues)][0])
  for b in range(1,nBootstrap+1):
    print('bootstrap # ',b);
    boot = resample(data1, replace=True, n_samples=data1.shape[0], random_state=b)
    # Fit the cluster
    kprototypebs = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 1)
    kprototypebs.fit_predict(boot, categorical = catColumnsPos)
    print("--- %s seconds ---" % (time.time() - start_time));
    
    # boot dividido en atributos numericas y categoricos
    dataNum=boot.iloc[:,numColumnsPos]
    dataNum=dataNum.to_numpy()
    dataCat=boot.iloc[:,catColumnsPos]
    dataCat=dataCat.to_numpy()
    
    # Precompila el algoritmo de numba
    if first_run == True:
        print('Precompilar calculador de distancias')
        %time dist_num_np = dist_num_njit(dataNum[range(10)])
        %time dist_cat_np = dist_cat(dataCat[range(10)], gamma)
        %time scores = np.sum([dist_cat_np, dist_num_np], axis=0)
        %time scores = traspose_njit(scores)
        first_run = False
    
    print('Cálculo de distancias bootstrap {} con {} clusters'.format(b,k))
    %time dist_num_np = dist_num_njit(dataNum[range(N)])
    %time dist_cat_np = dist_cat(dataCat[range(N)], gamma)
    %time scores = np.sum([dist_cat_np, dist_num_np], axis=0)
    %time scores = traspose_njit(scores)
    
    
#     # Inicializamos la matriz de distancias con ceros
#     scores = [ [ 0 for i in range(N) ] for j in range(N) ]
#     # Calculamos la mitad de la matriz distancias
#     for i in range(N):
#       for j in range(i+1,N):
#         scores[i][j]=np.sum((dataNum[i] - dataNum[j]) ** 2) + gamma * np.sum(dataCat[i]!=dataCat[j])
#     print("Matriz de distancias --- %s seconds ---" % (time.time() - start_time))
#     # Completamos la matriz
#     for i in range(0,N):
#       for j in range(0,i):
#         scores[i][j]=scores[j][i]
    print("Espejo --- %s seconds ---" % (time.time() - start_time))
    silueta[k-min(kValues)][b] = silhouette_score(scores, kprototypebs.labels_, metric='precomputed')
    print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
    print(silueta[k-min(kValues)][b])

k =  3
--- 21.467751264572144 seconds ---
Coeficiente silueta --- 25.111001014709473 seconds ---
0.7182138218409522
bootstrap #  1
--- 49.28831100463867 seconds ---
Precompilar calculador de distancias
Wall time: 357 ms
Wall time: 0 ns
Wall time: 0 ns
Wall time: 93.4 ms
Cálculo de distancias bootstrap 1 con 3 clusters
Wall time: 15.7 s
Wall time: 16min 48s
Wall time: 2.12 s
Wall time: 994 ms
Espejo --- 1077.578875541687 seconds ---
Coeficiente silueta --- 1078.7569379806519 seconds ---
0.09330388061499972
bootstrap #  2
--- 1103.2861075401306 seconds ---
Cálculo de distancias bootstrap 2 con 3 clusters
Wall time: 15.6 s
Wall time: 16min 41s
Wall time: 2.29 s
Wall time: 987 ms
Espejo --- 2124.099014043808 seconds ---
Coeficiente silueta --- 2125.2709085941315 seconds ---
0.0795551613919883
bootstrap #  3
--- 2155.676869869232 seconds ---
Cálculo de distancias bootstrap 3 con 3 clusters
Wall time: 16.1 s
Wall time: 16min 59s
Wall time: 2.19 s
Wall time: 995 ms
Espejo --- 3194.59007263183

In [34]:
# data1['Cluster'] = kprototype.labels_

In [80]:
silueta

[[0.7182138218409522,
  0.09330388061499972,
  0.0795551613919883,
  0.08579445355521252,
  0.7848836886437416,
  0.7580501112931499,
  0.09604875554522227,
  0.08272228147006884,
  0.08775924380328577,
  0.1008091939490596,
  0.10999866521757545,
  0.12736999263905782,
  0.08529892407285644,
  0.10477446840618314,
  0.7925131592452925,
  0.08379381515543731,
  0.7799059521528896,
  0.1250009241378602,
  0.07779559025875957,
  0.08067551807392272,
  0.08299601690496457],
 [0.7507712291842021,
  0.123667752437498,
  0.7566801991889119,
  0.7449492595601183,
  0.773342891525885,
  0.7369201592353902,
  0.7501565513476068,
  0.7134535728495904,
  0.7723386443651569,
  0.7503756482354623,
  0.7426714582364835,
  0.7760128588092476,
  0.11629638724585832,
  0.12512664791436034,
  0.7802855781172742,
  0.1404263553693604,
  0.7481649464247122,
  0.11307373834995871,
  0.744130313013626,
  0.7652018924137288,
  0.13717480315670955]]

In [81]:
np.savetxt('{}/silueta.csv'.format(sourcepath),silueta,delimiter=',')

In [82]:
for i in kValues:
  print('Mediana para k = ',i);
  print(median(silueta[i-min(kValues)][1:nBootstrap+1]));

Mediana para k =  3
0.094676318080111
Mediana para k =  4
0.7445397862868721


In [83]:
silueta = pd.DataFrame(silueta)

In [96]:
silueta_transposed = silueta.T[1:]
silueta_transposed.columns = kValues
silueta_transposed.describe()

Unnamed: 0,3,4
count,20.0,20.0
mean,0.230952,0.565522
std,0.281465,0.295675
min,0.077796,0.113074
25%,0.083594,0.139613
50%,0.094676,0.74454
75%,0.125593,0.758811
max,0.792513,0.780286
