<a href="https://colab.research.google.com/github/rjnakano/manejo-inventarios-pdg/blob/main/Notebooks/Silueta_bootstrapV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# pip install --upgrade kmodes

Requirement already up-to-date: kmodes in /usr/local/lib/python3.7/dist-packages (0.11.0)


In [1]:
import pathlib
import pandas as pd
import numpy as np
from pylab import *

from sklearn.preprocessing import StandardScaler

from kmodes.kprototypes import KPrototypes

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

from plotnine import *
import plotnine

In [2]:
# Keyword arguments bundled for later plotting calls
# (presumably matplotlib scatter-style calls - confirm at the use site).
plot_kwds = dict(alpha=0.5, s=1, linewidths=0)

In [3]:
# Choose the data directory according to where the notebook is running.
# Colab is detected by whether the `google.colab` package can be imported,
# instead of searching the working directory for the substring 'content':
# that check also matched unrelated local paths and then failed on the import.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: read from the repository's local data folder.
    sourcepath = '../Datos/Transformed'
else:
    # On Colab: mount Google Drive and read from the Drive data folder.
    drive.mount('/content/gdrive')
    sourcepath = '/content/gdrive/MyDrive/TDGdata'
print(sourcepath)


../Datos/Transformed


In [4]:
# Inventory dataset without the products' physical dimensions
data1 = pd.read_csv(
    filepath_or_buffer="{}/dfNoSupervisado_fixed.csv".format(sourcepath)
)

In [8]:
# Store the aisle code as a string so it is handled as a categorical label,
# and discard the 'Unnamed: 0' column (presumably the index that was written
# out when the CSV was saved - confirm against the export step).
# Rebinding instead of mutating in place, and tolerating an already-missing
# column, keeps this cell safe to re-run.
data1 = (
    data1
    .drop(columns='Unnamed: 0', errors='ignore')
    .astype({'AISLE': str})
)

In [9]:
# Inspect the column dtypes of the cleaned frame.
data1.dtypes

AISLE        object
IG           object
HTS          object
PLANNING     object
ABC          object
UOM          object
CAT          object
COSTX       float64
QTYVAR      float64
dtype: object

In [67]:
# Preview the first five rows.
data1.head(5)

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
0,17,IG56,HTS119,1/1,B,PC,CAT17,-0.029976,-0.037185
1,17,IG56,HTS119,1/1,B,PC,CAT17,0.092472,-0.006928
2,17,IG56,HTS119,1/1,C,PC,CAT17,0.26613,0.02045
3,17,IG56,HTS119,1/1,B,PC,CAT17,-0.025112,-0.044259
4,15,IG56,HTS119,1/1,C,PC,CAT17,-0.077982,-0.046471


In [69]:
# scikit-learn bootstrap
from sklearn.utils import resample

In [12]:
# Distance matrix: read from a headerless CSV and kept as a NumPy array
dm = pd.read_csv("{}/scores_fixed.csv".format(sourcepath), header=None).to_numpy()

In [70]:
# Confirm the distance matrix is held as a NumPy array.
type(dm)

numpy.ndarray

In [162]:
import numba as nb

def get_scores(idx, dm, N):
    """Build the N x N distance matrix for the rows selected by ``idx``.

    Allocates a fresh matrix with ``set_scores`` and hands it, together with
    the arguments, to ``generate_distance`` to be filled.

    Parameters:
        idx: sequence of row indices into ``dm`` (one per sampled row).
        dm: full distance matrix to look distances up in.
        N: number of sampled rows, i.e. the side length of the result.

    Returns:
        The matrix returned by ``generate_distance``.
    """
    blank = set_scores(N)
    filled = generate_distance(idx, dm, N, blank)
    return filled

def set_scores(N):
    """Return an N x N float64 matrix of zeros.

    The matrix is allocated directly with ``np.zeros`` rather than by building
    an N x N Python list of lists first, which was slow and memory-hungry for
    large N and produced a 1-D array of shape (0,) when N was 0.

    Parameters:
        N: side length of the square matrix (non-negative integer).

    Returns:
        ``numpy.ndarray`` of shape (N, N) and dtype float64, filled with 0.0.
    """
    return np.zeros((N, N), dtype=np.float64)

@nb.njit
def generate_distance(idx, dm, N, scores):
    """Fill ``scores`` with the distances between the rows listed in ``idx``.

    For every pair ``row < col`` the value ``dm[idx[row]][idx[col]]`` is
    written to ``scores[row][col]`` and mirrored to ``scores[col][row]``.
    Diagonal entries are never written, so they keep whatever value the
    caller put there.

    Parameters:
        idx: sequence of row indices into ``dm``; at least N entries are read.
        dm: distance matrix that is indexed with the values of ``idx``.
        N: number of rows/columns of ``scores`` to fill.
        scores: pre-allocated matrix that is modified in place.

    Returns:
        The same ``scores`` object, after filling.
    """
    for row in range(N):
        src_row = idx[row]
        for col in range(row + 1, N):
            dist = dm[src_row][idx[col]]
            scores[row][col] = dist
            # Mirror into the lower triangle
            scores[col][row] = dist
    return scores

In [163]:
# Warm-up: call get_scores once on a tiny 5-row bootstrap sample so that the
# numba-jitted generate_distance is compiled before the full-size runs.
boot = resample(data1, replace=True, n_samples=5, random_state=1)
%time scores = get_scores(boot.index.to_numpy(),dm,5)
scores

Wall time: 335 ms


array([[0.        , 2.50001447, 3.1198955 , 3.55266256, 3.18729462],
       [2.50001447, 0.        , 2.12192583, 3.05702773, 2.68987114],
       [3.1198955 , 2.12192583, 0.        , 2.65774094, 2.00754673],
       [3.55266256, 3.05702773, 2.65774094, 0.        , 2.59655895],
       [3.18729462, 2.68987114, 2.00754673, 2.59655895, 0.        ]])

In [164]:

# Silhouette-coefficient experiment settings for K = 2 and 3
kValues = [2, 3]
gamma = 0.5
nBootstrap = 100
# Zero-filled silhouette table: one row per K, with nBootstrap + 1 columns
# (column 0 is used for the original dataset, columns 1..nBootstrap for the
# bootstrap replicates)
silueta = [[0] * (nBootstrap + 1) for _ in kValues]
# Positional indices of the object-dtype (categorical) columns
catColumnsPos = [data1.columns.get_loc(name) for name in data1.select_dtypes('object').columns]
N = data1.shape[0]


In [165]:
from sklearn.metrics import silhouette_score
import time
start_time = time.time()
for k in kValues:
  print('Dataset original, k = ', k)
  kprototype = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 0)
  kprototype.fit_predict(data1, categorical = catColumnsPos)
  print("--- %s seconds ---" % (time.time() - start_time))
  silueta[k-min(kValues)][0] = silhouette_score(dm, kprototype.labels_, metric='precomputed')
  print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
  print(silueta[k-min(kValues)][0])

for b in range(1,nBootstrap+1):
  print('bootstrap # ',b);
  boot = resample(data1, replace=True, n_samples=N, random_state=b)
  %time scores = get_scores(boot.index.to_numpy(),dm,N)
  for k in kValues:
    print('Bootstrap, k = ', k)
    kprototypebs = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 1)
    kprototypebs.fit_predict(boot, categorical = catColumnsPos)
    print("--- %s seconds ---" % (time.time() - start_time));
    silueta[k-min(kValues)][b] = silhouette_score(scores, kprototypebs.labels_, metric='precomputed')
    print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
    print(silueta[k-min(kValues)][b])

Dataset original, k =  2
--- 23.36100673675537 seconds ---
Coeficiente silueta --- 27.1089084148407 seconds ---
0.7368900959494543
Dataset original, k =  3
--- 51.971904039382935 seconds ---
Coeficiente silueta --- 55.70210289955139 seconds ---
0.7187617117905637
bootstrap #  1
Wall time: 10.8 s
Bootstrap, k =  2
--- 80.63299107551575 seconds ---
Coeficiente silueta --- 82.06686162948608 seconds ---
0.785403719753729
Bootstrap, k =  3
--- 109.46121096611023 seconds ---
Coeficiente silueta --- 110.88413977622986 seconds ---
0.8004287100371626
bootstrap #  2
Wall time: 10.8 s
Bootstrap, k =  2
--- 138.17328190803528 seconds ---
Coeficiente silueta --- 139.5953242778778 seconds ---
0.7482729047623066
Bootstrap, k =  3
--- 162.19521760940552 seconds ---
Coeficiente silueta --- 163.40697646141052 seconds ---
0.060572932085086593
bootstrap #  3
Wall time: 10.8 s
Bootstrap, k =  2
--- 189.0046558380127 seconds ---
Coeficiente silueta --- 190.43293976783752 seconds ---
0.7833639734870986
Boots



--- 5639.808018922806 seconds ---
Coeficiente silueta --- 5641.225229978561 seconds ---
0.7508434857975423
Bootstrap, k =  3
--- 5660.868723869324 seconds ---
Coeficiente silueta --- 5662.288898944855 seconds ---
0.7699123974458983
bootstrap #  89
Wall time: 10.9 s
Bootstrap, k =  2
--- 5691.605539798737 seconds ---
Coeficiente silueta --- 5693.027708292007 seconds ---
0.7547151519859796
Bootstrap, k =  3
--- 5714.368524551392 seconds ---
Coeficiente silueta --- 5715.543384313583 seconds ---
0.08574052858872579
bootstrap #  90
Wall time: 10.8 s
Bootstrap, k =  2
--- 5741.6545560359955 seconds ---
Coeficiente silueta --- 5742.841382026672 seconds ---
0.07268186403910713
Bootstrap, k =  3
--- 5765.902525424957 seconds ---
Coeficiente silueta --- 5767.104283094406 seconds ---
0.08408826995279198
bootstrap #  91
Wall time: 10.8 s
Bootstrap, k =  2
--- 5796.41894030571 seconds ---
Coeficiente silueta --- 5797.833154439926 seconds ---
0.69197080412929
Bootstrap, k =  3
--- 5817.62619304657 s

In [166]:
# Median bootstrap silhouette per k. Column 0 (the original-data score) is
# excluded by slicing from 1. Rows are addressed by position in kValues so
# non-consecutive k values work, and the median comes explicitly from NumPy
# instead of a name pulled in by a star import.
for row, k in enumerate(kValues):
  print('Mediana para k = ', k)
  print(np.median(silueta[row][1:nBootstrap+1]))

Mediana para k =  2
0.7552700048506455
Mediana para k =  3
0.09700181894374368


In [170]:
# Wrap the silhouette table in a DataFrame (one row per k at this point).
silueta_df = pd.DataFrame(data=silueta)
# Disabled export of the raw table to Google Drive, kept for reference:
#np.savetxt("/content/gdrive/MyDrive/TDGdata/siluetaboot.csv", silueta, delimiter=",")

In [173]:
# Reshape to one row per run (row 0 = original data, rows 1.. = bootstrap
# replicates) and one column per k.
# The frame is rebuilt from `silueta` rather than transposed in place, so
# re-running this cell no longer flips the table back and breaks the column
# assignment. Column names follow kValues ('k2', 'k3' for the current
# settings) instead of being hard-coded.
silueta_df = pd.DataFrame(silueta).T
silueta_df.columns = ['k{}'.format(k) for k in kValues]

In [177]:
# Summary statistics over the bootstrap rows only (row 0 is skipped).
silueta_df.iloc[1:].describe()

Unnamed: 0,k2,k3
count,100.0,100.0
mean,0.746529,0.276295
std,0.168257,0.307544
min,0.064186,0.060573
25%,0.735904,0.084205
50%,0.75527,0.097002
75%,0.783549,0.764586
max,0.998792,0.802114


In [175]:
# Save the silhouette table as CSV under the data directory.
silueta_df.to_csv(path_or_buf='{}/silueta_fixed_data.csv'.format(sourcepath))

In [182]:

# Median of the k2 column, skipping row 0.
print('Mediana k2 = {}'.format(np.median(silueta_df['k2'][1:])))

Mediana k2 = 0.7552700048506455


In [180]:
# Median of the k3 column, skipping row 0.
np.median(silueta_df['k3'][1:])

0.09700181894374368

In [188]:
# Report the median of every column, skipping row 0 in each.
for col in silueta_df.columns:
    col_median = np.median(silueta_df[col][1:])
    print('Mediana {} = {}'.format(col, col_median))

Mediana k2 = 0.7552700048506455
Mediana k3 = 0.09700181894374368
