<a href="https://colab.research.google.com/github/rjnakano/manejo-inventarios-pdg/blob/main/Notebooks/Silueta_bootstrapV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# pip install --upgrade kmodes

Requirement already up-to-date: kmodes in /usr/local/lib/python3.7/dist-packages (0.11.0)


In [1]:
import pathlib
import pandas as pd
import numpy as np
from pylab import *

from sklearn.preprocessing import StandardScaler

from kmodes.kprototypes import KPrototypes

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx

import seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

from plotnine import *
import plotnine

In [2]:
# Keyword arguments for plotting calls: half-transparent, size-1 markers with
# no outline. NOTE(review): not referenced in the visible cells - presumably
# meant for scatter plots; confirm before removing.
plot_kwds = dict(alpha=0.5, s=1, linewidths=0)

In [3]:
# Pick the data directory depending on where the notebook runs.
# Colab is detected by trying to import google.colab instead of searching the
# working directory for the substring 'content': that search also matched any
# local path such as '.../contents/...' and then failed on the Colab import.
try:
    from google.colab import drive
except ImportError:
    # Local run: data lives next to the notebooks folder.
    sourcepath = '../Datos/Transformed'
else:
    # Colab run: mount Google Drive and read the data from there.
    drive.mount('/content/gdrive')
    sourcepath = '/content/gdrive/MyDrive/TDGdata'
print(sourcepath)


../Datos/Transformed


In [4]:
# Inventory dataset without the products' dimensions
data1 = pd.read_csv(f"{sourcepath}/dfNoSupervisado_fixed.csv")

In [8]:
# Store AISLE as strings so that select_dtypes('object') treats it as a
# categorical column rather than a numeric one.
data1['AISLE'] = data1['AISLE'].astype(str)
# Remove the 'Unnamed: 0' column (presumably the index written by an earlier
# to_csv - confirm). Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell re-runnable: the original raised KeyError
# on a second execution because the column was already gone.
data1 = data1.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
data1.dtypes

AISLE        object
IG           object
HTS          object
PLANNING     object
ABC          object
UOM          object
CAT          object
COSTX       float64
QTYVAR      float64
dtype: object

In [67]:
data1.head(5)

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
0,17,IG56,HTS119,1/1,B,PC,CAT17,-0.029976,-0.037185
1,17,IG56,HTS119,1/1,B,PC,CAT17,0.092472,-0.006928
2,17,IG56,HTS119,1/1,C,PC,CAT17,0.26613,0.02045
3,17,IG56,HTS119,1/1,B,PC,CAT17,-0.025112,-0.044259
4,15,IG56,HTS119,1/1,C,PC,CAT17,-0.077982,-0.046471


In [69]:
# scikit-learn bootstrap
from sklearn.utils import resample

In [12]:
# Precomputed pairwise distance matrix (the CSV has no header row)
dm = pd.read_csv(f"{sourcepath}/scores_fixed.csv", header=None).to_numpy()
# dm is indexed by row position on both axes and is handed to
# silhouette_score(metric='precomputed'), so it has to be square. Fail here
# with a clear message rather than deep inside the experiment loop.
if dm.shape[0] != dm.shape[1]:
    raise ValueError(
        "scores_fixed.csv must contain a square distance matrix, got shape {}".format(dm.shape)
    )

In [70]:
type(dm)

numpy.ndarray

In [77]:
data1.iloc[[3]].index[0]

3

In [86]:
data1.index[2]

2

In [84]:
type(resample(data1, replace=True, n_samples=5, random_state=2))

pandas.core.frame.DataFrame

In [80]:
# Distance between the rows at positions 1 and 3 of data1, looked up in dm
# through their index labels.
dm[data1.index[1], data1.index[3]]

0.015219723329571548

In [94]:

# df = pd.DataFrame([[0]*5]*5)
# for i in range(5):
#       for j in range(i+1,5):
#         df[i][j]=dm[df[i]][df[j]]

In [146]:
import numba as nb

def get_scores(idx, dm, N):
    """Return the N x N distance matrix of a resampled dataset.

    Parameters
    ----------
    idx : sequence of int
        For each of the N sampled rows, its row/column position in ``dm``.
    dm : 2-D array
        Full pairwise distance matrix of the original dataset.
    N : int
        Number of sampled rows (size of the returned matrix).

    Returns
    -------
    numpy.ndarray
        Zero-initialised matrix from ``set_scores`` filled in by
        ``generate_distance``.
    """
    return generate_distance(idx, dm, N, set_scores(N))

def set_scores(N):
    """Allocate an N x N float64 matrix filled with zeros.

    Uses ``np.zeros`` directly instead of building an N x N Python list of
    lists and converting it, which walked N*N Python objects only to produce
    the same array. It also returns a proper (0, 0) matrix for ``N == 0``
    (the list-based version produced a 1-D empty array).

    Parameters
    ----------
    N : int
        Number of rows and columns.

    Returns
    -------
    numpy.ndarray
        Array of shape (N, N) and dtype float64, all zeros.
    """
    return np.zeros((N, N), dtype=np.float64)

@nb.njit
def generate_distance(idx, dm, N, scores):
    """Fill ``scores`` with the distances between the rows selected by ``idx``.

    For every pair of positions ``row < col`` the entry is copied from
    ``dm[idx[row], idx[col]]`` and then mirrored below the diagonal, so the
    result is symmetric. The diagonal is never written: it keeps whatever
    values ``scores`` had on entry.

    Parameters
    ----------
    idx : 1-D integer array
        Positions into ``dm`` for each of the N selected rows.
    dm : 2-D array
        Source distance matrix.
    N : int
        Number of selected rows; ``scores`` must be at least N x N.
    scores : 2-D array
        Output buffer, modified in place.

    Returns
    -------
    The same ``scores`` array that was passed in.
    """
    # Upper triangle: look each pair up in the source matrix.
    for row in range(N):
        src_row = idx[row]
        for col in range(row + 1, N):
            scores[row, col] = dm[src_row, idx[col]]
    # Mirror the upper triangle into the lower one.
    for row in range(1, N):
        for col in range(row):
            scores[row, col] = scores[col, row]
    return scores

In [148]:
# Precompile: call get_scores once on a tiny 5-row bootstrap sample so that the
# @nb.njit-decorated generate_distance is compiled before the full-size runs.
# boot.index carries the index labels of the sampled data1 rows; they are used
# directly as row/column positions into dm.
boot = resample(data1, replace=True, n_samples=5, random_state=1)
%time scores = get_scores(boot.index.to_numpy(),dm,5)
scores

Wall time: 0 ns


array([[0.        , 2.50001447, 3.1198955 , 3.55266256, 3.18729462],
       [2.50001447, 0.        , 2.12192583, 3.05702773, 2.68987114],
       [3.1198955 , 2.12192583, 0.        , 2.65774094, 2.00754673],
       [3.55266256, 3.05702773, 2.65774094, 0.        , 2.59655895],
       [3.18729462, 2.68987114, 2.00754673, 2.59655895, 0.        ]])

In [153]:

# Silhouette coefficient for k = 2 and 3
kValues = [2, 3]
# gamma is passed to KPrototypes; nBootstrap is the number of bootstrap replicates
gamma = 0.5
nBootstrap = 2
# Silhouette matrix initialised with zeros: one row per entry of kValues and
# nBootstrap + 1 columns.
silueta = [[0] * (nBootstrap + 1) for _ in kValues]
# Column positions of the object-dtype columns of data1
catColumnsPos = [data1.columns.get_loc(name) for name in data1.select_dtypes('object').columns]
# Number of rows in the dataset
N = len(data1)


In [154]:
from sklearn.metrics import silhouette_score
import time
start_time = time.time()
for k in kValues:
  print('Dataset original, k = ', k)
  kprototype = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 0)
  kprototype.fit_predict(data1, categorical = catColumnsPos)
  print("--- %s seconds ---" % (time.time() - start_time))
  silueta[k-min(kValues)][0] = silhouette_score(dm, kprototype.labels_, metric='precomputed')
  print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
  print(silueta[k-min(kValues)][0])

for b in range(1,nBootstrap+1):
  print('bootstrap # ',b);
  boot = resample(data1, replace=True, n_samples=N, random_state=b)
  %time scores = get_scores(boot.index.to_numpy(),dm,N)
  for k in kValues:
    print('Bootstrap, k = ', k)
    kprototypebs = KPrototypes(n_jobs = -1, n_clusters = k, init = 'Huang', gamma = gamma, random_state = 1)
    kprototypebs.fit_predict(boot, categorical = catColumnsPos)
    print("--- %s seconds ---" % (time.time() - start_time));
    silueta[k-min(kValues)][b] = silhouette_score(scores, kprototypebs.labels_, metric='precomputed')
    print("Coeficiente silueta --- %s seconds ---" % (time.time() - start_time))
    print(silueta[k-min(kValues)][b])

Dataset original, k =  2
--- 22.26518201828003 seconds ---
Coeficiente silueta --- 26.087936401367188 seconds ---
0.7368900959494543
Dataset original, k =  3
--- 50.059807538986206 seconds ---
Coeficiente silueta --- 53.779966831207275 seconds ---
0.7187617117905637
bootstrap #  1
Wall time: 10.8 s
Bootstrap, k =  2
--- 78.05259346961975 seconds ---
Coeficiente silueta --- 79.52582693099976 seconds ---
0.785403719753729
Bootstrap, k =  3
--- 103.82108449935913 seconds ---
Coeficiente silueta --- 105.27886033058167 seconds ---
0.8004287100371626
bootstrap #  2
Wall time: 11.2 s
Bootstrap, k =  2
--- 132.58505082130432 seconds ---
Coeficiente silueta --- 134.02788996696472 seconds ---
0.7482729047623066
Bootstrap, k =  3
--- 156.52967166900635 seconds ---
Coeficiente silueta --- 157.7429060935974 seconds ---
0.060572932085086593


In [155]:
boot

Unnamed: 0,AISLE,IG,HTS,PLANNING,ABC,UOM,CAT,COSTX,QTYVAR
7336,19,IG52,HTS119,1/0,C,PC,CAT16,0.068203,-0.031972
2575,2,IG31,HTS55,1/1,A,YD,CAT12,0.183982,-0.003018
6637,13,IG45,HTS24,1/1,B,BG,CAT17,0.042379,0.239467
13896,11,IG34,HTS15,2/0,C,BX,CAT8,-0.081282,-0.143814
11798,4,IG33,HTS36,1/1,B,YD,CAT12,-0.103650,-0.143814
...,...,...,...,...,...,...,...,...,...
10475,22,IG7,HTS95,1/0,C,BX,CAT8,-0.194974,-0.143814
6498,13,IG53,HTS15,1/1,B,PC,CAT16,0.085206,0.027962
2605,7,IG25,HTS48,1/1,A,BG,CAT7,-0.161566,-0.143814
13185,15,IG46,HTS119,1/1,A,BX,CAT17,0.202427,0.779192


In [158]:
# Spot check: distance between original rows 7336 and 9379, read straight
# from the full distance matrix.
dm[7336][9379]

3.071558599002888

In [160]:
# Spot check: entry (16238, 0) of the bootstrap distance matrix currently held
# in `scores`. Position 0 of the displayed `boot` is original row 7336;
# presumably position 16238 is original row 9379, in which case this value
# should equal dm[7336][9379] - TODO confirm the row mapping.
scores[16238][0]

3.071558599002888

In [106]:
# Median silhouette coefficient over the bootstrap replicates, for each k.
# Column 0 of silueta holds the original-dataset score, so the slice starts at 1.
# Rows are addressed by the position of k in kValues (enumerate): the previous
# i - min(kValues) arithmetic only worked for consecutive k values.
# np.median is called explicitly; the bare `median` name was only available
# through `from pylab import *`.
for row, k in enumerate(kValues):
  print('Mediana para k = ', k)
  print(np.median(silueta[row][1:nBootstrap+1]))

Mediana para k =  2
0.7718973828791865
Mediana para k =  3
0.08642952100349402
Mediana para k =  4
0.44017397581320494


In [None]:
#np.savetxt("/content/gdrive/MyDrive/TDGdata/siluetaboot.csv", silueta, delimiter=",")