<div style="width:100%; overflow:hidden; background-color:#F1F1E6; padding: 10px; border-style: outset; color:#17469e">
    <div style="width: 80%; float: left;">
    <h2 align="center">Universidad de Sonora</h2>
    <hr style="border-width: 3px; border-color:#17469e">
          <h1>Reconocimiento de patrones: Preparación de los datos</h1>          
          <h4>Ramón Soto C. <a href="mailto:rsotoc@moviquest.com/">(rsotoc@moviquest.com)</a></h4>
    </div>
    <div style="float: right;">
    <img src="images/escudo_unison.png">
    </div>
</div>

## Caso de estudio: [*Stack Overflow 2018 Developer Survey*](https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey)

Como caso de estudio principal en el presente curso hemos seleccionado la encuesta de desarrolladores 2018 de *Stack Overflow* disponible en [Kaggle](https://www.kaggle.com). En este esta etapa realizaremos el análisis de agrupamientos.

### 4. Modelado - ISODATA

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
... ahora utilizamos la técnica ISODATA para identificar prototipos de clases. <br>Inicializamos el contexto y cargamos los datos:
</div>

In [1]:
"""
Reconocimiento de patrones: ISODATA
"""

#from scipy.spatial.distance import squareform

# Inicializar el ambiente
import sys
import numpy as np
import pandas as pd
import json
import pickle
#import math
import random
#import time

from IPython.display import display, HTML
from collections import Counter
from operator import itemgetter
#from scipy.spatial.distance import euclidean, pdist, squareform

np.set_printoptions(precision=2, suppress=True) # Cortar la impresión de decimales a 1
pd.set_option('display.max_columns', 130)
pd.set_option('max_colwidth', 80)

LARGER_DISTANCE = sys.maxsize
TALK = True # TALK = True, imprime resultados parciales

In [2]:
path = "Data sets/Stack Overflow Survey/"

# Recuperar encabezados de columnas en orden original
with open(path + 'survey_results_public_transformed.headers', 'rb') as file:  
    headers = pickle.load(file)

# Recuperar diccionarios... sólo por si se requieren
with open(path + 'survey_results_public_transformed.dicts', 'rb') as file:  
    dict_of_dicts = pickle.load(file)

with open(path + 'survey_results_public_transformed.json') as f:
    dict_json = json.load(f)
df = pd.DataFrame.from_dict(dict_json)

df = df.sample(n=100).reset_index(drop=True)


# Reordenar las columnas de acuerdo al orden original
df = df.reindex(headers, axis=1)

DATA_LEN = df.shape[0]

# Agregar una columna "cluster" inicializada a null 
df["Cluster"] = np.nan

In [3]:
var_str = ['Hobby', 'OpenSource', 'Country', 'Student', 'Employment', 'FormalEducation', 
         'UndergradMajor', 'CompanySize', 'YearsCoding', 'YearsCodingProf', 'UpdateCV', 
         'JobSatisfaction', 'CareerSatisfaction', 'HopeFiveYears', 'JobSearchStatus', 
         'LastNewJob', 'TimeFullyProductive', 'AgreeDisagree1', 'AgreeDisagree2', 
         'AgreeDisagree3', 'OperatingSystem', 'NumberMonitors', 'CheckInCode', 'AdBlocker', 
         'AdBlockerDisable', 'AdsAgreeDisagree1', 'AdsAgreeDisagree2', 'AdsAgreeDisagree3', 
         'AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture', 'EthicsChoice', 
         'EthicsReport', 'EthicsResponsible', 'EthicalImplications', 'HoursComputer', 
         'StackOverflowRecommend', 'StackOverflowVisit', 'StackOverflowHasAccount', 
         'StackOverflowParticipate', 'StackOverflowJobs', 'StackOverflowDevStory', 
         'StackOverflowJobsRecommend', 'StackOverflowConsiderMember', 'HypotheticalTools1', 
         'HypotheticalTools2', 'HypotheticalTools3', 'HypotheticalTools4', 'WakeTime', 
         'HypotheticalTools5', 'HoursOutside', 'SkipMeals', 'Exercise', 'EducationParents', 
         'Age', 'Dependents', 'SurveyTooLong', 'SurveyEasy']
var_list = ['DevType', 'CommunicationTools', 'EducationTypes', 'SelfTaughtTypes', 
         'HackathonReasons', 'LanguageDesireNextYear', 'DatabaseWorkedWith', 
         'DatabaseDesireNextYear', 'PlatformWorkedWith', 'PlatformDesireNextYear', 
         'FrameworkWorkedWith', 'FrameworkDesireNextYear', 'IDE', 'Methodology', 
         'VersionControl', 'AdBlockerReasons', 'AdsActions', 'ErgonomicDevices', 
         'RaceEthnicity', 'LanguageWorkedWith']
var_ranks = ['AssessJob', 'AssessBenefits', 'JobContactPriorities', 'JobEmailPriorities', 
             'AdsPriorities']
var_float = 'ConvertedSalary'

def distance_qual(x, y):
    # Número de variables; si var_float es array, modificar "+ 1" por "+ len(var_float)"
    numvars = len(var_str) + len(var_list) + len(var_ranks) + 1
    
    distancia = abs(x.ConvertedSalary - y.ConvertedSalary)
    if pd.isnull(distancia):
        distancia = 0
        numvars -= 1
        
    for col in var_str:
        if x[col] != y[col]:
            distancia += 1
        
    for col in var_list:
        num_vars = len(x[col]) + len(y[col])
        d = 0
        if num_vars > 0:
            d = (2*len(set(x[col] + y[col])) - num_vars) / num_vars
        distancia += d

    for col in var_ranks:
        d = 0
        max_vars = max(len(x[col]), len(y[col]))
        if len(x[col]) != 0 and len(y[col]) != 0:
            for v in range(len(x[col])):
                if x[col][v] != y[col][v]:
                    d += 1
        else:
            d += max_vars
        
        if d != 0:
            d /= max_vars
        distancia += d

    return distancia / numvars
    
def decode(dataframe):
    new_df = dataframe.copy(deep=True)
    
    for col in var_str:
        if col in list(dataframe) and col in dict_of_dicts:
            for index, row in dataframe.iterrows():
                value = dict_of_dicts[col][row[col]]
                new_df.at[clusters.index[index], col] = value
                
    for index, row in dataframe.iterrows():
        new_df.at[clusters.index[index], 'ConvertedSalary'] = row['ConvertedSalary'] * 200000
    
    for col in var_list:
        if col in list(dataframe):
            for index, row in dataframe.iterrows():
                values_list = row[col].copy()
                for i in range(len(values_list)):
                    values_list[i] = dict_of_dicts[col][values_list[i]]
                new_df.at[clusters.index[index], col] = values_list
                
    return new_df

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
A continuación ejecutamos el algoritmo ISODATA:
</div>

1) Definir los valores de $k_{init}, n_{min}, I_{max}, \sigma_{max}, L_{min}$ y $P_{max}$:

In [4]:
K_INIT = 5
N_MIN = 15
I_MAX = 10
S_MAX = .95 # La desviación estándar está normalizada
L_MIN = .75 # Las distancis están normalizadas
P_MAX = 2

NUM_CLUSTERS = K_INIT # valor de k
iteration = 0

2) Seleccionar de manera arbitraria *k* puntos en el espacio de características como centros iniciales de los clusters (centroides o centros de masa).

In [5]:
# Inicializar los centroides
centroids = df.sample(n=NUM_CLUSTERS).reset_index(drop=True)

3) Asignar cada punto del conjunto de datos al cluster donde la distancia del punto al centroide es menor.

In [6]:
elim = False
members = []

def update_clusters():
    global NUM_CLUSTERS, elim, members, centroids
    changed = False
    cluster_col_index = df.shape[1] - 1
    
    if TALK :
        print("Actualizando clusters")
    for index, row in df.iterrows():
        minDistance = LARGER_DISTANCE
        currentCluster = 0
        
        # Buscar la menor distancia del punto a un centroide
        for i, r in centroids.iterrows():
            dist = distance_qual(row, r)
            if(dist < minDistance):
                minDistance = dist
                currentCluster = i
        
        # Si hay cambio, realizarlo y levantar la bandera 'changed'
        if(pd.isnull(row['Cluster']) or row['Cluster'] != currentCluster):
            df.iloc[index, cluster_col_index] = currentCluster
            changed = True  
            
    # Contabilizar los elementos en cada cluster   
    members = [0] * NUM_CLUSTERS
    for i in range(NUM_CLUSTERS):
        members[i] = df[df["Cluster"]==i].count()["Cluster"]
        if (TALK) : 
            print("El cluster ", i, " incluye ", members[i], "miembros.")
    if (TALK) : 
        print()

    to_eliminate = []
    for j in range(NUM_CLUSTERS):
        if members[j] < N_MIN:
            to_eliminate.append(j)
    if len(to_eliminate) > 0:
        elim = True
        if (TALK) : 
            print("Clusters a eliminar:", to_eliminate)
        # Eliminar los centroides seleccionados
        centroids.drop(to_eliminate, inplace=True)    
        centroids = centroids.reset_index(drop=True)
        NUM_CLUSTERS = centroids.shape[0]
        changed = True
    else :
        elim = False
        
    if changed:
        for index, row in df.iterrows():
            minDistance = LARGER_DISTANCE
            currentCluster = 0

            for j, rc in centroids.iterrows():
                dist = distance_qual(row, rc)
                #print(j, dist, currentCluster, minDistance)
                if(dist < minDistance):
                    minDistance = dist
                    currentCluster = j
            
            if(pd.isnull(row['Cluster']) or row['Cluster'] != currentCluster):
                df.iloc[index, cluster_col_index] = currentCluster
                
        # Contabilizar los elementos en cada cluster   
        members = [0] * NUM_CLUSTERS
        for i in range(NUM_CLUSTERS):
            members[i] = df[df["Cluster"]==i].count()["Cluster"]
            if (TALK) : 
                print("El cluster ", i, " incluye ", members[i], "miembros.")
        if (TALK) : 
            print()
        
    return changed

# --------------------------
# Actualizar los clusters
KEEP_WALKING = update_clusters()




Actualizando clusters
El cluster  0  incluye  4 miembros.
El cluster  1  incluye  20 miembros.
El cluster  2  incluye  10 miembros.
El cluster  3  incluye  35 miembros.
El cluster  4  incluye  31 miembros.

Clusters a eliminar: [0, 2]
El cluster  0  incluye  28 miembros.
El cluster  1  incluye  39 miembros.
El cluster  2  incluye  33 miembros.



4) Calcular los centroides a partir de los puntos en cada cluster. 

In [7]:
def update_centroids():    
    for cl_j in range(NUM_CLUSTERS):
        means = [0] * (df.shape[1] - 1)
        
        # Seleccionar registros en el cluster cl_j
        df_clusterj = df[df["Cluster"] == cl_j]

        # Media en los datos numéricos
        col = 'ConvertedSalary'
        centroids.at[centroids.index[cl_j], col] = df_clusterj[col].mean()
        
        # Moda en las columnas 'simples' (en var_str)
        mode = df_clusterj[var_str].mode()
        for col in mode:
            centroids.at[centroids.index[cl_j], col] = mode[col].values[0]

        # Moda en las columnas con listas de longitud variable (en var_list)
        for col in var_list:
            mean_len = 0
            vars_list = []
            for index, row in df_clusterj.iterrows():
                mean_len += len(row[col])
                vars_list = vars_list + row[col]
            mean_len /= df_clusterj.shape[0]
            counter = Counter(vars_list)
            mean_list = []
            for v in counter.most_common(round(mean_len + 0.5)):
                mean_list.append(v[0])
            centroids.at[centroids.index[cl_j], col] = mean_list

            
        # Moda en las columnas con listas de longitud fija (en var_ranks)
        ranges = [11, 12, 6, 8, 8]
        # Para cada variable en var_list, obtener el número de componentes en el vector
        # y el nombre de la columna
        for i, col in zip(range(len(ranges)), var_ranks):
            # Inicializar una matriz (lista de listas, en realidad), con tantos renglones como 
            # componentes tiene el vector de la variable. Cada renglón tiene todos los valores 
            # utilizados en cada posición del vector
            vars = []
            for j in range(ranges[i] - 1):
                vars.append([])

            # Recorrer todos los elementos actualmente en el cluster para rellenar la matriz
            for index, row in df_clusterj.iterrows():
                # Si el vector de la variable no está vacío...
                if len(row[col]) > 0:
                    # Para cada componente en el vector...
                    for j in range(len(row[col])):
                        # Si no es 0
                        if row[col][j] != '0':
                            # Agregarla al renglón actual en la matriz
                            vars[j].append(row[col][j])

            # Contabilizar ocurrencias de cada componente. Crear una matriz con el orden para
            # cada componente como renglones
            most_commons = []
            for j in range(ranges[i] - 1):
                counter=Counter(vars[j])
                most_commons.append(counter.most_common(ranges[i] - 1))

            # Inicializar vector. Se escoge el valor más popular en la primera componente
            vars_list = [most_commons[0][0][0]]
            # Para cada componente a partir de la segunda...
            for j in range(1, ranges[i] - 1):
                # Buscar la componente más común...
                for c in most_commons[j]:
                    # Siempre y cuando no esté utilizada...
                    if c[0] not in vars_list[:j]:
                        # Agregarla al vector y...
                        vars_list.append(c[0])
                        # Dejar de buscar.
                        break
            if len(vars_list) < ranges[i] - 1:
                vars_list = vars_list + list(set(range(1, ranges[i])) - set(vars_list))
            centroids.at[centroids.index[cl_j], col] = vars_list
    return

# --------------------------
# Actualizar los centroides
update_centroids()

In [8]:
display(centroids)

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,4,"[0, 12, 11, 4]",10,0,3,3,6,2,3,7,0.135474,"[8, 0, 5]",3,"[8, 1]","[5, 7, 0]",[4],0,1,4,"[18, 27, 4, 5]","[14, 17, 19]","[17, 14]","[14, 0]","[14, 0, 2]","[5, 6]",[5],"[10, 18, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 1]",1,1,0,[2],2,3,3,1,0,0,2,2,10,2,2,4,1,2,5,2,0,2,2,3,1,5,1,0,3,[0],3,6,[6],1,0,1,4,"[18, 31, 14, 5, 27, 1]","[10, 8, 3, 1, 2, 6, 4, 5, 9, 7]","[1, 6, 3, 10, 11, 2, 8, 5, 9, 7, 4]","[2, 1, 5, 4, 3]","[7, 4, 3, 6, 1, 2, 5]","[2, 4, 1, 5, 3, 7, 6]",
1,1,0,USA,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,6,2,5,7,0.020988,[4],3,[8],"[5, 7]",[0],0,0,0,"[27, 18, 31]",[14],[13],"[14, 22]","[14, 2]",[5],[5],"[10, 18]",3,2,[0],[1],2,2,0,[0],1,1,0,[2],2,3,3,1,0,0,0,2,10,1,2,1,0,2,5,1,2,2,4,3,3,5,1,0,3,[0],0,1,[6],1,0,1,2,"[14, 5, 18, 31]","[8, 6, 5, 1, 2, 9, 7, 4, 10, 3]","[1, 6, 3, 10, 5, 4, 8, 2, 11, 7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[3, 1, 2, 4, 5]","[6, 7, 5, 3, 2, 1, 4]","[2, 5, 3, 1, 6, 7, 4]",
2,1,1,USA,0,0,1,6,1,"[12, 0, 11, 15]",9,10,3,3,6,2,3,7,0.151119,"[4, 8, 5, 10]",0,"[8, 1, 7, 3]","[5, 7, 0, 8]","[0, 4]",0,2,1,"[18, 14, 5, 17, 34]","[14, 19, 13, 17]","[13, 6, 18]","[14, 2, 0]","[14, 2, 18]","[5, 1, 6]","[5, 6, 1]","[19, 15, 6, 10]",3,2,"[0, 9, 4, 8]","[1, 4]",2,2,3,"[2, 6]",1,1,0,"[3, 2]",2,3,0,1,1,3,2,2,10,2,2,4,2,0,5,2,3,2,0,1,3,5,2,0,3,[2],3,1,[6],1,0,1,0,"[18, 14, 5, 17, 31, 34, 3, 1]","[7, 5, 8, 2, 1, 6, 3, 4, 10, 9]","[1, 7, 2, 10, 6, 4, 8, 3, 11, 9, 5]","[2, 1, 5, 4, 3]","[5, 3, 1, 2, 4, 7, 6]","[1, 5, 2, 3, 7, 4, 6]",


In [9]:
deltas = []
delta = 0
def update_deltas():
    global deltas, delta, centroids
    deltas = [0] * NUM_CLUSTERS
    N = 0
    for j, rc in centroids.iterrows():
        n = 0
        for i, row in df[df["Cluster"]==j].iterrows():
            deltas[j] += distance_qual(row, rc)
            n += 1
        delta += deltas[j]
        deltas[j] /= n
        N += n
    delta /= N
    
    if TALK : 
        print("Las distancias medias en cada cluster son:\n", deltas)   
        print("\nLa distancia media promedio es:", delta)   
        
    return

update_deltas()

Las distancias medias en cada cluster son:
 [0.5861377678992618, 0.7597348615802445, 0.5283972861152516]

La distancia media promedio es: 0.6347862754461217


In [10]:
import math

def std_dev():
    # Inicializar vector de desviaciones estándar... los valores actuales son inserbibles
    std_vectors = centroids.copy()
    
    for c in range(NUM_CLUSTERS) :
        df_c = df[(df["Cluster"]==c)]
        
        # Para cada variable numérica...
        df_cj = df_c[pd.notnull(df_c['ConvertedSalary'])]

        s = math.sqrt(sum(abs(df_cj["ConvertedSalary"] - 
                              centroids.iloc[c]["ConvertedSalary"])) / (df_cj.shape[0] - 1))
        std_vectors.loc[c, "ConvertedSalary"] = s
        
        for col in var_str:
            diff = sum(df_cj[col] != centroids.iloc[c][col])
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_list:
            y = centroids.iloc[c][col]
            diff = 0
            for i, row in df_cj.iterrows():
                x = row[col]
                num_vars = len(x) + len(y)
                if num_vars > 0:
                    diff += (2*len(set(x + y)) - num_vars) / num_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_ranks:
            y = centroids.iloc[c][col]
            for i, row in df_cj.iterrows():
                diff = 0
                x = row[col]
                max_vars = max(len(x), len(y))
                if len(x) != 0 and len(y) != 0:
                    for v in range(len(x)):
                        if x[v] != y[v]:
                            diff += 1
                else:
                    diff += max_vars

                if diff != 0:
                    diff /= max_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
         
    return std_vectors

display(std_dev())

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.272166,0.544331,0.881917,0.471405,0.430331,0.793492,0.720082,0.816497,0.75922,0.902671,0.83887,0.793492,0.745356,0.860663,0.720082,0.793492,0.83887,0.377062,0.815362,0.745356,0.765665,0.766835,0.941499,0.83887,0.881917,0.860663,0.835904,0.842811,0.879614,0.859125,0.850079,0.904038,0.895806,0.843013,0.816497,0.666667,0.746066,0.666667,0.816497,0.430331,0.745356,0.844737,0.83887,0.793492,0.816497,0.846197,0.860663,0.793492,0.7698,0.57735,0.693889,0.693889,0.666667,0.57735,0.544331,0.7698,0.509175,0.693889,0.693889,0.7698,0.666667,0.83887,0.902671,0.816497,0.816497,0.860663,0.881917,0.902671,0.816497,0.816497,0.666667,0.93953,0.816497,0.881917,0.628539,0.7698,0.57735,0.7698,0.860663,0.754652,0.161015,0.183494,0.121716,0.102869,0.16265,
1,0.188982,0.681385,0.866025,0.731925,0.779194,0.755929,0.866025,0.681385,0.809967,0.801784,0.654654,0.534522,0.626783,0.944911,0.866025,0.866025,0.963624,0.195056,0.99403,0.963624,0.915475,0.932896,0.994628,0.886405,0.92582,0.963624,0.918826,0.935414,0.964506,0.908295,0.917702,0.972846,0.951815,0.928663,0.906327,0.823754,0.946799,0.823754,0.906327,0.823754,0.886405,0.886405,0.886405,0.886405,0.92582,0.935414,0.944911,0.944911,0.886405,0.866025,0.92582,0.906327,0.92582,0.845154,0.886405,0.92582,0.886405,0.981981,0.92582,0.944911,0.823754,0.92582,0.981981,0.92582,0.981981,0.944911,0.963624,0.906327,0.886405,0.886405,0.886405,0.972846,0.944911,0.886405,0.886405,0.92582,0.845154,0.906327,0.944911,0.861133,0.188982,0.136775,0.188982,0.188982,0.188982,
2,0.258199,0.632456,0.912871,0.57735,0.547723,0.707107,0.60553,0.912871,0.699598,0.894427,0.894427,0.83666,0.83666,0.83666,0.658281,0.816497,0.752773,0.396666,0.729764,0.795822,0.590469,0.601797,0.835236,0.632456,0.795822,0.752773,0.771511,0.766194,0.883865,0.881407,0.870152,0.72204,0.809272,0.748064,0.707107,0.632456,0.649862,0.578174,0.483046,0.408248,0.658281,0.844591,0.774597,0.774597,0.795822,0.678233,0.816497,0.707107,0.774597,0.447214,0.658281,0.658281,0.795822,0.447214,0.408248,0.730297,0.182574,0.774597,0.547723,0.875595,0.856349,0.60553,0.774597,0.856349,0.912871,0.875595,0.875595,0.816497,0.68313,0.68313,0.658281,0.859586,0.856349,0.816497,0.57735,0.68313,0.516398,0.632456,0.816497,0.605662,0.152753,0.1557,0.141421,0.169031,0.154303,


In [11]:
def divide_clusters():
    global NUM_CLUSTERS, centroids

    if TALK :
        display(centroids)
    
    # Cálculo de desviaciones estandar
    sigma_vect = std_dev()   
    if TALK :
        display(sigma_vect)
    
    candidates = []
    for c, s_row in sigma_vect.iterrows():
        for col in s_row:
            if col > S_MAX :
                candidates.append(c)
                break # Sucio... pero eficiente :-) ... ya encontramos un atributo con elevada sigma

    if TALK :
        print("Posibles clusters a dividir:", candidates)
    
    divided = False
    to_eliminate = []
    for c in candidates:
        cond = NUM_CLUSTERS < K_INIT/2 or (deltas[c] > delta and members[c] > 2 * N_MIN)
        if cond: 
            d = 0
            # Obtener dos puntos "suficientemente separados", no es el óptimo, 
            # pero son buenos candidatos a buen costo
            count = 0
            while d < deltas[c] and count < 5000:
                s1 = df[df["Cluster"]==c].sample(n=2)
                d = distance_qual(s1.iloc[0], s1.iloc[1])
                count += 1
            if count < 5000:
                to_eliminate.append(c)
                centroids = centroids.append(s1)
                NUM_CLUSTERS += 1
            
    if len(to_eliminate) > 0 :
        if TALK : 
            print("Clusters a eliminar:", to_eliminate)
            print("")
        centroids.drop(to_eliminate, inplace=True)
        centroids = centroids.reset_index(drop=True)
        update_clusters()
        update_centroids()
        if TALK : 
            display(centroids)
            print("")
            
    return 

divide_clusters()    


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,4,"[0, 12, 11, 4]",10,0,3,3,6,2,3,7,0.135474,"[8, 0, 5]",3,"[8, 1]","[5, 7, 0]",[4],0,1,4,"[18, 27, 4, 5]","[14, 17, 19]","[17, 14]","[14, 0]","[14, 0, 2]","[5, 6]",[5],"[10, 18, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 1]",1,1,0,[2],2,3,3,1,0,0,2,2,10,2,2,4,1,2,5,2,0,2,2,3,1,5,1,0,3,[0],3,6,[6],1,0,1,4,"[18, 31, 14, 5, 27, 1]","[10, 8, 3, 1, 2, 6, 4, 5, 9, 7]","[1, 6, 3, 10, 11, 2, 8, 5, 9, 7, 4]","[2, 1, 5, 4, 3]","[7, 4, 3, 6, 1, 2, 5]","[2, 4, 1, 5, 3, 7, 6]",
1,1,0,USA,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,6,2,5,7,0.020988,[4],3,[8],"[5, 7]",[0],0,0,0,"[27, 18, 31]",[14],[13],"[14, 22]","[14, 2]",[5],[5],"[10, 18]",3,2,[0],[1],2,2,0,[0],1,1,0,[2],2,3,3,1,0,0,0,2,10,1,2,1,0,2,5,1,2,2,4,3,3,5,1,0,3,[0],0,1,[6],1,0,1,2,"[14, 5, 18, 31]","[8, 6, 5, 1, 2, 9, 7, 4, 10, 3]","[1, 6, 3, 10, 5, 4, 8, 2, 11, 7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[3, 1, 2, 4, 5]","[6, 7, 5, 3, 2, 1, 4]","[2, 5, 3, 1, 6, 7, 4]",
2,1,1,USA,0,0,1,6,1,"[12, 0, 11, 15]",9,10,3,3,6,2,3,7,0.151119,"[4, 8, 5, 10]",0,"[8, 1, 7, 3]","[5, 7, 0, 8]","[0, 4]",0,2,1,"[18, 14, 5, 17, 34]","[14, 19, 13, 17]","[13, 6, 18]","[14, 2, 0]","[14, 2, 18]","[5, 1, 6]","[5, 6, 1]","[19, 15, 6, 10]",3,2,"[0, 9, 4, 8]","[1, 4]",2,2,3,"[2, 6]",1,1,0,"[3, 2]",2,3,0,1,1,3,2,2,10,2,2,4,2,0,5,2,3,2,0,1,3,5,2,0,3,[2],3,1,[6],1,0,1,0,"[18, 14, 5, 17, 31, 34, 3, 1]","[7, 5, 8, 2, 1, 6, 3, 4, 10, 9]","[1, 7, 2, 10, 6, 4, 8, 3, 11, 9, 5]","[2, 1, 5, 4, 3]","[5, 3, 1, 2, 4, 7, 6]","[1, 5, 2, 3, 7, 4, 6]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.272166,0.544331,0.881917,0.471405,0.430331,0.793492,0.720082,0.816497,0.75922,0.902671,0.83887,0.793492,0.745356,0.860663,0.720082,0.793492,0.83887,0.377062,0.815362,0.745356,0.765665,0.766835,0.941499,0.83887,0.881917,0.860663,0.835904,0.842811,0.879614,0.859125,0.850079,0.904038,0.895806,0.843013,0.816497,0.666667,0.746066,0.666667,0.816497,0.430331,0.745356,0.844737,0.83887,0.793492,0.816497,0.846197,0.860663,0.793492,0.7698,0.57735,0.693889,0.693889,0.666667,0.57735,0.544331,0.7698,0.509175,0.693889,0.693889,0.7698,0.666667,0.83887,0.902671,0.816497,0.816497,0.860663,0.881917,0.902671,0.816497,0.816497,0.666667,0.93953,0.816497,0.881917,0.628539,0.7698,0.57735,0.7698,0.860663,0.754652,0.161015,0.183494,0.121716,0.102869,0.16265,
1,0.188982,0.681385,0.866025,0.731925,0.779194,0.755929,0.866025,0.681385,0.809967,0.801784,0.654654,0.534522,0.626783,0.944911,0.866025,0.866025,0.963624,0.195056,0.99403,0.963624,0.915475,0.932896,0.994628,0.886405,0.92582,0.963624,0.918826,0.935414,0.964506,0.908295,0.917702,0.972846,0.951815,0.928663,0.906327,0.823754,0.946799,0.823754,0.906327,0.823754,0.886405,0.886405,0.886405,0.886405,0.92582,0.935414,0.944911,0.944911,0.886405,0.866025,0.92582,0.906327,0.92582,0.845154,0.886405,0.92582,0.886405,0.981981,0.92582,0.944911,0.823754,0.92582,0.981981,0.92582,0.981981,0.944911,0.963624,0.906327,0.886405,0.886405,0.886405,0.972846,0.944911,0.886405,0.886405,0.92582,0.845154,0.906327,0.944911,0.861133,0.188982,0.136775,0.188982,0.188982,0.188982,
2,0.258199,0.632456,0.912871,0.57735,0.547723,0.707107,0.60553,0.912871,0.699598,0.894427,0.894427,0.83666,0.83666,0.83666,0.658281,0.816497,0.752773,0.396666,0.729764,0.795822,0.590469,0.601797,0.835236,0.632456,0.795822,0.752773,0.771511,0.766194,0.883865,0.881407,0.870152,0.72204,0.809272,0.748064,0.707107,0.632456,0.649862,0.578174,0.483046,0.408248,0.658281,0.844591,0.774597,0.774597,0.795822,0.678233,0.816497,0.707107,0.774597,0.447214,0.658281,0.658281,0.795822,0.447214,0.408248,0.730297,0.182574,0.774597,0.547723,0.875595,0.856349,0.60553,0.774597,0.856349,0.912871,0.875595,0.875595,0.816497,0.68313,0.68313,0.658281,0.859586,0.856349,0.816497,0.57735,0.68313,0.516398,0.632456,0.816497,0.605662,0.152753,0.1557,0.141421,0.169031,0.154303,


Posibles clusters a dividir: [1]
Clusters a eliminar: [1]

Actualizando clusters
El cluster  0  incluye  30 miembros.
El cluster  1  incluye  38 miembros.
El cluster  2  incluye  2 miembros.
El cluster  3  incluye  30 miembros.

Clusters a eliminar: [2]
El cluster  0  incluye  31 miembros.
El cluster  1  incluye  39 miembros.
El cluster  2  incluye  30 miembros.



IndexError: list index out of range

In [None]:
def mix_clusters():
    global centroids, num_clusters
    dist_matrix = np.triu(squareform(pdist(centroids, 'euclidean')))
    
    flag = math.floor(dist_matrix.max() * 10)
    dist_matrix[dist_matrix == 0] = flag
    
    mixed = False
    to_eliminate = []
    # to_eliminate contendrá la mitad de los clusters unidos...
    while (dist_matrix.min() < flag and len(to_eliminate) < P_MAX/2) :
        dist_min = dist_matrix.min()
        idx = (dist_matrix==dist_min).argmax()
        z1 = idx // len(centroids)
        z2 = idx % len(centroids)        

        if dist_min < L_MIN:
            display(centroids)
            z = [sum(x)/2 for x in zip(centroids.iloc[z1], centroids.iloc[z2])]
            centroids.iloc[z1] = z
            to_eliminate.append(z2)
            num_clusters -= 1
            mixed = True
            if TALK:
                print("Unificando clusters {} y {}.\nSe creará nuevo centroide en {}\n"
                      .format(z1, z2, z))

        dist_matrix[z1][z2] = flag
            
    centroids.drop(to_eliminate, inplace=True)
    centroids = centroids.reset_index(drop=True)
    
    if mixed :
        update_clusters()
        update_centroids()

    return

#mix_clusters()