<div style="width:100%; overflow:hidden; background-color:#F1F1E6; padding: 10px; border-style: outset; color:#17469e">
    <div style="width: 80%; float: left;">
    <h2 align="center">Universidad de Sonora</h2>
    <hr style="border-width: 3px; border-color:#17469e">
          <h1>Reconocimiento de patrones: Preparación de los datos</h1>          
          <h4>Ramón Soto C. <a href="mailto:rsotoc@moviquest.com/">(rsotoc@moviquest.com)</a></h4>
    </div>
    <div style="float: right;">
    <img src="images/escudo_unison.png">
    </div>
</div>

## Caso de estudio: [*Stack Overflow 2018 Developer Survey*](https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey)

Como caso de estudio principal en el presente curso hemos seleccionado la encuesta de desarrolladores 2018 de *Stack Overflow* disponible en [Kaggle](https://www.kaggle.com). En este esta etapa realizaremos el análisis de agrupamientos.

### 4. Modelado - ISODATA

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
... ahora utilizamos la técnica ISODATA para identificar prototipos de clases. <br>Inicializamos el contexto y cargamos los datos:
</div>

In [1]:
"""
Reconocimiento de patrones: ISODATA
"""

#from scipy.spatial.distance import squareform

# Inicializar el ambiente
import sys
import numpy as np
import pandas as pd
import json
import pickle
#import math
import random
#import time

from IPython.display import display, HTML
from collections import Counter
from operator import itemgetter
#from scipy.spatial.distance import euclidean, pdist, squareform

np.set_printoptions(precision=2, suppress=True) # Cortar la impresión de decimales a 1
pd.set_option('display.max_columns', 130)
pd.set_option('max_colwidth', 80)

LARGER_DISTANCE = sys.maxsize
TALK = True # TALK = True, imprime resultados parciales

In [2]:
path = "Data sets/Stack Overflow Survey/"

# Recuperar encabezados de columnas en orden original
with open(path + 'survey_results_public_transformed.headers', 'rb') as file:  
    headers = pickle.load(file)

# Recuperar diccionarios... sólo por si se requieren
with open(path + 'survey_results_public_transformed.dicts', 'rb') as file:  
    dict_of_dicts = pickle.load(file)

with open(path + 'survey_results_public_transformed.json') as f:
    dict_json = json.load(f)
df = pd.DataFrame.from_dict(dict_json)
#df = df.sample(n=2000).reset_index(drop=True)

# Reordenar las columnas de acuerdo al orden original
df = df.reindex(headers, axis=1)

DATA_LEN = df.shape[0]

# Agregar una columna "cluster" inicializada a null 
df["Cluster"] = np.nan

In [3]:
var_str = ['Hobby', 'OpenSource', 'Country', 'Student', 'Employment', 'FormalEducation', 
         'UndergradMajor', 'CompanySize', 'YearsCoding', 'YearsCodingProf', 'UpdateCV', 
         'JobSatisfaction', 'CareerSatisfaction', 'HopeFiveYears', 'JobSearchStatus', 
         'LastNewJob', 'TimeFullyProductive', 'AgreeDisagree1', 'AgreeDisagree2', 
         'AgreeDisagree3', 'OperatingSystem', 'NumberMonitors', 'CheckInCode', 'AdBlocker', 
         'AdBlockerDisable', 'AdsAgreeDisagree1', 'AdsAgreeDisagree2', 'AdsAgreeDisagree3', 
         'AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture', 'EthicsChoice', 
         'EthicsReport', 'EthicsResponsible', 'EthicalImplications', 'HoursComputer', 
         'StackOverflowRecommend', 'StackOverflowVisit', 'StackOverflowHasAccount', 
         'StackOverflowParticipate', 'StackOverflowJobs', 'StackOverflowDevStory', 
         'StackOverflowJobsRecommend', 'StackOverflowConsiderMember', 'HypotheticalTools1', 
         'HypotheticalTools2', 'HypotheticalTools3', 'HypotheticalTools4', 'WakeTime', 
         'HypotheticalTools5', 'HoursOutside', 'SkipMeals', 'Exercise', 'EducationParents', 
         'Age', 'Dependents', 'SurveyTooLong', 'SurveyEasy']
var_list = ['DevType', 'CommunicationTools', 'EducationTypes', 'SelfTaughtTypes', 
         'HackathonReasons', 'LanguageDesireNextYear', 'DatabaseWorkedWith', 
         'DatabaseDesireNextYear', 'PlatformWorkedWith', 'PlatformDesireNextYear', 
         'FrameworkWorkedWith', 'FrameworkDesireNextYear', 'IDE', 'Methodology', 
         'VersionControl', 'AdBlockerReasons', 'AdsActions', 'ErgonomicDevices', 
         'RaceEthnicity', 'LanguageWorkedWith']
var_ranks = ['AssessJob', 'AssessBenefits', 'JobContactPriorities', 'JobEmailPriorities', 
             'AdsPriorities']
var_float = 'ConvertedSalary'

def distance_qual(x, y):
    # Número de variables; si var_float es array, modificar "+ 1" por "+ len(var_float)"
    numvars = len(var_str) + len(var_list) + len(var_ranks) + 1
    
    distancia = abs(x.ConvertedSalary - y.ConvertedSalary)
    if pd.isnull(distancia):
        distancia = 0
        numvars -= 1
        
    for col in var_str:
        if x[col] != y[col]:
            distancia += 1
        
    for col in var_list:
        num_vars = len(x[col]) + len(y[col])
        d = 0
        if num_vars > 0:
            d = (2*len(set(x[col] + y[col])) - num_vars) / num_vars
        distancia += d

    for col in var_ranks:
        d = 0
        max_vars = max(len(x[col]), len(y[col]))
        if len(x[col]) != 0 and len(y[col]) != 0:
            for v in range(len(x[col])):
                if x[col][v] != y[col][v]:
                    d += 1
        else:
            d += max_vars
        
        if d != 0:
            d /= max_vars
        distancia += d

    return distancia / numvars
    
def decode(dataframe):
    new_df = dataframe.copy(deep=True)
    
    for col in var_str:
        if col in list(dataframe) and col in dict_of_dicts:
            for index, row in dataframe.iterrows():
                value = dict_of_dicts[col][row[col]]
                new_df.at[clusters.index[index], col] = value
                
    for index, row in dataframe.iterrows():
        new_df.at[clusters.index[index], 'ConvertedSalary'] = row['ConvertedSalary'] * 200000
    
    for col in var_list:
        if col in list(dataframe):
            for index, row in dataframe.iterrows():
                values_list = row[col].copy()
                for i in range(len(values_list)):
                    values_list[i] = dict_of_dicts[col][values_list[i]]
                new_df.at[clusters.index[index], col] = values_list
                
    return new_df

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
A continuación ejecutamos el algoritmo ISODATA:
</div>

1) Definir los valores de $k_{init}, n_{min}, I_{max}, \sigma_{max}, L_{min}$ y $P_{max}$:

In [4]:
K_INIT = 7
N_MIN = 1000
I_MAX = 10
S_MAX = 0.75 # La desviación estándar está normalizada
DC_MAX = 3 # El cluster sólo se divide cuando hay al menos estas variables con s>S_MAX
L_MIN = 0.5 # Las distancia están normalizadas
D_MAX = 0.5 # Distancia media máxima al centroide
P_MAX = 2

NUM_CLUSTERS = K_INIT # valor de k
iteration = 0

2) Seleccionar de manera arbitraria *k* puntos en el espacio de características como centros iniciales de los clusters (centroides o centros de masa).

In [5]:
# Inicializar los centroides
centroids = df.sample(n=NUM_CLUSTERS).reset_index(drop=True)
display(centroids)

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,THA,0,2,7,11.0,8,[16],7,11,5,5,,1,5.0,,0.0,[],,[],[],[],,,,[],[],[],[],[],[],[],[],,,[],[],,,,[],,,,[],,,,,,,,,,,,,,,,,,,,,,,,,,[],,,[],,,,,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",[],
1,1,0,ISR,0,5,1,,8,"[0, 11, 12, 19, 5]",0,0,5,0,5.0,2,4.0,,0.0,[],,[8],"[5, 8]",[],2.0,3.0,1.0,"[14, 18, 25, 31, 4, 5]",[14],[14],[],[],[],[],[15],3.0,2.0,[],[6],0.0,2.0,3.0,"[2, 5, 6]",,,,[3],,,3.0,2.0,1.0,0.0,,2.0,10.0,1.0,2.0,3.0,2.0,2.0,0.0,1.0,,,,,,9.0,1.0,2.0,2.0,[],1.0,3.0,[6],0.0,0.0,1.0,1.0,"[14, 18, 25, 31, 5]","[9, 3, 7, 6, 1, 8, 5, 2, 10, 4]","[4, 7, 2, 6, 11, 3, 10, 8, 5, 1, 9]","[2, 4, 1, 3, 5]","[6, 3, 7, 4, 2, 1, 5]",[],
2,1,1,GBR,0,2,1,4.0,8,"[0, 1, 11, 12]",1,10,1,3,0.0,1,1.0,7.0,0.0,[],,[],[],[],,,,[],[],[],[],[],[],[],[],,,[],[],,,,[],,,,[],,,,,,,,,10.0,5.0,2.0,0.0,2.0,0.0,5.0,2.0,4.0,0.0,3.0,4.0,4.0,5.0,1.0,0.0,3.0,[0],0.0,,[],,,,,[],"[8, 9, 3, 6, 7, 2, 10, 1, 4, 5]","[1, 10, 6, 2, 8, 4, 9, 3, 7, 11, 5]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",[],
3,0,0,TUR,0,0,1,6.0,6,[12],7,7,3,7,4.0,2,3.0,,,[],,[],[],[],,,,[],[],[],[],[],[],[],[],,,[],[],,,,[],,,,[],,,,,,,,,,,,,,,,,,,,,,,,,,[],,,[],,,,,[],[],[],[],[],[],
4,1,0,BEL,0,0,1,6.0,2,[0],2,11,5,5,,1,,,0.0,[],,[],[],[],,,,[],[],[],[],[],[],[],[],,,[],[],,,,[],,,,[],,,,,,,,,10.0,2.0,1.0,,2.0,,6.0,1.0,,,,,,,,,,[],,,[],,,,,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",[],
5,0,1,USA,0,0,3,6.0,7,"[1, 10, 15]",10,10,3,0,0.0,2,4.0,6.0,,"[7, 8]",5.0,"[3, 4, 7, 8]","[5, 7, 8]","[0, 2, 4, 5, 6]",0.0,2.0,1.0,"[14, 17, 18, 2, 28, 32, 33]",[20],[20],"[0, 1, 12, 18, 2, 20, 4, 5, 8]","[0, 11, 12, 18, 2, 20, 25, 4, 8]",[5],"[1, 5, 6, 7, 8, 9]",[],,,[],[],,,,[],,,,[],,,,,,,,,8.0,5.0,2.0,0.0,0.0,2.0,5.0,2.0,3.0,3.0,3.0,3.0,0.0,6.0,1.0,1.0,0.0,[2],2.0,,[],,,,,"[13, 14, 17, 18, 23, 27, 28, 5]","[2, 5, 7, 6, 3, 8, 9, 1, 10, 4]","[1, 2, 3, 10, 7, 4, 5, 9, 11, 8, 6]","[3, 1, 2, 4, 5]","[4, 2, 1, 3, 7, 6, 5]",[],
6,1,0,POL,0,0,0,6.0,3,[12],9,7,6,7,2.0,2,4.0,3.0,0.0,"[0, 4, 8]",5.0,"[5, 8]",[0],[],0.0,1.0,0.0,"[18, 29]","[14, 17]","[14, 17]",[15],[15],[],[6],[15],2.0,2.0,"[0, 4, 9]",[1],2.0,2.0,3.0,[4],1.0,1.0,0.0,"[0, 3]",0.0,2.0,1.0,0.0,1.0,3.0,0.0,2.0,9.0,1.0,2.0,4.0,1.0,2.0,5.0,0.0,3.0,0.0,0.0,3.0,1.0,7.0,1.0,0.0,3.0,[],1.0,,[6],1.0,0.0,0.0,4.0,[29],"[7, 1, 9, 2, 5, 6, 8, 3, 10, 4]","[1, 6, 2, 10, 3, 8, 11, 4, 9, 7, 5]","[4, 1, 5, 2, 3]","[6, 1, 7, 3, 2, 5, 4]","[1, 4, 3, 6, 5, 7, 2]",


3) Asignar cada punto del conjunto de datos al cluster donde la distancia del punto al centroide es menor.

In [6]:
def update_clusters():
    global NUM_CLUSTERS, centroids
    changed = False
    cluster_col_index = df.shape[1] - 1
    
    if TALK :
        print("Actualizando clusters")
    for index, row in df.iterrows():
        dists = []
        for i, r in centroids.iterrows():
            dists.append(distance_qual(row, r))
        cluster = np.argmin(dists)
        
        # Si hay cambio, realizarlo y levantar la bandera 'changed'
        if(pd.isnull(row['Cluster']) or row['Cluster'] != cluster):
            df.iloc[index, cluster_col_index] = cluster
            changed = True
            
    # Contabilizar los elementos en cada cluster   
    to_eliminate = []
    for i in range(NUM_CLUSTERS):
        members = df[df["Cluster"]==i].count()["Cluster"]
        if members < N_MIN:
            to_eliminate.append(i)
        if (TALK) : 
            print("El cluster ", i, " incluye ", members, "miembros.")
    if (TALK) : 
        print()

    if len(to_eliminate) > 0:
        if (TALK) : 
            print("Clusters a eliminar:", to_eliminate)
        
        # Eliminar los centroides seleccionados
        centroids.drop(to_eliminate, inplace=True)    
        centroids = centroids.reset_index(drop=True)
        
        # Reetiquetar los registros afectados
        eliminated = 0
        for i in to_eliminate:
            i_e = i - eliminated
            # Reetiquetar como Null los registros en cada cluster eliminado
            df.loc[df.Cluster == i_e, 'Cluster'] = np.nan
            # Recorrer las etiquetas para coincidir con los nuevos índices
            for cj in range(i_e + 1, NUM_CLUSTERS):
                df.loc[df.Cluster == cj, 'Cluster'] = cj - 1
            # Actualizar el número actual de centroides
            NUM_CLUSTERS -= 1
            eliminated += 1
            
#        if (TALK) : 
#            for i in range(NUM_CLUSTERS):
#                members = df[df["Cluster"]==i].count()["Cluster"]
#                print("El cluster ", i, " incluye ", members, "miembros.")

        changed = True
        
    if changed:
        if TALK : 
            faltantes = df[pd.isnull(df["Cluster"])].shape[0]
            if faltantes > 0:
                print("Faltan por clasificar", faltantes, "miembros.\n")
            else :
                print()
                
        # Reclasificar los registros afectados
        if centroids.shape[0] > 1:
            for index, row in df[pd.isnull(df["Cluster"])].iterrows():
                dists = []
                for i, r in centroids.iterrows():
                    dists.append(distance_qual(row, r))
                df.iloc[index, cluster_col_index] = np.argmin(dists)
                
        # Contabilizar los elementos en cada cluster   
        if TALK : 
            for i in range(NUM_CLUSTERS):
                members = df[df["Cluster"]==i].count()["Cluster"]
                print("El cluster ", i, " incluye ", members, "miembros.")
            print()
        
    return changed

# --------------------------
# Actualizar los clusters
KEEP_WALKING = update_clusters()

Actualizando clusters
El cluster  0  incluye  13773 miembros.
El cluster  1  incluye  13370 miembros.
El cluster  2  incluye  4119 miembros.
El cluster  3  incluye  12218 miembros.
El cluster  4  incluye  4413 miembros.
El cluster  5  incluye  814 miembros.
El cluster  6  incluye  49736 miembros.

Clusters a eliminar: [5]
Faltan por clasificar 814 miembros.

El cluster  0  incluye  13773 miembros.
El cluster  1  incluye  13482 miembros.
El cluster  2  incluye  4408 miembros.
El cluster  3  incluye  12268 miembros.
El cluster  4  incluye  4432 miembros.
El cluster  5  incluye  50080 miembros.



4) Calcular los centroides a partir de los puntos en cada cluster. 

In [7]:
def update_centroids():
    global centroids
    
    for cl_j in range(NUM_CLUSTERS):        
        # Seleccionar registros en el cluster cl_j
        df_clusterj = df[df["Cluster"] == cl_j]
        
        centroids.loc[centroids.index[cl_j]] = get_centroide(df_clusterj).loc[0]        
    return

def get_centroide(data):
    # Copiar estructura de la tabla
    df2 = pd.DataFrame(data=None, columns=data.columns)
    #df2.append(pd.Series([np.nan]), ignore_index = True)

    col = 'ConvertedSalary'
    df2.at[0, col] = data[col].mean()

    # Moda en las columnas 'simples' (en var_str)
    mode = data[var_str].mode()
    for col in mode:
        df2.at[0, col] = mode[col].values[0]

    # Moda en las columnas con listas de longitud variable (en var_list)
    for col in var_list:
        mean_len = 0
        vars_list = []
        for index, row in data.iterrows():
            mean_len += len(row[col])
            vars_list = vars_list + row[col]
        mean_len /= data.shape[0]
        counter = Counter(vars_list)
        mean_list = []
        for v in counter.most_common(round(mean_len + 0.5)):
            mean_list.append(v[0])
        df2.at[0, col] = mean_list


    # Moda en las columnas con listas de longitud fija (en var_ranks)
    ranges = [11, 12, 6, 8, 8]
    # Para cada variable en var_list, obtener el número de componentes en el vector
    # y el nombre de la columna
    for i, col in zip(range(len(ranges)), var_ranks):
        # Inicializar una matriz (lista de listas, en realidad), con tantos renglones como 
        # componentes tiene el vector de la variable. Cada renglón tiene todos los valores 
        # utilizados en cada posición del vector
        vars = []
        for j in range(ranges[i] - 1):
            vars.append([])

        # Recorrer todos los elementos actualmente en el cluster para rellenar la matriz
        for index, row in data.iterrows():
            # Si el vector de la variable no está vacío...
            if len(row[col]) > 0:
                # Para cada componente en el vector...
                for j in range(len(row[col])):
                    # Si no es 0
                    if row[col][j] != '0':
                        # Agregarla al renglón actual en la matriz
                        vars[j].append(row[col][j])

        
        # Contabilizar ocurrencias de cada componente. Crear una matriz con el orden para
        # cada componente como renglones
        most_commons = []
        for j in range(ranges[i] - 1):
            counter = Counter(vars[j])
            #most_commons.append(counter.most_common(ranges[i] - 1))
            most_commons.append(counter.most_common())

        # Inicializar vector. Se escoge el valor más popular en la primera componente
        if len(most_commons) > 0 and len(most_commons[0]) > 0:
            vars_list = [most_commons[0][0][0]]
            # Para cada componente a partir de la segunda...
            for j in range(1, ranges[i] - 1):
                # Buscar la componente más común...
                for c in most_commons[j]:
                    # Siempre y cuando no esté utilizada...
                    if c[0] not in vars_list[:j]:
                        # Agregarla al vector y...
                        vars_list.append(c[0])
                        # Dejar de buscar.
                        break

        if len(vars_list) < ranges[i] - 1:
            for i in set(range(1, ranges[i])):
                if str(i) not in vars_list:
                    vars_list.append(str(i))
        df2.at[0, col] = vars_list

    return df2

# --------------------------
# Actualizar los centroides
update_centroids()

In [8]:
display(centroids)

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 16, 12]",7,11,5,5,0,1,5,7,0.002724,[8],3,[8],[5],[0],0,2,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,1,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,1,2,4,1,2,5,2,3,4,4,2,3,5,1,0,3,[2],3,1,[6],6,0,1,0,"[14, 18, 5]","[10, 9, 8, 3, 5, 4, 1, 2, 7, 6]","[1, 9, 2, 10, 11, 3, 7, 4, 8, 6, 5]","[2, 1, 3, 4, 5]","[1, 2, 7, 3, 6, 5, 4]","[1, 5, 2, 4, 6, 7, 3]",
1,1,0,USA,0,0,1,6,8,"[0, 19, 11, 12]",7,0,5,3,6,2,3,7,0.068357,"[5, 8]",3,"[8, 7, 1]","[5, 7, 3]",[0],0,2,1,"[18, 27, 14, 5, 31, 17]","[14, 19]","[14, 13]","[14, 22, 2]","[14, 2, 22]","[5, 0]","[5, 1]","[10, 18, 19]",3,1,"[0, 9]","[1, 6]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,4,4,5,2,2,3,[0],3,1,[6],0,0,1,2,"[14, 5, 18, 31, 17, 27, 25]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 11, 2, 10, 9, 3, 7, 4, 8, 6, 5]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,IND,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.055957,"[8, 5]",3,"[8, 1]","[5, 7]",[0],0,0,1,"[18, 27, 14, 5]","[14, 19]","[14, 13]","[14, 22]","[14, 2]",[5],"[5, 1]","[10, 18]",3,2,"[0, 9]",[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,3,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17]","[9, 8, 7, 2, 1, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 7, 8, 6, 5]","[2, 1, 5, 4, 3]","[1, 6, 7, 2, 3, 5, 4]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,0,4,4,6,2,3,7,0.027041,[8],3,[8],[5],[4],0,0,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,1,2,5,2,3,3,4,3,3,6,2,0,3,[2],3,1,[6],1,0,1,0,"[18, 14]","[9, 7, 8, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 7, 9, 6, 5, 11, 8, 4]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 3, 7, 6, 5]",
4,1,0,IND,0,0,1,6,4,"[0, 12, 11]",7,11,5,5,6,1,3,7,0.011018,[8],3,[8],[5],[0],0,2,2,"[18, 27]",[14],[14],[14],[14],[5],[5],"[10, 18]",3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,3,3,6,2,0,3,[2],3,1,[6],1,0,1,2,"[14, 18, 5]","[9, 7, 6, 2, 1, 3, 10, 5, 8, 4]","[1, 2, 3, 11, 9, 7, 6, 5, 10, 8, 4]","[2, 1, 5, 4, 3]","[1, 3, 2, 4, 6, 5, 7]","[1, 4, 2, 3, 6, 7, 5]",
5,1,0,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,6,2,3,7,0.160822,"[8, 4, 5]",3,"[8, 7, 1, 5]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 31, 1]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6, 1]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",


In [9]:
deltas = []
delta = 0
def update_deltas():
    global deltas, delta, centroids
    deltas = [0] * NUM_CLUSTERS
    N = 0
    for j, rc in centroids.iterrows():
        n = 0
        for i, row in df[df["Cluster"]==j].iterrows():
            deltas[j] += distance_qual(row, rc)
            n += 1
        delta += deltas[j]
        deltas[j] /= n
        N += n
    delta /= N
    
    if TALK : 
        print("Las distancias medias en cada cluster son:\n", deltas)   
        print("\nLa distancia media promedio es:", delta)   
        
    return

update_deltas()

Las distancias medias en cada cluster son:
 [0.8473057786262578, 0.600652298773132, 0.6926852977744938, 0.8489816412146622, 0.7667539218575187, 0.5532136731389661]

La distancia media promedio es: 0.6535740934404904


In [10]:
import math

def std_dev():
    # Inicializar vector de desviaciones estándar... los valores actuales son inserbibles
    std_vectors = centroids.copy()
    
    for c in range(NUM_CLUSTERS) :
        df_c = df[(df["Cluster"]==c)]
        
        # Para cada variable numérica...
        df_cj = df_c[pd.notnull(df_c['ConvertedSalary'])]

        s = math.sqrt(sum(abs(df_cj["ConvertedSalary"] - 
                              centroids.iloc[c]["ConvertedSalary"])) / (df_cj.shape[0] - 1))
        std_vectors.loc[c, "ConvertedSalary"] = s
        
        for col in var_str:
            diff = sum(df_cj[col] != centroids.iloc[c][col])
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_list:
            y = centroids.iloc[c][col]
            diff = 0
            for i, row in df_cj.iterrows():
                x = row[col]
                num_vars = len(x) + len(y)
                if num_vars > 0:
                    diff += (2*len(set(x + y)) - num_vars) / num_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_ranks:
            y = centroids.iloc[c][col]
            for i, row in df_cj.iterrows():
                diff = 0
                x = row[col]
                max_vars = max(len(x), len(y))
                if len(x) != 0 and len(y) != 0:
                    for v in range(len(x)):
                        if x[v] != y[v]:
                            diff += 1
                else:
                    diff += max_vars

                if diff != 0:
                    diff /= max_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
         
    return std_vectors

display(std_dev())

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.158338,0.6635,0.895361,0.675039,0.70874,0.737073,0.83096,0.640148,0.809179,0.704252,0.484809,0.347137,0.411918,0.977397,0.286231,0.81934,0.985494,0.073133,0.996928,0.996225,0.985004,0.994261,0.997017,0.988593,0.991721,0.992398,0.950582,0.940856,0.969669,0.966647,0.974253,0.969344,0.971256,0.975908,0.910607,0.929423,0.948649,0.896338,0.929904,0.89819,0.932468,0.962163,0.951989,0.955548,0.959328,0.965327,0.979989,0.978847,0.976939,0.966187,0.974147,0.977893,0.984547,0.976137,0.97223,0.983372,0.953516,0.990328,0.981321,0.984395,0.966612,0.979609,0.992661,0.993525,0.993562,0.994388,0.993975,0.99375,0.99093,0.991909,0.985532,0.994662,0.988933,0.998245,0.9956,0.996637,0.993224,0.9942,0.995663,0.913588,0.00863804,0.00863804,0.00863804,0.00863804,0.00863804,
1,0.366397,0.627379,0.892337,0.649289,0.802287,0.761385,0.800236,0.638271,0.742962,0.821829,0.710471,0.736558,0.867978,0.847279,0.671856,0.817576,0.778251,0.306065,0.921768,0.919585,0.679308,0.698741,0.936648,0.768353,0.848696,0.815269,0.768007,0.7861,0.870022,0.813298,0.8423,0.885271,0.878383,0.815525,0.602418,0.729286,0.849282,0.710845,0.843961,0.509114,0.683388,0.843701,0.82795,0.84044,0.869454,0.724706,0.854672,0.821146,0.697032,0.579429,0.694959,0.654453,0.727526,0.515061,0.554406,0.842155,0.379823,0.863488,0.720496,0.800487,0.65384,0.689343,0.872858,0.88272,0.877894,0.868717,0.881902,0.863024,0.774793,0.795213,0.698411,0.898647,0.773862,0.861165,0.670649,0.737157,0.48822,0.659333,0.796119,0.669697,0.00895251,0.00895251,0.00895251,0.00895251,0.00586079,
2,0.290578,0.621135,0.881885,0.541277,0.618057,0.714976,0.709978,0.797695,0.754371,0.865879,0.83827,0.851215,0.8004,0.871003,0.756626,0.826199,0.857318,0.292573,0.914646,0.921927,0.8632,0.904897,0.965824,0.865438,0.921789,0.917225,0.888416,0.865541,0.931484,0.898554,0.92287,0.940306,0.925699,0.901284,0.808777,0.848672,0.847309,0.740186,0.805626,0.79065,0.886057,0.890066,0.8942,0.903818,0.903959,0.900379,0.927702,0.905083,0.892492,0.812542,0.8416,0.862789,0.870711,0.801829,0.523361,0.759645,0.255658,0.846572,0.706927,0.828811,0.650143,0.520192,0.859245,0.877838,0.845068,0.823115,0.832638,0.820174,0.713373,0.744256,0.63149,0.838067,0.803255,0.949648,0.927971,0.925918,0.925094,0.873191,0.936026,0.805742,0.0159475,0.0159475,0.0159475,0.0159475,0.0159475,
3,0.296742,0.665415,0.871302,0.582619,0.497354,0.682825,0.631577,0.886081,0.73729,0.811976,0.783776,0.944722,0.944649,0.824705,0.48247,0.782446,0.948976,0.215256,0.968639,0.953065,0.957659,0.984358,0.991643,0.963428,0.971468,0.978093,0.961853,0.948934,0.973747,0.968756,0.975677,0.971936,0.972258,0.97606,0.918634,0.925263,0.936637,0.89142,0.913251,0.898842,0.937417,0.960686,0.951899,0.957209,0.958152,0.954677,0.971396,0.964293,0.956193,0.929456,0.942956,0.953794,0.953284,0.92983,0.943104,0.965876,0.904157,0.976317,0.966667,0.970109,0.941409,0.936676,0.985943,0.986717,0.985872,0.98721,0.986788,0.979229,0.96148,0.976815,0.968533,0.98442,0.971325,0.99387,0.996928,0.989669,0.991351,0.977738,0.992612,0.94724,0.0117851,0.0117851,0.0117851,0.0117851,0.0117851,
4,0.145535,0.657762,0.889277,0.546722,0.405812,0.687309,0.648103,0.894879,0.768189,0.866643,0.490267,0.399905,0.435513,0.970041,0.322118,0.963148,0.977982,0.144389,0.985345,0.979684,0.949316,0.974645,0.985897,0.952089,0.971145,0.971635,0.94878,0.937203,0.968352,0.954062,0.964884,0.965623,0.967226,0.943475,0.886731,0.898462,0.923718,0.852524,0.899257,0.862239,0.925471,0.946448,0.934683,0.937352,0.945442,0.941271,0.963889,0.945819,0.947453,0.900711,0.917076,0.934173,0.935701,0.903481,0.589048,0.793387,0.484161,0.868564,0.770098,0.836461,0.662449,0.747994,0.942416,0.940141,0.949711,0.952089,0.953338,0.932388,0.869111,0.910435,0.861963,0.951714,0.898197,0.983442,0.965287,0.974936,0.967339,0.961045,0.978955,0.906656,0.0154267,0.0154267,0.0154267,0.0154267,0.0154267,
5,0.388457,0.677761,0.855699,0.407768,0.380009,0.717212,0.639617,0.888825,0.730425,0.878615,0.839448,0.793868,0.784546,0.811126,0.623179,0.816675,0.780562,0.38479,0.757472,0.783566,0.641228,0.637162,0.872724,0.649113,0.847827,0.807392,0.78135,0.781188,0.857559,0.816254,0.840156,0.841414,0.815456,0.81879,0.744547,0.668896,0.651489,0.604646,0.572155,0.499754,0.677383,0.851578,0.751769,0.773029,0.832935,0.711689,0.853213,0.779439,0.752649,0.503161,0.623385,0.750233,0.66733,0.423995,0.548603,0.812034,0.331667,0.78407,0.673732,0.83524,0.679036,0.667426,0.825699,0.831614,0.860986,0.850894,0.857469,0.83101,0.660123,0.770428,0.5718,0.892846,0.79792,0.850279,0.568512,0.672097,0.582017,0.677856,0.793545,0.65391,0.00462122,0.00462122,0.00462122,0.00462122,0.00462122,


In [11]:
def divide_clusters():
    global NUM_CLUSTERS, centroids

    if TALK :
        display(centroids)
    
    # Cálculo de desviaciones estandar
    sigma_vect = std_dev()   
    if TALK :
        display(sigma_vect)
    
    candidates = []
    for c, s_row in sigma_vect.iterrows():
        causes = 0
        for col in s_row:
            if col > S_MAX :
                causes += 1
                if causes > DC_MAX :
                    candidates.append(c)
                    break # Ya encontramos un atributo con sigma elevada... o varios!

    if TALK :
        print("Posibles clusters a dividir:", candidates)
    
    divided = False
    to_eliminate = []
    for c in candidates:
        std = sigma_vect.iloc[c].mean()
        if std < S_MAX :
            members = df[df["Cluster"]==c].count()["Cluster"]
            cond = NUM_CLUSTERS < K_INIT/2 or (deltas[c] > delta and members > 2 * N_MIN)
            if cond: 
                #if (deltas[c] <= D_MAX or NUM_CLUSTERS < K_INIT / 2) :
                d = 0
                # Obtener dos puntos "suficientemente separados", no es el óptimo, 
                # pero son buenos candidatos a buen costo
                count = 0
                while (d < deltas[c] and count < 5000) : 
                    s1 = df[df["Cluster"]==c].sample(n=2)
                    d = distance_qual(s1.iloc[0], s1.iloc[1])
                    count += 1
                if count < 5000:
                    to_eliminate.append(c)
                    centroids = centroids.append(s1)
                    NUM_CLUSTERS += 1
                #else : 
                    # Si la distancia media en el cluster es mayor a D_MAX, se elimina 
                    # (no se divide)
                #    to_eliminate.append(c)
                #    NUM_CLUSTERS -= 1
                
            
    if len(to_eliminate) > 0 :
        if TALK : 
            print("Clusters a eliminar:", to_eliminate)
            print("")
        centroids.drop(to_eliminate, inplace=True)
        centroids = centroids.reset_index(drop=True)
        update_clusters()
        update_centroids()
        if TALK : 
            display(centroids)
            print("")
            
    return 

divide_clusters()    

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 16, 12]",7,11,5,5,0,1,5,7,0.002724,[8],3,[8],[5],[0],0,2,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,1,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,1,2,4,1,2,5,2,3,4,4,2,3,5,1,0,3,[2],3,1,[6],6,0,1,0,"[14, 18, 5]","[10, 9, 8, 3, 5, 4, 1, 2, 7, 6]","[1, 9, 2, 10, 11, 3, 7, 4, 8, 6, 5]","[2, 1, 3, 4, 5]","[1, 2, 7, 3, 6, 5, 4]","[1, 5, 2, 4, 6, 7, 3]",
1,1,0,USA,0,0,1,6,8,"[0, 19, 11, 12]",7,0,5,3,6,2,3,7,0.068357,"[5, 8]",3,"[8, 7, 1]","[5, 7, 3]",[0],0,2,1,"[18, 27, 14, 5, 31, 17]","[14, 19]","[14, 13]","[14, 22, 2]","[14, 2, 22]","[5, 0]","[5, 1]","[10, 18, 19]",3,1,"[0, 9]","[1, 6]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,4,4,5,2,2,3,[0],3,1,[6],0,0,1,2,"[14, 5, 18, 31, 17, 27, 25]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 11, 2, 10, 9, 3, 7, 4, 8, 6, 5]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,IND,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.055957,"[8, 5]",3,"[8, 1]","[5, 7]",[0],0,0,1,"[18, 27, 14, 5]","[14, 19]","[14, 13]","[14, 22]","[14, 2]",[5],"[5, 1]","[10, 18]",3,2,"[0, 9]",[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,3,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17]","[9, 8, 7, 2, 1, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 7, 8, 6, 5]","[2, 1, 5, 4, 3]","[1, 6, 7, 2, 3, 5, 4]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,0,4,4,6,2,3,7,0.027041,[8],3,[8],[5],[4],0,0,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,1,2,5,2,3,3,4,3,3,6,2,0,3,[2],3,1,[6],1,0,1,0,"[18, 14]","[9, 7, 8, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 7, 9, 6, 5, 11, 8, 4]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 3, 7, 6, 5]",
4,1,0,IND,0,0,1,6,4,"[0, 12, 11]",7,11,5,5,6,1,3,7,0.011018,[8],3,[8],[5],[0],0,2,2,"[18, 27]",[14],[14],[14],[14],[5],[5],"[10, 18]",3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,3,3,6,2,0,3,[2],3,1,[6],1,0,1,2,"[14, 18, 5]","[9, 7, 6, 2, 1, 3, 10, 5, 8, 4]","[1, 2, 3, 11, 9, 7, 6, 5, 10, 8, 4]","[2, 1, 5, 4, 3]","[1, 3, 2, 4, 6, 5, 7]","[1, 4, 2, 3, 6, 7, 5]",
5,1,0,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,6,2,3,7,0.160822,"[8, 4, 5]",3,"[8, 7, 1, 5]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 31, 1]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6, 1]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.158338,0.6635,0.895361,0.675039,0.70874,0.737073,0.83096,0.640148,0.809179,0.704252,0.484809,0.347137,0.411918,0.977397,0.286231,0.81934,0.985494,0.073133,0.996928,0.996225,0.985004,0.994261,0.997017,0.988593,0.991721,0.992398,0.950582,0.940856,0.969669,0.966647,0.974253,0.969344,0.971256,0.975908,0.910607,0.929423,0.948649,0.896338,0.929904,0.89819,0.932468,0.962163,0.951989,0.955548,0.959328,0.965327,0.979989,0.978847,0.976939,0.966187,0.974147,0.977893,0.984547,0.976137,0.97223,0.983372,0.953516,0.990328,0.981321,0.984395,0.966612,0.979609,0.992661,0.993525,0.993562,0.994388,0.993975,0.99375,0.99093,0.991909,0.985532,0.994662,0.988933,0.998245,0.9956,0.996637,0.993224,0.9942,0.995663,0.913588,0.00863804,0.00863804,0.00863804,0.00863804,0.00863804,
1,0.366397,0.627379,0.892337,0.649289,0.802287,0.761385,0.800236,0.638271,0.742962,0.821829,0.710471,0.736558,0.867978,0.847279,0.671856,0.817576,0.778251,0.306065,0.921768,0.919585,0.679308,0.698741,0.936648,0.768353,0.848696,0.815269,0.768007,0.7861,0.870022,0.813298,0.8423,0.885271,0.878383,0.815525,0.602418,0.729286,0.849282,0.710845,0.843961,0.509114,0.683388,0.843701,0.82795,0.84044,0.869454,0.724706,0.854672,0.821146,0.697032,0.579429,0.694959,0.654453,0.727526,0.515061,0.554406,0.842155,0.379823,0.863488,0.720496,0.800487,0.65384,0.689343,0.872858,0.88272,0.877894,0.868717,0.881902,0.863024,0.774793,0.795213,0.698411,0.898647,0.773862,0.861165,0.670649,0.737157,0.48822,0.659333,0.796119,0.669697,0.00895251,0.00895251,0.00895251,0.00895251,0.00586079,
2,0.290578,0.621135,0.881885,0.541277,0.618057,0.714976,0.709978,0.797695,0.754371,0.865879,0.83827,0.851215,0.8004,0.871003,0.756626,0.826199,0.857318,0.292573,0.914646,0.921927,0.8632,0.904897,0.965824,0.865438,0.921789,0.917225,0.888416,0.865541,0.931484,0.898554,0.92287,0.940306,0.925699,0.901284,0.808777,0.848672,0.847309,0.740186,0.805626,0.79065,0.886057,0.890066,0.8942,0.903818,0.903959,0.900379,0.927702,0.905083,0.892492,0.812542,0.8416,0.862789,0.870711,0.801829,0.523361,0.759645,0.255658,0.846572,0.706927,0.828811,0.650143,0.520192,0.859245,0.877838,0.845068,0.823115,0.832638,0.820174,0.713373,0.744256,0.63149,0.838067,0.803255,0.949648,0.927971,0.925918,0.925094,0.873191,0.936026,0.805742,0.0159475,0.0159475,0.0159475,0.0159475,0.0159475,
3,0.296742,0.665415,0.871302,0.582619,0.497354,0.682825,0.631577,0.886081,0.73729,0.811976,0.783776,0.944722,0.944649,0.824705,0.48247,0.782446,0.948976,0.215256,0.968639,0.953065,0.957659,0.984358,0.991643,0.963428,0.971468,0.978093,0.961853,0.948934,0.973747,0.968756,0.975677,0.971936,0.972258,0.97606,0.918634,0.925263,0.936637,0.89142,0.913251,0.898842,0.937417,0.960686,0.951899,0.957209,0.958152,0.954677,0.971396,0.964293,0.956193,0.929456,0.942956,0.953794,0.953284,0.92983,0.943104,0.965876,0.904157,0.976317,0.966667,0.970109,0.941409,0.936676,0.985943,0.986717,0.985872,0.98721,0.986788,0.979229,0.96148,0.976815,0.968533,0.98442,0.971325,0.99387,0.996928,0.989669,0.991351,0.977738,0.992612,0.94724,0.0117851,0.0117851,0.0117851,0.0117851,0.0117851,
4,0.145535,0.657762,0.889277,0.546722,0.405812,0.687309,0.648103,0.894879,0.768189,0.866643,0.490267,0.399905,0.435513,0.970041,0.322118,0.963148,0.977982,0.144389,0.985345,0.979684,0.949316,0.974645,0.985897,0.952089,0.971145,0.971635,0.94878,0.937203,0.968352,0.954062,0.964884,0.965623,0.967226,0.943475,0.886731,0.898462,0.923718,0.852524,0.899257,0.862239,0.925471,0.946448,0.934683,0.937352,0.945442,0.941271,0.963889,0.945819,0.947453,0.900711,0.917076,0.934173,0.935701,0.903481,0.589048,0.793387,0.484161,0.868564,0.770098,0.836461,0.662449,0.747994,0.942416,0.940141,0.949711,0.952089,0.953338,0.932388,0.869111,0.910435,0.861963,0.951714,0.898197,0.983442,0.965287,0.974936,0.967339,0.961045,0.978955,0.906656,0.0154267,0.0154267,0.0154267,0.0154267,0.0154267,
5,0.388457,0.677761,0.855699,0.407768,0.380009,0.717212,0.639617,0.888825,0.730425,0.878615,0.839448,0.793868,0.784546,0.811126,0.623179,0.816675,0.780562,0.38479,0.757472,0.783566,0.641228,0.637162,0.872724,0.649113,0.847827,0.807392,0.78135,0.781188,0.857559,0.816254,0.840156,0.841414,0.815456,0.81879,0.744547,0.668896,0.651489,0.604646,0.572155,0.499754,0.677383,0.851578,0.751769,0.773029,0.832935,0.711689,0.853213,0.779439,0.752649,0.503161,0.623385,0.750233,0.66733,0.423995,0.548603,0.812034,0.331667,0.78407,0.673732,0.83524,0.679036,0.667426,0.825699,0.831614,0.860986,0.850894,0.857469,0.83101,0.660123,0.770428,0.5718,0.892846,0.79792,0.850279,0.568512,0.672097,0.582017,0.677856,0.793545,0.65391,0.00462122,0.00462122,0.00462122,0.00462122,0.00462122,


Posibles clusters a dividir: [0, 1, 2, 3, 4, 5]


In [None]:
def mix_clusters():
    global centroids, NUM_CLUSTERS
    
    # Matriz triangular superior de distancias entre centroides
    dist_lists = []
    for i, rc_i in centroids.iterrows():
        dist_lists.append([])
        for j, rc_j in centroids.iterrows():
            if j <= i:
                dist_lists[i].append(LARGER_DISTANCE)
            else:
                dist_lists[i].append(distance_qual(rc_i, rc_j))
    dist_matrix = np.array(dist_lists)
    
    to_eliminate = []
    # to_eliminate contendrá la mitad de los clusters unidos...
    while (dist_matrix.min() < LARGER_DISTANCE and len(to_eliminate) < P_MAX/2) :
        dist_min = dist_matrix.min()
        idx = (dist_matrix==dist_min).argmax()
        z1 = idx // len(centroids)
        z2 = idx % len(centroids)
        
        if dist_min < L_MIN:
            if TALK:
                print("Unificando clusters {} y {}".format(z1, z2))
                for i in range(NUM_CLUSTERS):
                    members = df[df["Cluster"]==i].count()["Cluster"]
                    print("El cluster ", i, " incluye ", members, "miembros.")
                print()

            # Modificar z1 para contener el centroide entre z1 y z2
            centroids.iloc[z1] = get_centroide(centroids.iloc[[z1, z2]]).loc[0]
            # Marcar puntos en z1 y z2 para reclasificar
            df.loc[df.Cluster == z1, 'Cluster'] = np.nan
            df.loc[df.Cluster == z2, 'Cluster'] = np.nan
            
            # Marcar z2 para eliminación
            to_eliminate.append(z2)
        
        dist_matrix[z1][z2] = LARGER_DISTANCE
        
    if len(to_eliminate) > 0:
        centroids.drop(to_eliminate, inplace=True)
        centroids = centroids.reset_index(drop=True)
        
        # Reetiquetar los registros afectados
        eliminated = 0
        for i in to_eliminate:
            i_e = i - eliminated
            # Recorrer las etiquetas para coincidir con los nuevos índices
            for cj in range(i_e + 1, NUM_CLUSTERS):
                df.loc[df.Cluster == cj, 'Cluster'] = cj - 1
            # Actualizar el número actual de centroides
            NUM_CLUSTERS -= 1
            eliminated += 1
            
        cluster_col_index = df.shape[1] - 1
        for index, row in df[pd.isnull(df["Cluster"])].iterrows():
            dists = []
            for i, r in centroids.iterrows():
                dists.append(distance_qual(row, r))
            df.iloc[index, cluster_col_index] = np.argmin(dists)
        update_centroids()
            
        if (TALK) : 
            # Contabilizar los elementos en cada cluster   
            for i in range(NUM_CLUSTERS):
                members = df[df["Cluster"]==i].count()["Cluster"]
                print("El cluster ", i, " incluye ", members, "miembros")
            print()

    return

#mix_clusters()

In [None]:
# Reproducido aquí para facilitar la ejecución
#iteration +=1 #usar si se está probando dividir/unir demostrativo

I_MAX_INT = 5 # Iteraciones permitidas en cada ciclo k-means

while iteration < I_MAX:
    if (iteration % 2 == 1 or NUM_CLUSTERS <= K_INIT / 2) :
        update_deltas()
        divide_clusters()
    elif (iteration % 2 == 0 or NUM_CLUSTERS > 2 * K_INIT) :
        mix_clusters()
        
    step = 0
    KEEP_WALKING = True
    while KEEP_WALKING and step < I_MAX_INT :
        KEEP_WALKING = update_clusters()
        update_centroids()
            
    iteration += 1
    
if TALK : 
    print ("No más cambios.")

Unificando clusters 3 y 4
El cluster  0  incluye  13773 miembros.
El cluster  1  incluye  13482 miembros.
El cluster  2  incluye  4408 miembros.
El cluster  3  incluye  12268 miembros.
El cluster  4  incluye  4432 miembros.
El cluster  5  incluye  50080 miembros.

El cluster  0  incluye  17278 miembros
El cluster  1  incluye  16775 miembros
El cluster  2  incluye  8139 miembros
El cluster  3  incluye  2757 miembros
El cluster  4  incluye  53494 miembros

Actualizando clusters
El cluster  0  incluye  18845 miembros.
El cluster  1  incluye  17329 miembros.
El cluster  2  incluye  15643 miembros.
El cluster  3  incluye  8467 miembros.
El cluster  4  incluye  38159 miembros.


El cluster  0  incluye  18845 miembros.
El cluster  1  incluye  17329 miembros.
El cluster  2  incluye  15643 miembros.
El cluster  3  incluye  8467 miembros.
El cluster  4  incluye  38159 miembros.

Actualizando clusters
El cluster  0  incluye  18682 miembros.
El cluster  1  incluye  18697 miembros.
El cluster  2  i

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 12, 16]",7,11,5,5,0,1,5,7,0.011265,[8],3,[8],[5],[0],0,2,2,"[27, 18, 14]",[14],[14],[14],"[14, 2]",[5],[5],"[10, 15]",3,1,[0],[1],2,2,3,[6],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,3,4,4,2,3,5,1,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5]","[1, 9, 7, 2, 4, 3, 10, 5, 8, 6]","[1, 11, 2, 10, 8, 3, 7, 4, 9, 6, 5]","[1, 2, 3, 4, 5]","[1, 4, 7, 2, 6, 5, 3]","[1, 5, 2, 4, 6, 7, 3]",
1,1,0,USA,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.078202,"[8, 5]",3,"[8, 7, 1]","[5, 7, 3]",[0],0,2,1,"[18, 27, 14, 5, 31]","[14, 19]","[14, 13]","[14, 22, 2]","[14, 2, 22]","[5, 1]","[5, 1]","[18, 19, 10]",3,1,"[0, 9]","[1, 6]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,4,4,5,2,2,3,[0],3,1,[6],0,0,1,2,"[14, 18, 5, 31, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 7, 5, 11, 6, 8]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,IND,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.092327,"[8, 4, 7]",3,"[8, 1, 7]","[5, 7, 3]","[0, 4]",0,0,1,"[18, 14, 27, 5, 31]","[14, 19, 17]","[13, 14, 17]","[14, 2, 22]","[14, 2, 0]","[5, 1]","[5, 6]","[19, 10, 15]",3,2,"[0, 9]","[1, 4]",2,1,0,[0],1,1,0,"[0, 3]",1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,3,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17, 1]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 6, 4, 11, 5, 9, 8, 7]","[2, 1, 5, 3, 4]","[1, 6, 7, 2, 3, 5, 4]","[1, 4, 2, 3, 6, 7, 5]",
3,0,0,IND,0,0,1,6,4,"[0, 12, 11]",7,0,4,4,6,2,3,7,0.127085,"[8, 4]",3,"[8, 7]","[5, 7]",[4],0,0,2,"[18, 27, 14]","[14, 19]","[14, 13]","[14, 22]","[14, 2]",[5],"[5, 1]","[18, 10]",3,2,"[0, 9]",[1],2,1,0,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,2,2,4,3,3,6,2,0,3,[2],3,1,[6],1,0,1,0,"[18, 14, 5, 31]","[9, 7, 8, 2, 1, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 5, 8, 4, 11, 6, 7]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 3, 2, 5, 7, 6, 4]",
4,1,0,USA,0,0,1,6,4,"[0, 12, 11, 6]",9,7,3,3,6,2,3,7,0.165682,"[8, 4, 5]",3,"[8, 7, 1]","[7, 5, 0]",[0],0,1,1,"[18, 27, 14, 5, 31]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.19056,0.654078,0.888916,0.65127,0.685375,0.729714,0.799326,0.685173,0.819736,0.743284,0.460668,0.341308,0.399472,0.963694,0.364057,0.853871,0.962685,0.145618,0.987138,0.984922,0.936483,0.965448,0.985608,0.942518,0.956291,0.950759,0.923472,0.919983,0.955346,0.946479,0.944221,0.959398,0.961032,0.945778,0.885725,0.875764,0.935336,0.843929,0.914987,0.820646,0.884031,0.950792,0.929586,0.934531,0.939331,0.927132,0.955826,0.948713,0.934145,0.90123,0.91726,0.931377,0.935392,0.899411,0.851135,0.930213,0.788803,0.935956,0.884156,0.915321,0.823314,0.889853,0.953791,0.958002,0.958379,0.956639,0.960433,0.952451,0.921608,0.931377,0.903384,0.962148,0.937823,0.972926,0.934319,0.959189,0.91844,0.938917,0.95632,0.893126,0.00745149,0.00745149,0.00745149,0.00745149,0.00745149,
1,0.332946,0.589102,0.866051,0.61731,0.693555,0.752887,0.724858,0.755771,0.736409,0.789593,0.675268,0.823583,0.839778,0.817336,0.635389,0.779796,0.799756,0.320637,0.887626,0.890748,0.733713,0.748115,0.934983,0.752967,0.834867,0.844978,0.81638,0.815949,0.888813,0.833362,0.859841,0.882622,0.875917,0.835339,0.716339,0.751501,0.796855,0.727106,0.776733,0.476205,0.64359,0.829391,0.813315,0.824993,0.866739,0.759284,0.851687,0.842187,0.778113,0.619238,0.742164,0.770338,0.756796,0.599332,0.629215,0.834652,0.501681,0.839529,0.758801,0.836721,0.704806,0.735266,0.858587,0.890146,0.838535,0.823112,0.834474,0.840913,0.761506,0.775273,0.707465,0.903524,0.794972,0.885716,0.723018,0.776387,0.632427,0.742485,0.8264,0.720732,0.00771999,0.00771999,0.00771999,0.00771999,0.00771999,
2,0.277739,0.549227,0.869213,0.507696,0.498094,0.698194,0.654449,0.877989,0.72098,0.84753,0.838993,0.792981,0.790468,0.830686,0.637598,0.801424,0.828428,0.338044,0.834939,0.854545,0.739948,0.77549,0.897254,0.763539,0.834902,0.8423,0.823866,0.811906,0.871278,0.850887,0.87013,0.865754,0.853682,0.854061,0.791947,0.737889,0.743853,0.694921,0.688954,0.69945,0.69945,0.69945,0.803903,0.824888,0.858815,0.781131,0.870693,0.850632,0.801424,0.651042,0.736262,0.809959,0.783439,0.631511,0.633131,0.766828,0.471536,0.864893,0.699994,0.873344,0.785119,0.646396,0.85936,0.881216,0.881382,0.836268,0.845144,0.845144,0.744011,0.78385,0.717256,0.895173,0.814463,0.884928,0.794499,0.776121,0.741608,0.730836,0.845283,0.72936,0.00765032,0.00765032,0.00765032,0.00765032,0.00765032,
3,0.774866,0.457982,0.897771,0.467206,0.442824,0.704218,0.694319,0.855145,0.740205,0.837328,0.771432,0.881248,0.87974,0.816922,0.613243,0.808287,0.833924,0.369368,0.861054,0.833469,0.744275,0.797205,0.960466,0.780718,0.844994,0.815993,0.86424,0.802982,0.896255,0.853176,0.898121,0.922503,0.881578,0.855839,0.707978,0.724919,0.757617,0.624977,0.744154,0.66886,0.66886,0.66886,0.811799,0.841283,0.849247,0.828429,0.872271,0.845779,0.810279,0.677872,0.736211,0.789294,0.754279,0.641654,0.644014,0.807113,0.52149,0.841621,0.69541,0.810396,0.566965,0.757039,0.830049,0.830506,0.87974,0.841057,0.856474,0.812149,0.724134,0.808053,0.66445,0.868684,0.774131,0.88522,0.756245,0.776577,0.706235,0.691856,0.820397,0.756433,0.0137712,0.0137712,0.0137712,0.0137712,0.0137712,
4,0.368373,0.653084,0.829834,0.37813,0.345894,0.717604,0.644627,0.865278,0.726317,0.845734,0.80422,0.762377,0.762377,0.808715,0.625978,0.830867,0.798159,0.392777,0.76975,0.796393,0.68469,0.683935,0.91289,0.695521,0.826042,0.810734,0.809036,0.799902,0.875092,0.823268,0.854673,0.857071,0.851972,0.82511,0.751644,0.647183,0.678224,0.638805,0.610904,0.395904,0.610506,0.824784,0.790887,0.805145,0.861566,0.734565,0.836014,0.786948,0.77501,0.57046,0.652364,0.754502,0.692069,0.479409,0.59152,0.813962,0.425842,0.803938,0.66999,0.857876,0.73225,0.717153,0.850162,0.813723,0.847455,0.831763,0.832696,0.808935,0.682839,0.790457,0.613917,0.899664,0.801114,0.871466,0.574438,0.690383,0.63611,0.651818,0.807213,0.69,0.00568954,0.00568954,0.00568954,0.00568954,0.00430089,


Posibles clusters a dividir: [0, 1, 2, 3, 4]
Clusters a eliminar: [3]

Actualizando clusters
El cluster  0  incluye  4234 miembros.
El cluster  1  incluye  14955 miembros.
El cluster  2  incluye  15619 miembros.
El cluster  3  incluye  30071 miembros.
El cluster  4  incluye  2707 miembros.
El cluster  5  incluye  30857 miembros.


El cluster  0  incluye  4234 miembros.
El cluster  1  incluye  14955 miembros.
El cluster  2  incluye  15619 miembros.
El cluster  3  incluye  30071 miembros.
El cluster  4  incluye  2707 miembros.
El cluster  5  incluye  30857 miembros.



Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,0,1,2,7,0.047809,[8],3,"[8, 7, 1]","[5, 7, 3]",[0],0,2,2,"[27, 18, 14, 5, 17]","[14, 17]","[14, 13]","[14, 22, 2]","[14, 2, 18]",[5],"[5, 6]","[10, 15, 17]",3,1,"[0, 9]","[1, 2]",2,2,3,"[6, 1]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,3,4,4,2,3,5,1,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5, 27, 17, 31]","[1, 9, 6, 2, 4, 3, 10, 5, 8, 7]","[1, 11, 2, 10, 8, 3, 7, 4, 9, 6, 5]","[1, 2, 5, 4, 3]","[1, 4, 7, 2, 6, 5, 3]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.093088,"[8, 5]",3,"[8, 7, 1]","[5, 7, 3, 8]","[0, 4]",0,2,1,"[18, 27, 14, 5, 31, 17]","[14, 19, 17]","[14, 13, 19]","[14, 22, 2]","[14, 2, 22]","[5, 1]","[5, 1, 6]","[18, 19, 10, 15]",3,1,"[0, 9]","[1, 6]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,4,4,4,5,2,2,3,[0],3,1,[6],0,0,1,2,"[14, 18, 5, 31, 17, 27, 3]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 7, 5, 11, 6, 8]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,IND,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.115608,"[8, 4, 7]",3,"[8, 7, 1, 5]","[5, 7, 3]","[0, 4]",0,0,1,"[18, 14, 27, 5, 31, 17]","[14, 19, 17]","[13, 14, 17]","[14, 2, 22]","[14, 2, 0, 8]","[5, 1]","[5, 6, 1]","[19, 10, 15, 18]",3,2,"[0, 9, 4]","[1, 4]",2,1,0,"[0, 1]",1,1,0,"[0, 3]",1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,3,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17, 1, 27]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 6, 4, 11, 5, 9, 8, 7]","[2, 1, 5, 3, 4]","[1, 6, 7, 2, 3, 5, 4]","[1, 4, 2, 5, 6, 7, 3]",
3,1,0,USA,0,0,1,6,4,"[0, 12, 11, 6]",9,7,3,3,6,2,3,7,0.180025,"[8, 4, 5]",3,"[8, 7, 1, 5]","[7, 5, 0, 8]","[0, 4]",0,1,1,"[18, 14, 27, 5, 31]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
4,1,0,USA,0,0,3,6,3,"[0, 12, 11]",9,0,3,1,0,1,4,7,0.147402,"[5, 4]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,2,2,"[31, 27, 18, 14]","[19, 14]","[19, 14]","[22, 14]","[22, 14]",[0],[0],"[18, 10, 17]",3,2,"[0, 9]","[1, 4]",2,1,0,"[0, 1]",1,1,1,"[3, 2]",0,3,0,1,1,0,2,2,10,1,2,4,1,2,5,1,2,2,2,2,2,6,1,2,3,[2],0,1,[6],2,0,1,2,"[31, 18, 14, 5, 27]","[8, 9, 4, 1, 2, 3, 10, 5, 7, 6]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 3, 7, 2, 5, 6, 4]","[1, 4, 2, 5, 6, 7, 3]",
5,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,11,5,5,6,1,5,7,0.011626,[8],3,[8],[5],[0],0,0,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,3,4,4,4,6,2,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 9, 7, 5, 11, 6, 4]","[2, 1, 5, 4, 3]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 3, 6, 7, 5]",



Actualizando clusters
El cluster  0  incluye  10503 miembros.
El cluster  1  incluye  18281 miembros.
El cluster  2  incluye  18433 miembros.
El cluster  3  incluye  30140 miembros.
El cluster  4  incluye  9018 miembros.
El cluster  5  incluye  12068 miembros.


El cluster  0  incluye  10503 miembros.
El cluster  1  incluye  18281 miembros.
El cluster  2  incluye  18433 miembros.
El cluster  3  incluye  30140 miembros.
El cluster  4  incluye  9018 miembros.
El cluster  5  incluye  12068 miembros.

Actualizando clusters
El cluster  0  incluye  10534 miembros.
El cluster  1  incluye  17374 miembros.
El cluster  2  incluye  18815 miembros.
El cluster  3  incluye  30004 miembros.
El cluster  4  incluye  9974 miembros.
El cluster  5  incluye  11742 miembros.


El cluster  0  incluye  10534 miembros.
El cluster  1  incluye  17374 miembros.
El cluster  2  incluye  18815 miembros.
El cluster  3  incluye  30004 miembros.
El cluster  4  incluye  9974 miembros.
El cluster  5  incluye  11742 miem

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,0,1,5,7,0.008914,[8],3,[8],[5],[0],0,0,2,"[27, 18, 14]",[14],[14],[14],"[14, 2]",[5],[5],"[15, 10]",3,1,[0],[1],2,2,3,[6],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,3,3,4,3,3,5,1,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5]","[1, 9, 6, 2, 4, 3, 10, 5, 8, 7]","[1, 11, 2, 10, 9, 3, 6, 4, 8, 7, 5]","[1, 2, 5, 4, 3]","[1, 4, 7, 2, 5, 6, 3]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.075571,"[8, 5]",3,"[8, 7, 1]","[5, 7, 3]",[0],0,2,1,"[18, 27, 14, 5, 31]","[14, 19]","[14, 13]","[14, 22]","[14, 2, 22]","[5, 1]","[5, 6]","[19, 18, 10]",3,1,"[0, 9]","[1, 6]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,3,4,4,4,5,2,2,3,[0],3,1,[6],0,0,1,2,"[14, 18, 5, 31, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 4, 7, 5, 11, 6, 9]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,IND,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.093674,"[8, 4]",3,"[8, 7, 1]","[5, 7, 3]","[4, 0]",0,0,1,"[18, 14, 27, 5, 31]","[14, 19, 17]","[13, 14, 17]","[14, 2, 22]","[14, 2, 0]","[5, 1]","[5, 6]","[19, 10, 15]",3,2,"[0, 9]","[1, 4]",2,1,0,[0],1,1,0,"[0, 3]",1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,3,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17, 1]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 6, 9, 11, 4, 8, 7, 5]","[2, 1, 5, 3, 4]","[1, 6, 7, 2, 3, 5, 4]","[1, 5, 2, 4, 6, 7, 3]",
3,1,0,USA,0,0,1,6,4,"[0, 12, 11, 6]",9,7,3,3,6,2,3,7,0.164374,"[8, 4, 5]",3,"[8, 7, 1]","[7, 5, 0]",[0],0,1,1,"[18, 27, 14, 5, 31]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
4,1,0,USA,0,0,3,6,3,"[0, 12, 11]",9,0,3,1,0,1,4,7,0.140555,"[5, 4]",3,"[8, 7, 5]","[7, 5, 0]",[0],0,2,2,"[18, 3, 31, 14]","[19, 14]","[19, 14]","[22, 14]","[14, 22]",[0],"[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,[1],1,1,1,[3],0,3,0,1,1,0,2,2,10,1,2,4,1,2,5,1,2,2,2,2,2,6,1,2,3,[2],0,1,[6],2,0,1,2,"[18, 14, 31, 5, 3]","[9, 10, 5, 2, 1, 3, 6, 4, 8, 7]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 4, 3, 2, 5, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.214824,0.65009,0.879222,0.669913,0.70213,0.718533,0.801337,0.668501,0.834821,0.726103,0.450212,0.320231,0.388866,0.974628,0.40254,0.848838,0.964649,0.129441,0.987734,0.985959,0.93993,0.966832,0.986116,0.942553,0.957021,0.953574,0.922178,0.918366,0.953739,0.948505,0.943378,0.958742,0.959703,0.945248,0.88768,0.868806,0.936361,0.846962,0.918462,0.832527,0.88697,0.951571,0.926933,0.932874,0.93768,0.930776,0.952343,0.95126,0.930693,0.901949,0.924644,0.931861,0.938352,0.904517,0.850321,0.924768,0.789532,0.938382,0.889098,0.915277,0.828977,0.884352,0.951651,0.956063,0.955194,0.95138,0.952433,0.955044,0.924118,0.929215,0.908874,0.962018,0.934714,0.971981,0.942589,0.957081,0.917277,0.940333,0.953364,0.894416,0.00756686,0.00756686,0.00756686,0.00756686,0.00756686,
1,0.351368,0.586691,0.875671,0.622807,0.694228,0.738269,0.728051,0.755451,0.733919,0.773028,0.650849,0.826806,0.842559,0.803974,0.614427,0.763028,0.801651,0.313766,0.888987,0.886754,0.735121,0.751387,0.936223,0.752516,0.829478,0.848503,0.812488,0.815846,0.883136,0.853461,0.862991,0.880729,0.861421,0.84012,0.725877,0.744026,0.799467,0.725211,0.783627,0.499826,0.648124,0.829916,0.808405,0.82032,0.857949,0.758495,0.846713,0.845405,0.76687,0.619091,0.748262,0.770858,0.762779,0.605253,0.628364,0.820628,0.489991,0.841057,0.755325,0.834871,0.710429,0.721247,0.848167,0.853184,0.81951,0.810942,0.820936,0.847608,0.754865,0.788212,0.717336,0.901958,0.783345,0.885934,0.741082,0.76687,0.626248,0.753356,0.828982,0.718111,0.00795079,0.00795079,0.00795079,0.00795079,0.00795079,
2,0.324248,0.579578,0.860844,0.509993,0.492863,0.685498,0.655391,0.881159,0.721469,0.83693,0.828748,0.806014,0.799416,0.824071,0.622407,0.792798,0.827711,0.337665,0.853336,0.851783,0.742513,0.776407,0.900164,0.763772,0.831473,0.847199,0.820999,0.809698,0.86809,0.850746,0.869463,0.864247,0.85315,0.852923,0.786707,0.738306,0.741298,0.693347,0.691445,0.669182,0.669182,0.669182,0.799165,0.823515,0.851245,0.778683,0.865951,0.850303,0.794494,0.64876,0.738383,0.807115,0.783608,0.633577,0.627447,0.767474,0.466418,0.864263,0.705384,0.868757,0.777371,0.639559,0.863003,0.877056,0.883852,0.838468,0.849797,0.851346,0.732505,0.776819,0.716659,0.898989,0.827157,0.881192,0.802968,0.766167,0.733832,0.724052,0.853093,0.726363,0.00756729,0.00756729,0.00756729,0.00756729,0.00756729,
3,0.375701,0.655012,0.835166,0.387378,0.349828,0.697408,0.642714,0.855834,0.726433,0.842112,0.790413,0.771592,0.758416,0.792283,0.595555,0.816906,0.79871,0.389346,0.768233,0.793503,0.681258,0.681849,0.911606,0.695363,0.82518,0.805086,0.806343,0.79818,0.868939,0.826424,0.850247,0.849271,0.845279,0.831997,0.76257,0.651154,0.673093,0.635912,0.605769,0.420774,0.609376,0.822173,0.787219,0.799281,0.851192,0.731666,0.840505,0.788656,0.759932,0.562668,0.65621,0.75361,0.693364,0.480881,0.576992,0.797726,0.408263,0.803701,0.644558,0.8606,0.748724,0.692732,0.841005,0.812761,0.841244,0.820475,0.820609,0.805789,0.663488,0.776664,0.619804,0.898395,0.788332,0.867447,0.584241,0.668096,0.624032,0.643878,0.811434,0.684956,0.00604321,0.00604321,0.00604321,0.00604321,0.00456823,
4,0.410733,0.600256,0.844122,0.373148,0.410856,0.788117,0.681864,0.878733,0.743724,0.901181,0.897591,0.809339,0.8446,0.792649,0.674798,0.791375,0.818882,0.399339,0.85877,0.851678,0.739168,0.743153,0.940927,0.760359,0.865894,0.864145,0.849921,0.822341,0.901026,0.81663,0.864856,0.898022,0.884861,0.80427,0.69032,0.689296,0.753104,0.703176,0.704283,0.552555,0.789332,0.845734,0.842566,0.861338,0.857052,0.787162,0.860986,0.816661,0.811766,0.673077,0.678377,0.788821,0.723086,0.573082,0.686435,0.829289,0.569638,0.827401,0.725802,0.837341,0.589571,0.774948,0.767031,0.7292,0.787669,0.806279,0.805778,0.856227,0.802956,0.772275,0.619366,0.897413,0.826974,0.904255,0.60908,0.850671,0.7425,0.747916,0.844122,0.742145,0.0100448,0.0100448,0.0100448,0.0100448,0.0100448,


Posibles clusters a dividir: [0, 1, 2, 3, 4]
Actualizando clusters
El cluster  0  incluye  18341 miembros.
El cluster  1  incluye  18804 miembros.
El cluster  2  incluye  20050 miembros.
El cluster  3  incluye  29832 miembros.
El cluster  4  incluye  11416 miembros.

Unificando clusters 1 y 2
El cluster  0  incluye  18341 miembros.
El cluster  1  incluye  18804 miembros.
El cluster  2  incluye  20050 miembros.
El cluster  3  incluye  29832 miembros.
El cluster  4  incluye  11416 miembros.

El cluster  0  incluye  21632 miembros
El cluster  1  incluye  23926 miembros
El cluster  2  incluye  39575 miembros
El cluster  3  incluye  13310 miembros

Actualizando clusters
El cluster  0  incluye  21583 miembros.
El cluster  1  incluye  25161 miembros.
El cluster  2  incluye  38576 miembros.
El cluster  3  incluye  13123 miembros.


El cluster  0  incluye  21583 miembros.
El cluster  1  incluye  25161 miembros.
El cluster  2  incluye  38576 miembros.
El cluster  3  incluye  13123 miembros.

Act

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,0,1,5,7,0.010582,[8],3,[8],[5],[0],0,0,2,"[27, 18, 14]","[14, 19]","[14, 13]","[14, 2]","[14, 2]",[5],[5],"[10, 15]",3,1,[0],[1],2,2,3,[6],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,3,3,4,3,3,5,1,0,3,[2],3,1,[6],0,0,1,0,"[14, 5, 18, 17]","[1, 9, 6, 2, 3, 4, 10, 5, 8, 7]","[1, 11, 2, 10, 9, 5, 6, 3, 8, 4, 7]","[1, 2, 5, 4, 3]","[1, 3, 7, 2, 5, 6, 4]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,0,3,3,6,2,3,7,0.079547,"[8, 5]",3,"[8, 7, 1]","[5, 7]",[4],0,0,1,"[18, 27, 14, 5]","[14, 19]","[13, 14, 17]","[14, 2]","[14, 2, 0]","[5, 1]","[5, 6]","[10, 19, 15]",3,1,"[0, 9]",[1],2,1,0,[0],1,1,0,[2],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,3,4,4,4,5,2,0,3,[0],0,1,[6],1,0,1,2,"[18, 14, 5, 31, 17]","[9, 8, 7, 1, 2, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 6, 9, 11, 4, 8, 7, 5]","[2, 1, 5, 4, 3]","[1, 6, 7, 2, 3, 5, 4]","[1, 5, 2, 4, 6, 7, 3]",
2,1,0,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,6,2,3,7,0.154061,"[8, 4, 5]",3,"[8, 7, 1]","[7, 5, 0]","[0, 4]",0,1,1,"[18, 27, 14, 5, 31]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,USA,0,0,3,6,3,"[0, 12, 11, 6]",9,0,3,1,0,1,4,7,0.140791,"[5, 8]",3,"[8, 7, 5]","[7, 5, 0]",[0],0,2,2,"[18, 31, 3, 14]","[19, 14]","[19, 14]","[22, 14]","[14, 22]","[0, 5]","[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[1, 6]",1,1,1,[3],0,3,0,1,1,0,2,2,10,1,2,4,1,2,5,1,2,2,2,2,2,6,2,2,3,[2],0,2,[6],1,0,1,2,"[18, 14, 5, 31, 3, 1]","[9, 10, 5, 2, 1, 3, 6, 4, 8, 7]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 4, 3, 2, 5, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.228436,0.659171,0.886551,0.690836,0.735782,0.730826,0.796511,0.645817,0.820647,0.723311,0.532802,0.354938,0.488938,0.966487,0.504731,0.870302,0.941713,0.139422,0.985604,0.982811,0.919853,0.954577,0.978681,0.921065,0.94582,0.941202,0.912001,0.899745,0.933768,0.924476,0.931719,0.951277,0.950846,0.935897,0.859728,0.845785,0.92932,0.819765,0.908329,0.784391,0.849164,0.936846,0.911846,0.917688,0.924482,0.910445,0.93489,0.938923,0.904265,0.865588,0.905462,0.909124,0.917216,0.872346,0.824836,0.910526,0.755697,0.927861,0.87411,0.901039,0.799406,0.863221,0.935585,0.947244,0.941151,0.944189,0.945183,0.940818,0.901653,0.924274,0.889289,0.957348,0.908965,0.957731,0.923297,0.915114,0.875431,0.915876,0.943271,0.860307,0.00693826,0.00693826,0.00693826,0.00693826,0.00693826,
1,0.351025,0.676142,0.858869,0.542408,0.555172,0.683014,0.678476,0.846576,0.73135,0.799468,0.749601,0.831589,0.807641,0.806797,0.604341,0.770403,0.830536,0.317724,0.874426,0.865049,0.763971,0.810101,0.948195,0.772169,0.842832,0.849787,0.836521,0.82146,0.871681,0.879737,0.879042,0.876994,0.859363,0.855503,0.775438,0.778755,0.773396,0.628252,0.747749,0.722235,0.722235,0.722235,0.807129,0.827572,0.852559,0.847415,0.859011,0.860849,0.798098,0.666796,0.757701,0.808544,0.795503,0.655131,0.64367,0.791273,0.49955,0.865386,0.731304,0.859322,0.769202,0.668981,0.863726,0.880957,0.836343,0.819688,0.833225,0.852131,0.754678,0.80746,0.728705,0.902124,0.842861,0.881123,0.825779,0.803747,0.726365,0.734689,0.852131,0.742167,0.00697461,0.00697461,0.00697461,0.00697461,0.00697461,
2,0.362598,0.692562,0.834941,0.419138,0.40261,0.695497,0.652434,0.867307,0.726851,0.854187,0.80981,0.791929,0.761945,0.788248,0.585938,0.805972,0.796909,0.385339,0.778797,0.803514,0.679171,0.688201,0.877795,0.698674,0.840721,0.804872,0.804904,0.797796,0.866283,0.828972,0.850918,0.849177,0.842611,0.832713,0.760568,0.65746,0.682815,0.64112,0.619065,0.424075,0.610976,0.821773,0.784079,0.799148,0.84775,0.731404,0.855948,0.795871,0.752529,0.560756,0.662671,0.754038,0.699899,0.487101,0.575053,0.793605,0.40228,0.808368,0.636513,0.861398,0.758703,0.678804,0.835719,0.837748,0.839737,0.836672,0.838206,0.826071,0.684241,0.791649,0.633654,0.89402,0.784813,0.860163,0.601929,0.713787,0.62785,0.665605,0.810994,0.686479,0.00543374,0.00543374,0.00543374,0.00543374,0.00543374,
3,0.400708,0.628578,0.845621,0.403709,0.434818,0.787745,0.681308,0.883187,0.746126,0.895706,0.874993,0.818005,0.845253,0.807562,0.695895,0.816719,0.808044,0.393623,0.845122,0.846496,0.729774,0.733949,0.933161,0.749708,0.85355,0.862128,0.847468,0.821705,0.900956,0.812205,0.860733,0.871584,0.881806,0.805765,0.697293,0.682963,0.748205,0.692217,0.691855,0.511585,0.741877,0.836652,0.837897,0.852363,0.852774,0.776458,0.864608,0.807659,0.81165,0.650015,0.681022,0.779097,0.715536,0.559078,0.665348,0.831085,0.543254,0.821852,0.722737,0.832021,0.585342,0.779047,0.782986,0.748096,0.801659,0.823792,0.821946,0.850671,0.718577,0.753696,0.618588,0.894974,0.821093,0.86753,0.591285,0.796641,0.701135,0.727355,0.835384,0.725146,0.00882437,0.00882437,0.00882437,0.00882437,0.00882437,


Posibles clusters a dividir: [0, 1, 2, 3]
Clusters a eliminar: [1]

Actualizando clusters
El cluster  0  incluye  17775 miembros.
El cluster  1  incluye  41807 miembros.
El cluster  2  incluye  13785 miembros.
El cluster  3  incluye  14870 miembros.
El cluster  4  incluye  10206 miembros.


El cluster  0  incluye  17775 miembros.
El cluster  1  incluye  41807 miembros.
El cluster  2  incluye  13785 miembros.
El cluster  3  incluye  14870 miembros.
El cluster  4  incluye  10206 miembros.



Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,2,1,5,7,0.018582,[8],3,"[8, 7]","[5, 7]",[0],0,0,2,"[18, 27, 14, 5]","[14, 19]","[14, 13]","[14, 2]","[14, 2, 22]",[5],"[5, 6]","[10, 15, 19]",3,1,[0],[1],2,2,3,[6],1,1,0,[3],1,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,3,3,4,3,3,5,1,0,3,[2],3,1,[6],0,0,1,0,"[14, 5, 18, 17, 31]","[1, 9, 6, 2, 3, 4, 10, 5, 8, 7]","[1, 11, 2, 10, 9, 5, 6, 3, 8, 4, 7]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 5, 6, 4]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,6,2,3,7,0.154164,"[8, 4, 5]",3,"[8, 7, 1, 5]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 14, 27, 5, 31, 1]","[14, 19, 17]","[17, 13, 14]","[14, 22, 0]","[14, 0, 2]","[5, 1]","[5, 6, 1]","[19, 18, 10]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,3,2,3,3,3,6,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
2,1,0,USA,0,0,3,6,3,"[0, 12, 11, 6]",9,0,3,3,0,1,4,7,0.150769,"[5, 8, 4]",3,"[8, 7, 5]","[7, 5, 0]",[0],0,2,2,"[18, 14, 31, 3, 5]","[19, 14, 17]","[19, 14]","[22, 14, 2]","[14, 22, 2]","[0, 5]","[0, 5]","[18, 10, 19]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[1, 6]",1,1,1,"[3, 2]",0,3,0,1,1,0,2,2,10,1,2,4,1,2,5,1,2,2,2,2,2,6,2,2,3,[2],0,2,[6],1,0,1,2,"[18, 14, 5, 31, 3, 1]","[9, 10, 5, 2, 1, 3, 6, 4, 8, 7]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 4, 3, 2, 5, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,0,5,5,2,2,3,7,0.032454,[8],0,[8],[5],[4],0,0,4,"[27, 18]",[14],[13],[2],[2],[5],[5],"[0, 15]",2,1,[0],[1],2,2,3,[0],3,3,3,[3],2,3,3,1,0,0,1,2,10,5,2,4,2,1,5,2,3,3,4,4,4,6,2,0,3,[2],3,2,[5],1,0,1,2,"[17, 18, 14]","[9, 8, 7, 1, 2, 5, 10, 3, 6, 4]","[1, 2, 3, 10, 6, 9, 5, 4, 11, 7, 8]","[2, 1, 5, 4, 3]","[1, 5, 7, 2, 3, 6, 4]","[1, 5, 3, 4, 7, 6, 2]",
4,1,0,USA,0,0,3,6,8,"[0, 12, 11]",7,0,5,5,6,2,3,7,0.040848,[5],5,[8],[5],[4],2,0,2,"[27, 18]",[14],[14],[14],[14],[5],[9],[10],3,2,[0],[1],0,2,3,[0],1,1,2,[2],1,3,3,1,1,3,2,2,10,1,2,0,2,2,5,2,3,3,4,4,4,5,1,2,3,[0],0,2,[6],1,0,0,4,"[14, 18]","[10, 8, 6, 3, 1, 2, 7, 4, 9, 5]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",



Actualizando clusters
El cluster  0  incluye  22536 miembros.
El cluster  1  incluye  40893 miembros.
El cluster  2  incluye  15321 miembros.
El cluster  3  incluye  12096 miembros.
El cluster  4  incluye  7597 miembros.


El cluster  0  incluye  22536 miembros.
El cluster  1  incluye  40893 miembros.
El cluster  2  incluye  15321 miembros.
El cluster  3  incluye  12096 miembros.
El cluster  4  incluye  7597 miembros.

Actualizando clusters
El cluster  0  incluye  21138 miembros.
El cluster  1  incluye  36317 miembros.
El cluster  2  incluye  14254 miembros.
El cluster  3  incluye  15091 miembros.
El cluster  4  incluye  11643 miembros.


El cluster  0  incluye  21138 miembros.
El cluster  1  incluye  36317 miembros.
El cluster  2  incluye  14254 miembros.
El cluster  3  incluye  15091 miembros.
El cluster  4  incluye  11643 miembros.

Actualizando clusters
El cluster  0  incluye  20044 miembros.
El cluster  1  incluye  33940 miembros.
El cluster  2  incluye  13684 miembros.
El cluste

In [None]:
display(centroids)
for i in range(NUM_CLUSTERS):
    print("Cluster {}: {} elementos".format(i, df[df["Cluster"]==i].count()["Cluster"]))

std_devs = std_dev()
std_devs["Cluster"] = std_devs.mean(axis=1)
display(std_devs)

update_deltas()

dist_lists = []
for i, rc_i in centroids.iterrows():
    dist_lists.append([])
    for j, rc_j in centroids.iterrows():
        dist_lists[i].append(distance_qual(rc_i, rc_j))
display(np.array(dist_lists))