<div style="width:100%; overflow:hidden; background-color:#F1F1E6; padding: 10px; border-style: outset; color:#17469e">
    <div style="width: 80%; float: left;">
    <h2 align="center">Universidad de Sonora</h2>
    <hr style="border-width: 3px; border-color:#17469e">
          <h1>Reconocimiento de patrones: Preparación de los datos</h1>          
          <h4>Ramón Soto C. <a href="mailto:rsotoc@moviquest.com/">(rsotoc@moviquest.com)</a></h4>
    </div>
    <div style="float: right;">
    <img src="images/escudo_unison.png">
    </div>
</div>

## Caso de estudio: [*Stack Overflow 2018 Developer Survey*](https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey)

Como caso de estudio principal en el presente curso hemos seleccionado la encuesta de desarrolladores 2018 de *Stack Overflow* disponible en [Kaggle](https://www.kaggle.com). En este esta etapa realizaremos el análisis de agrupamientos.

### 4. Modelado - ISODATA

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
... ahora utilizamos la técnica ISODATA para identificar prototipos de clases. <br>Inicializamos el contexto y cargamos los datos:
</div>

In [1]:
"""
Reconocimiento de patrones: ISODATA
"""

#from scipy.spatial.distance import squareform

# Inicializar el ambiente
import sys
import numpy as np
import pandas as pd
import json
import pickle
#import math
import random
#import time

from IPython.display import display, HTML
from collections import Counter
from operator import itemgetter
#from scipy.spatial.distance import euclidean, pdist, squareform

np.set_printoptions(precision=2, suppress=True) # Cortar la impresión de decimales a 1
pd.set_option('display.max_columns', 130)
pd.set_option('max_colwidth', 80)

LARGER_DISTANCE = sys.maxsize
TALK = True # TALK = True, imprime resultados parciales

In [2]:
path = "Data sets/Stack Overflow Survey/"

# Recuperar encabezados de columnas en orden original
with open(path + 'survey_results_public_transformed.headers', 'rb') as file:  
    headers = pickle.load(file)

# Recuperar diccionarios... sólo por si se requieren
with open(path + 'survey_results_public_transformed.dicts', 'rb') as file:  
    dict_of_dicts = pickle.load(file)

with open(path + 'survey_results_public_transformed.json') as f:
    dict_json = json.load(f)
df = pd.DataFrame.from_dict(dict_json)
#df = df.sample(n=2000).reset_index(drop=True)

# Reordenar las columnas de acuerdo al orden original
df = df.reindex(headers, axis=1)

DATA_LEN = df.shape[0]

# Agregar una columna "cluster" inicializada a null 
df["Cluster"] = np.nan

In [3]:
var_str = ['Hobby', 'OpenSource', 'Country', 'Student', 'Employment', 'FormalEducation', 
         'UndergradMajor', 'CompanySize', 'YearsCoding', 'YearsCodingProf', 'UpdateCV', 
         'JobSatisfaction', 'CareerSatisfaction', 'HopeFiveYears', 'JobSearchStatus', 
         'LastNewJob', 'TimeFullyProductive', 'AgreeDisagree1', 'AgreeDisagree2', 
         'AgreeDisagree3', 'OperatingSystem', 'NumberMonitors', 'CheckInCode', 'AdBlocker', 
         'AdBlockerDisable', 'AdsAgreeDisagree1', 'AdsAgreeDisagree2', 'AdsAgreeDisagree3', 
         'AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture', 'EthicsChoice', 
         'EthicsReport', 'EthicsResponsible', 'EthicalImplications', 'HoursComputer', 
         'StackOverflowRecommend', 'StackOverflowVisit', 'StackOverflowHasAccount', 
         'StackOverflowParticipate', 'StackOverflowJobs', 'StackOverflowDevStory', 
         'StackOverflowJobsRecommend', 'StackOverflowConsiderMember', 'HypotheticalTools1', 
         'HypotheticalTools2', 'HypotheticalTools3', 'HypotheticalTools4', 'WakeTime', 
         'HypotheticalTools5', 'HoursOutside', 'SkipMeals', 'Exercise', 'EducationParents', 
         'Age', 'Dependents', 'SurveyTooLong', 'SurveyEasy']
var_list = ['DevType', 'CommunicationTools', 'EducationTypes', 'SelfTaughtTypes', 
         'HackathonReasons', 'LanguageDesireNextYear', 'DatabaseWorkedWith', 
         'DatabaseDesireNextYear', 'PlatformWorkedWith', 'PlatformDesireNextYear', 
         'FrameworkWorkedWith', 'FrameworkDesireNextYear', 'IDE', 'Methodology', 
         'VersionControl', 'AdBlockerReasons', 'AdsActions', 'ErgonomicDevices', 
         'RaceEthnicity', 'LanguageWorkedWith']
var_ranks = ['AssessJob', 'AssessBenefits', 'JobContactPriorities', 'JobEmailPriorities', 
             'AdsPriorities']
var_float = 'ConvertedSalary'

def distance_qual(x, y):
    # Número de variables; si var_float es array, modificar "+ 1" por "+ len(var_float)"
    numvars = len(var_str) + len(var_list) + len(var_ranks) + 1
    
    distancia = abs(x.ConvertedSalary - y.ConvertedSalary)
    if pd.isnull(distancia):
        distancia = 0
        numvars -= 1
        
    for col in var_str:
        if x[col] != y[col]:
            distancia += 1
        
    for col in var_list:
        num_vars = len(x[col]) + len(y[col])
        d = 0
        if num_vars > 0:
            d = (2*len(set(x[col] + y[col])) - num_vars) / num_vars
        distancia += d

    for col in var_ranks:
        d = 0
        max_vars = max(len(x[col]), len(y[col]))
        if len(x[col]) != 0 and len(y[col]) != 0:
            for v in range(len(x[col])):
                if x[col][v] != y[col][v]:
                    d += 1
        else:
            d += max_vars
        
        if d != 0:
            d /= max_vars
        distancia += d

    return distancia / numvars
    
def decode(dataframe):
    new_df = dataframe.copy(deep=True)
    
    for col in var_str:
        if col in list(dataframe) and col in dict_of_dicts:
            for index, row in dataframe.iterrows():
                value = dict_of_dicts[col][row[col]]
                new_df.at[clusters.index[index], col] = value
                
    for index, row in dataframe.iterrows():
        new_df.at[clusters.index[index], 'ConvertedSalary'] = row['ConvertedSalary'] * 200000
    
    for col in var_list:
        if col in list(dataframe):
            for index, row in dataframe.iterrows():
                values_list = row[col].copy()
                for i in range(len(values_list)):
                    values_list[i] = dict_of_dicts[col][values_list[i]]
                new_df.at[clusters.index[index], col] = values_list
                
    return new_df

<div style="margin-top: 6px; border: 1px solid #cfcfcf; padding: 8px 12px; border-radius:2px; background-color:#f7f7f7; ">
A continuación ejecutamos el algoritmo ISODATA:
</div>

1) Definir los valores de $k_{init}, n_{min}, I_{max}, \sigma_{max}, L_{min}$ y $P_{max}$:

In [4]:
K_INIT = 7
N_MIN = 1000
I_MAX = 10
S_MAX = 0.75 # La desviación estándar está normalizada
DC_MAX = 3 # El cluster sólo se divide cuando hay al menos estas variables con s>S_MAX
L_MIN = 0.5 # Las distancia están normalizadas
D_MAX = 0.5 # Distancia media máxima al centroide
P_MAX = 2

NUM_CLUSTERS = K_INIT # valor de k
iteration = 0

2) Seleccionar de manera arbitraria *k* puntos en el espacio de características como centros iniciales de los clusters (centroides o centros de masa).

In [5]:
# Inicializar los centroides
centroids = df.sample(n=NUM_CLUSTERS).reset_index(drop=True)
display(centroids)

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,1,POL,0,0,3,6.0,3,[18],9,7,4,7,6,2,3,2.0,0.2823,"[0, 2, 4, 5, 8]",3.0,"[1, 5, 7, 8]","[5, 7]",[],0.0,0.0,3.0,[3],[],"[10, 6]",[],"[0, 10]",[],"[11, 9]","[10, 17, 19, 6]",3.0,2.0,"[0, 2, 4, 8, 9]",[1],5.0,2.0,3.0,[4],4.0,4.0,3.0,[],,,2.0,1.0,1.0,0.0,2.0,1.0,9.0,1.0,2.0,4.0,2.0,0.0,7.0,0.0,4.0,4.0,0.0,1.0,0.0,7.0,4.0,3.0,3.0,[2],3.0,2.0,[6],1.0,0.0,0.0,2.0,"[1, 14, 17, 18, 5]","[1, 10, 6, 5, 2, 4, 8, 3, 7, 9]","[1, 4, 2, 10, 7, 9, 5, 6, 11, 8, 3]","[4, 1, 2, 3, 5]","[7, 6, 2, 4, 1, 5, 3]","[7, 3, 2, 5, 4, 6, 1]",
1,1,0,DEU,0,0,7,11.0,3,"[0, 12]",10,10,2,1,6,0,1,0.0,0.40388,"[3, 4, 7]",3.0,"[3, 6, 7, 8]","[0, 1, 3, 5, 6, 7]","[0, 1, 4]",0.0,1.0,0.0,"[14, 18, 2, 25, 31, 4, 5]","[17, 18]","[11, 14, 17, 18]",[14],"[14, 4, 9]",[],[6],"[11, 15, 19]",1.0,2.0,"[4, 9]",[1],2.0,2.0,3.0,"[2, 6]",4.0,0.0,3.0,"[1, 3]",1.0,3.0,3.0,1.0,0.0,3.0,2.0,2.0,10.0,1.0,2.0,3.0,2.0,2.0,10.0,1.0,3.0,3.0,3.0,3.0,4.0,5.0,2.0,0.0,3.0,[],3.0,5.0,[2],1.0,0.0,0.0,2.0,"[12, 18, 25, 31]","[5, 10, 3, 8, 6, 7, 4, 1, 9, 2]","[3, 2, 1, 4, 11, 5, 10, 6, 9, 8, 7]","[4, 1, 5, 2, 3]","[5, 4, 1, 3, 2, 7, 6]","[1, 3, 4, 5, 7, 6, 2]",
2,1,1,GBR,0,0,3,6.0,4,"[0, 11, 12, 20, 7]",2,9,3,7,2,1,3,3.0,0.41671,[8],3.0,"[1, 3, 4, 5, 8]","[5, 7, 8]","[0, 4, 5]",3.0,4.0,4.0,"[14, 18, 30, 5]","[10, 11, 14]","[10, 13, 18, 5]","[0, 14]","[10, 20]","[5, 6]","[5, 6]","[19, 6]",1.0,3.0,"[0, 2, 4, 8, 9]",[1],2.0,2.0,2.0,[1],1.0,1.0,2.0,[],,3.0,0.0,1.0,1.0,0.0,0.0,2.0,10.0,1.0,2.0,0.0,2.0,3.0,10.0,0.0,4.0,0.0,3.0,4.0,1.0,5.0,2.0,0.0,1.0,[],3.0,2.0,[6],1.0,0.0,0.0,4.0,"[1, 12, 13, 14, 17, 18, 25, 30, 31, 34, 5]","[5, 10, 2, 4, 7, 1, 9, 3, 8, 6]","[1, 2, 8, 11, 9, 5, 3, 6, 10, 7, 4]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]","[2, 1, 4, 3, 7, 5, 6]",
3,1,0,CZE,0,0,7,11.0,3,"[12, 4, 6]",2,9,3,3,0,2,3,7.0,0.0,[5],5.0,[8],"[3, 5, 6, 7, 8]",[],3.0,3.0,4.0,[],[19],[],[],[8],[],[],[18],3.0,2.0,"[0, 2]",[1],2.0,2.0,1.0,[],2.0,1.0,3.0,[],0.0,3.0,3.0,0.0,0.0,0.0,2.0,2.0,10.0,1.0,2.0,4.0,1.0,3.0,5.0,2.0,2.0,2.0,2.0,2.0,4.0,6.0,1.0,2.0,3.0,[],3.0,6.0,[6],2.0,0.0,1.0,,[],"[10, 7, 3, 1, 2, 4, 8, 5, 9, 6]","[1, 10, 8, 5, 11, 6, 4, 3, 9, 2, 7]","[2, 1, 5, 3, 4]","[4, 5, 6, 1, 2, 7, 3]","[1, 7, 4, 3, 2, 5, 6]",
4,1,1,TWN,2,0,3,,7,"[0, 11, 12, 3]",7,0,7,6,6,2,4,7.0,0.06714,"[1, 8]",0.0,"[1, 3, 7, 8]","[2, 3, 5, 7]","[1, 2, 3, 4, 5]",0.0,1.0,4.0,"[12, 19]","[14, 17, 18, 20]","[7, 8]","[0, 10, 12, 14, 22, 8]","[10, 14, 22, 8]","[3, 9]","[5, 9]","[10, 19, 5]",3.0,2.0,[],[1],0.0,1.0,0.0,[0],1.0,3.0,1.0,"[0, 3]",0.0,3.0,0.0,1.0,0.0,0.0,1.0,2.0,10.0,1.0,1.0,,0.0,,5.0,2.0,3.0,2.0,0.0,3.0,2.0,2.0,1.0,0.0,0.0,[0],1.0,6.0,[1],1.0,0.0,1.0,1.0,"[1, 14, 17, 18, 22, 25, 27, 29, 4, 5]","[8, 9, 2, 10, 3, 4, 5, 6, 7, 1]","[2, 3, 6, 9, 4, 8, 7, 11, 10, 5, 1]","[2, 1, 4, 5, 3]","[5, 1, 3, 4, 2, 7, 6]","[5, 4, 2, 1, 3, 7, 6]",
5,1,0,IND,0,0,1,6.0,3,"[0, 3]",0,0,0,2,2,0,3,,0.0,[],,[],[],[],,,,[],[],[],[],[],[],[],[],,,[],[],,,,[],,,,[],,,,,,,,,,,,,,,,,,,,,,,,,,[],,,[],,,,,[],[],[],[],[],[],
6,1,1,BLR,0,2,8,6.0,8,"[0, 11, 12, 18]",1,9,3,7,6,2,3,4.0,0.225,"[10, 4, 7, 8]",,"[1, 7, 8]","[5, 7, 8]",[],0.0,0.0,0.0,"[1, 10, 18, 29, 31, 32, 33]","[14, 15, 17, 18, 20, 6]","[0, 1, 11, 15, 17, 18, 2, 20, 3, 6, 7]","[12, 24, 8]","[12, 15, 25]",[],[9],[15],2.0,1.0,"[0, 8]",[1],2.0,2.0,3.0,"[2, 4, 5]",2.0,2.0,3.0,[],0.0,3.0,2.0,1.0,0.0,0.0,2.0,2.0,9.0,2.0,2.0,4.0,1.0,2.0,5.0,1.0,0.0,2.0,0.0,4.0,3.0,10.0,2.0,2.0,2.0,[],3.0,1.0,[6],1.0,1.0,2.0,2.0,"[1, 14, 18, 25, 29, 31, 5, 8]","[6, 8, 7, 5, 4, 9, 2, 1, 10, 3]","[3, 4, 1, 11, 6, 2, 10, 7, 8, 9, 5]","[5, 4, 1, 2, 3]","[4, 6, 1, 2, 5, 7, 3]","[1, 3, 4, 5, 2, 6, 7]",


3) Asignar cada punto del conjunto de datos al cluster donde la distancia del punto al centroide es menor.

In [6]:
def update_clusters():
    global NUM_CLUSTERS, centroids
    changed = False
    cluster_col_index = df.shape[1] - 1
    
    if TALK :
        print("Actualizando clusters")
    for index, row in df.iterrows():
        dists = []
        for i, r in centroids.iterrows():
            dists.append(distance_qual(row, r))
        cluster = np.argmin(dists)
        
        # Si hay cambio, realizarlo y levantar la bandera 'changed'
        if(pd.isnull(row['Cluster']) or row['Cluster'] != cluster):
            df.iloc[index, cluster_col_index] = cluster
            changed = True
            
    # Contabilizar los elementos en cada cluster   
    to_eliminate = []
    for i in range(NUM_CLUSTERS):
        members = df[df["Cluster"]==i].count()["Cluster"]
        if members < N_MIN:
            to_eliminate.append(i)
        if (TALK) : 
            print("El cluster ", i, " incluye ", members, "miembros.")
    if (TALK) : 
        print()

    if len(to_eliminate) > 0:
        if (TALK) : 
            print("Clusters a eliminar:", to_eliminate)
        
        # Eliminar los centroides seleccionados
        centroids.drop(to_eliminate, inplace=True)    
        centroids = centroids.reset_index(drop=True)
        
        # Reetiquetar los registros afectados
        eliminated = 0
        for i in to_eliminate:
            i_e = i - eliminated
            # Reetiquetar como Null los registros en cada cluster eliminado
            df.loc[df.Cluster == i_e, 'Cluster'] = np.nan
            # Recorrer las etiquetas para coincidir con los nuevos índices
            for cj in range(i_e + 1, NUM_CLUSTERS):
                df.loc[df.Cluster == cj, 'Cluster'] = cj - 1
            # Actualizar el número actual de centroides
            NUM_CLUSTERS -= 1
            eliminated += 1
            
#        if (TALK) : 
#            for i in range(NUM_CLUSTERS):
#                members = df[df["Cluster"]==i].count()["Cluster"]
#                print("El cluster ", i, " incluye ", members, "miembros.")

        changed = True
        
    if changed:
        if TALK : 
            faltantes = df[pd.isnull(df["Cluster"])].shape[0]
            if faltantes > 0:
                print("Faltan por clasificar", faltantes, "miembros.\n")
            else :
                print()
                
        # Reclasificar los registros afectados
        if centroids.shape[0] > 1:
            for index, row in df[pd.isnull(df["Cluster"])].iterrows():
                dists = []
                for i, r in centroids.iterrows():
                    dists.append(distance_qual(row, r))
                df.iloc[index, cluster_col_index] = np.argmin(dists)
                
        # Contabilizar los elementos en cada cluster   
        if TALK : 
            for i in range(NUM_CLUSTERS):
                members = df[df["Cluster"]==i].count()["Cluster"]
                print("El cluster ", i, " incluye ", members, "miembros.")
            print()
        
    return changed

# --------------------------
# Actualizar los clusters
KEEP_WALKING = update_clusters()

Actualizando clusters
El cluster  0  incluye  5803 miembros.
El cluster  1  incluye  13917 miembros.
El cluster  2  incluye  16881 miembros.
El cluster  3  incluye  14181 miembros.
El cluster  4  incluye  5816 miembros.
El cluster  5  incluye  31896 miembros.
El cluster  6  incluye  9949 miembros.


El cluster  0  incluye  5803 miembros.
El cluster  1  incluye  13917 miembros.
El cluster  2  incluye  16881 miembros.
El cluster  3  incluye  14181 miembros.
El cluster  4  incluye  5816 miembros.
El cluster  5  incluye  31896 miembros.
El cluster  6  incluye  9949 miembros.



4) Calcular los centroides a partir de los puntos en cada cluster. 

In [7]:
def update_centroids():
    global centroids
    
    for cl_j in range(NUM_CLUSTERS):        
        # Seleccionar registros en el cluster cl_j
        df_clusterj = df[df["Cluster"] == cl_j]
        
        centroids.loc[centroids.index[cl_j]] = get_centroide(df_clusterj).loc[0]        
    return

def get_centroide(data):
    # Copiar estructura de la tabla
    df2 = pd.DataFrame(data=None, columns=data.columns)
    #df2.append(pd.Series([np.nan]), ignore_index = True)

    col = 'ConvertedSalary'
    df2.at[0, col] = data[col].mean()

    # Moda en las columnas 'simples' (en var_str)
    mode = data[var_str].mode()
    for col in mode:
        df2.at[0, col] = mode[col].values[0]

    # Moda en las columnas con listas de longitud variable (en var_list)
    for col in var_list:
        mean_len = 0
        vars_list = []
        for index, row in data.iterrows():
            mean_len += len(row[col])
            vars_list = vars_list + row[col]
        mean_len /= data.shape[0]
        counter = Counter(vars_list)
        mean_list = []
        for v in counter.most_common(round(mean_len + 0.5)):
            mean_list.append(v[0])
        df2.at[0, col] = mean_list


    # Moda en las columnas con listas de longitud fija (en var_ranks)
    ranges = [11, 12, 6, 8, 8]
    # Para cada variable en var_list, obtener el número de componentes en el vector
    # y el nombre de la columna
    for i, col in zip(range(len(ranges)), var_ranks):
        # Inicializar una matriz (lista de listas, en realidad), con tantos renglones como 
        # componentes tiene el vector de la variable. Cada renglón tiene todos los valores 
        # utilizados en cada posición del vector
        vars = []
        for j in range(ranges[i] - 1):
            vars.append([])

        # Recorrer todos los elementos actualmente en el cluster para rellenar la matriz
        for index, row in data.iterrows():
            # Si el vector de la variable no está vacío...
            if len(row[col]) > 0:
                # Para cada componente en el vector...
                for j in range(len(row[col])):
                    # Si no es 0
                    if row[col][j] != '0':
                        # Agregarla al renglón actual en la matriz
                        vars[j].append(row[col][j])

        
        # Contabilizar ocurrencias de cada componente. Crear una matriz con el orden para
        # cada componente como renglones
        most_commons = []
        for j in range(ranges[i] - 1):
            counter = Counter(vars[j])
            #most_commons.append(counter.most_common(ranges[i] - 1))
            most_commons.append(counter.most_common())

        # Inicializar vector. Se escoge el valor más popular en la primera componente
        if len(most_commons) > 0 and len(most_commons[0]) > 0:
            vars_list = [most_commons[0][0][0]]
            # Para cada componente a partir de la segunda...
            for j in range(1, ranges[i] - 1):
                # Buscar la componente más común...
                for c in most_commons[j]:
                    # Siempre y cuando no esté utilizada...
                    if c[0] not in vars_list[:j]:
                        # Agregarla al vector y...
                        vars_list.append(c[0])
                        # Dejar de buscar.
                        break

        if len(vars_list) < ranges[i] - 1:
            for i in set(range(1, ranges[i])):
                if str(i) not in vars_list:
                    vars_list.append(str(i))
        df2.at[0, col] = vars_list

    return df2

# --------------------------
# Actualizar los centroides
update_centroids()

In [8]:
display(centroids)

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,3,"[0, 12, 11]",9,7,3,3,6,2,3,7,0.157121,"[8, 4, 5]",3,"[8, 7, 1]","[7, 5, 0, 3]",[0],0,0,1,"[18, 27, 14, 31, 3]","[14, 19]","[13, 14, 19]","[14, 22, 2]","[14, 2, 0]","[5, 1]","[5, 6]","[10, 18, 19]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 4]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,4,4,0,1,0,6,2,2,3,[2],3,2,[6],1,0,0,2,"[18, 14, 5, 31, 17, 3]","[9, 10, 6, 2, 1, 4, 5, 3, 8, 7]","[1, 2, 3, 10, 7, 4, 8, 5, 11, 9, 6]","[2, 1, 5, 3, 4]","[1, 6, 3, 2, 4, 7, 5]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,3,"[0, 12, 11]",7,7,3,3,6,2,1,7,0.149501,"[8, 4, 5]",3,"[8, 7, 5, 1]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 31, 1]","[14, 17, 19]","[17, 14, 13]","[14, 22, 2]","[14, 2, 0]","[5, 1]","[5, 6, 1]","[19, 15, 10, 17]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,3,2,2,10,2,2,4,2,2,5,2,3,3,3,3,4,5,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 17, 1, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 7, 4, 11, 6, 9, 8, 5]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 5, 6, 7, 3]",
2,1,1,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,2,1,3,7,0.157374,"[8, 4, 5]",3,"[8, 1, 7, 5]","[7, 5, 8, 0]","[0, 4]",0,1,1,"[18, 14, 5, 27, 31, 1]","[14, 17, 19]","[13, 17, 14]","[14, 0, 2]","[14, 0, 2, 18]","[5, 1]","[5, 6, 1]","[19, 17, 6, 15]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[1, 6]",1,1,0,"[3, 2]",2,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,3,2,3,4,3,5,2,0,3,[2],3,2,[6],1,0,0,4,"[18, 14, 5, 31, 1, 17, 27]","[9, 10, 8, 1, 2, 3, 6, 4, 7, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,USA,0,0,1,6,8,"[0, 12, 11]",7,0,3,3,0,2,3,7,0.138102,"[5, 8]",3,"[8, 7, 5]","[5, 7, 8]",[0],0,2,1,"[18, 3, 14, 31]","[19, 14]","[19, 14]","[22, 14]","[22, 14]",[0],"[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 0]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,2,2,2,2,2,6,2,2,3,[0],3,1,[6],1,0,1,2,"[14, 18, 5, 31, 3, 17]","[9, 7, 6, 1, 2, 4, 10, 3, 8, 5]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 7, 2, 3, 6, 4]","[1, 4, 2, 3, 6, 7, 5]",
4,1,1,IND,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.07372,"[8, 5]",0,"[8, 7, 1]","[5, 7, 3]","[4, 0]",0,0,1,"[18, 27, 14, 5, 31, 17]","[14, 19, 17]","[14, 13, 19]","[14, 22, 2]","[14, 2, 22, 8]","[5, 1]","[5, 1, 6]","[10, 18, 19, 15]",3,2,"[0, 9]","[1, 4]",2,1,0,"[0, 6]",1,1,0,"[3, 2]",0,3,3,1,0,0,2,2,10,5,2,4,1,2,5,2,3,4,4,3,3,6,1,0,3,[0],0,1,[6],0,0,1,2,"[14, 5, 18, 31, 17, 27, 25]","[8, 9, 6, 2, 1, 3, 10, 4, 7, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 6, 7]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
5,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,11,5,5,6,1,3,7,0.011953,[8],3,[8],[5],[0],0,0,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,4,4,4,6,2,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 9, 11, 4, 7, 6, 5]","[2, 1, 5, 4, 3]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 3, 6, 7, 5]",
6,1,1,USA,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.116436,"[8, 4, 7]",3,"[8, 1, 7]","[7, 5, 3, 8]",[0],0,0,1,"[18, 27, 14, 5, 31, 1]","[14, 17, 19]","[17, 14, 13]","[14, 2, 22]","[14, 2, 0, 18]","[5, 1]","[5, 6]","[15, 19, 17]",2,1,"[0, 9]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,0,2,0,4,3,6,2,2,3,[0],3,1,[6],1,0,1,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[5, 1, 4, 3, 2]","[1, 6, 2, 3, 4, 7, 5]","[1, 4, 2, 5, 6, 7, 3]",


In [9]:
deltas = []
delta = 0
def update_deltas():
    global deltas, delta, centroids
    deltas = [0] * NUM_CLUSTERS
    N = 0
    for j, rc in centroids.iterrows():
        n = 0
        for i, row in df[df["Cluster"]==j].iterrows():
            deltas[j] += distance_qual(row, rc)
            n += 1
        delta += deltas[j]
        deltas[j] /= n
        N += n
    delta /= N
    
    if TALK : 
        print("Las distancias medias en cada cluster son:\n", deltas)   
        print("\nLa distancia media promedio es:", delta)   
        
    return

update_deltas()

Las distancias medias en cada cluster son:
 [0.5570594336773347, 0.5449713348106269, 0.5484977880445203, 0.5753746402196187, 0.5881364006569479, 0.8460733536104171, 0.5624925610453452]

La distancia media promedio es: 0.6525477319458147


In [10]:
import math

def std_dev():
    # Inicializar vector de desviaciones estándar... los valores actuales son inserbibles
    std_vectors = centroids.copy()
    
    for c in range(NUM_CLUSTERS) :
        df_c = df[(df["Cluster"]==c)]
        
        # Para cada variable numérica...
        df_cj = df_c[pd.notnull(df_c['ConvertedSalary'])]

        s = math.sqrt(sum(abs(df_cj["ConvertedSalary"] - 
                              centroids.iloc[c]["ConvertedSalary"])) / (df_cj.shape[0] - 1))
        std_vectors.loc[c, "ConvertedSalary"] = s
        
        for col in var_str:
            diff = sum(df_cj[col] != centroids.iloc[c][col])
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_list:
            y = centroids.iloc[c][col]
            diff = 0
            for i, row in df_cj.iterrows():
                x = row[col]
                num_vars = len(x) + len(y)
                if num_vars > 0:
                    diff += (2*len(set(x + y)) - num_vars) / num_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
        
        for col in var_ranks:
            y = centroids.iloc[c][col]
            for i, row in df_cj.iterrows():
                diff = 0
                x = row[col]
                max_vars = max(len(x), len(y))
                if len(x) != 0 and len(y) != 0:
                    for v in range(len(x)):
                        if x[v] != y[v]:
                            diff += 1
                else:
                    diff += max_vars

                if diff != 0:
                    diff /= max_vars
            s = math.sqrt(diff / (df_cj.shape[0] - 1))
            std_vectors.loc[c, col] = s
         
    return std_vectors

display(std_dev())

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.400809,0.698099,0.848107,0.439573,0.379579,0.723759,0.595341,0.882843,0.764452,0.80867,0.752138,0.852485,0.828454,0.733341,0.528594,0.800107,0.796493,0.37727,0.774685,0.752756,0.649906,0.656805,0.934963,0.628943,0.791099,0.796259,0.805879,0.823352,0.877917,0.83052,0.856107,0.882064,0.876488,0.792074,0.6194,0.615632,0.684089,0.628587,0.725814,0.422073,0.587472,0.814676,0.810739,0.823157,0.849094,0.729885,0.884107,0.852813,0.8044,0.52877,0.604648,0.699431,0.638636,0.578048,0.64617,0.825528,0.289748,0.748167,0.54009,0.803242,0.784364,0.688973,0.855537,0.854121,0.863442,0.847228,0.871596,0.841054,0.734355,0.832489,0.550671,0.844283,0.771683,0.837394,0.580843,0.622099,0.50297,0.649331,0.734735,0.699068,0.0136437,0.0136437,0.0136437,0.0136437,0.0136437,
1,0.389543,0.566197,0.869895,0.46735,0.438802,0.710284,0.719224,0.893902,0.723834,0.864969,0.833927,0.865058,0.832033,0.758203,0.653081,0.849341,0.809711,0.374957,0.793579,0.763716,0.662515,0.647613,0.856446,0.623901,0.818082,0.811326,0.766121,0.777742,0.837668,0.811762,0.826899,0.854288,0.807911,0.799635,0.763716,0.625811,0.679536,0.615004,0.616453,0.412951,0.563539,0.791313,0.792411,0.82525,0.83752,0.680614,0.822868,0.779926,0.68169,0.492065,0.673395,0.716597,0.643641,0.422353,0.504808,0.823709,0.330408,0.821885,0.614514,0.789881,0.73871,0.714284,0.777701,0.823476,0.819211,0.82015,0.80523,0.813694,0.634849,0.735263,0.548235,0.90171,0.794255,0.851513,0.635338,0.682028,0.517237,0.645254,0.755048,0.647203,0.00877429,0.00877429,0.00877429,0.00877429,0.00812342,
2,0.326405,0.623615,0.84736,0.392183,0.392343,0.721982,0.610084,0.861011,0.718178,0.888423,0.865141,0.782571,0.784645,0.819872,0.727073,0.776435,0.789729,0.392417,0.75822,0.778405,0.627974,0.622463,0.828339,0.721549,0.870436,0.811286,0.760636,0.781345,0.844671,0.813565,0.818342,0.798197,0.781061,0.795448,0.797802,0.729906,0.635879,0.59267,0.524205,0.499093,0.737744,0.834041,0.716636,0.736981,0.848871,0.717148,0.874592,0.7745,0.769275,0.478629,0.528777,0.74218,0.730719,0.375903,0.475352,0.803425,0.269193,0.825079,0.541685,0.851556,0.786396,0.612844,0.852656,0.850968,0.844885,0.825382,0.868962,0.826139,0.627862,0.748472,0.621707,0.891454,0.802647,0.86043,0.562968,0.676561,0.590979,0.649737,0.742222,0.629028,0.00790718,0.00790718,0.00790718,0.00790718,0.00597726,
3,0.442231,0.51512,0.865248,0.426647,0.490328,0.751467,0.73861,0.890185,0.772605,0.898015,0.871096,0.782315,0.753205,0.836954,0.597927,0.844959,0.747978,0.383859,0.834228,0.857432,0.682884,0.69732,0.953592,0.734327,0.854556,0.824265,0.841572,0.783829,0.892663,0.817747,0.87554,0.90739,0.886552,0.766727,0.541834,0.651852,0.734343,0.677304,0.688138,0.532885,0.750699,0.851742,0.797991,0.777432,0.839891,0.751978,0.834745,0.782709,0.717261,0.618109,0.670872,0.694759,0.65802,0.465207,0.539771,0.836494,0.365878,0.78374,0.699394,0.824498,0.536554,0.658429,0.803707,0.767222,0.82929,0.860254,0.872553,0.801359,0.758498,0.757686,0.513175,0.903682,0.77931,0.876731,0.540936,0.81996,0.610091,0.668747,0.820382,0.691081,0.00877125,0.00877125,0.00877125,0.00877125,0.0081206,
4,0.362657,0.694614,0.874907,0.694212,0.628585,0.723256,0.740312,0.81995,0.752883,0.770763,0.718735,0.863872,0.855856,0.800172,0.573898,0.831008,0.753037,0.303893,0.876757,0.852475,0.700247,0.739318,0.879163,0.676407,0.844682,0.827751,0.762859,0.767103,0.83702,0.790639,0.822777,0.860635,0.814247,0.787212,0.625317,0.722741,0.797609,0.681574,0.812761,0.63756,0.63756,0.731699,0.721838,0.806318,0.821312,0.702234,0.858247,0.818813,0.758091,0.511602,0.731325,0.720935,0.79257,0.536312,0.576489,0.831008,0.560598,0.895842,0.78939,0.849849,0.52348,0.623676,0.793627,0.876183,0.843137,0.836036,0.872134,0.859765,0.7636,0.730688,0.749689,0.825843,0.835033,0.858898,0.800065,0.78525,0.538219,0.598371,0.828088,0.663759,0.0136488,0.0136488,0.0136488,0.0136488,0.0136488,
5,0.208966,0.670197,0.882492,0.63095,0.614357,0.707351,0.742829,0.769472,0.799917,0.770642,0.704227,0.61551,0.67436,0.929772,0.623524,0.90966,0.970268,0.14919,0.985613,0.979353,0.972876,0.989255,0.994204,0.976668,0.983221,0.98546,0.954193,0.941678,0.97003,0.96796,0.975445,0.970482,0.971236,0.974849,0.911933,0.928679,0.943438,0.894476,0.922861,0.898486,0.934748,0.961177,0.95214,0.955797,0.957781,0.96094,0.977099,0.974664,0.969398,0.953749,0.961476,0.969972,0.973818,0.958921,0.893651,0.948368,0.851291,0.95956,0.941287,0.945068,0.901935,0.912038,0.976098,0.980038,0.976707,0.97757,0.978648,0.975018,0.954493,0.962592,0.949156,0.976871,0.964462,0.996351,0.997294,0.994657,0.993268,0.985557,0.994233,0.918288,0.00619174,0.00619174,0.00619174,0.00619174,0.00619174,
6,0.361562,0.657543,0.865228,0.502673,0.711345,0.739561,0.65968,0.724246,0.737021,0.884907,0.853824,0.809681,0.818726,0.815545,0.571726,0.798908,0.787438,0.364709,0.815847,0.900618,0.633358,0.647169,0.932625,0.639591,0.807605,0.799111,0.771617,0.787816,0.842476,0.828576,0.831641,0.854066,0.835547,0.826738,0.754638,0.714535,0.72542,0.622229,0.631582,0.431611,0.575785,0.81787,0.798026,0.818462,0.845795,0.725299,0.837042,0.781155,0.779142,0.506751,0.692207,0.672365,0.641788,0.461563,0.655398,0.761994,0.313251,0.743724,0.691581,0.755857,0.532492,0.719294,0.846179,0.806128,0.865103,0.82885,0.811685,0.850711,0.656966,0.752051,0.674696,0.909211,0.773421,0.807538,0.586939,0.743068,0.648503,0.707298,0.751042,0.648338,0.0104071,0.0104071,0.0104071,0.0104071,0.00879558,


In [11]:
def divide_clusters():
    global NUM_CLUSTERS, centroids

    if TALK :
        display(centroids)
    
    # Cálculo de desviaciones estandar
    sigma_vect = std_dev()   
    if TALK :
        display(sigma_vect)
    
    candidates = []
    for c, s_row in sigma_vect.iterrows():
        causes = 0
        for col in s_row:
            if col > S_MAX :
                causes += 1
                if causes > DC_MAX :
                    candidates.append(c)
                    break # Ya encontramos un atributo con sigma elevada... o varios!

    if TALK :
        print("Posibles clusters a dividir:", candidates)
    
    divided = False
    to_eliminate = []
    for c in candidates:
        std = sigma_vect.iloc[c].mean()
        if std < S_MAX :
            members = df[df["Cluster"]==c].count()["Cluster"]
            cond = NUM_CLUSTERS < K_INIT/2 or (deltas[c] > delta and members > 2 * N_MIN)
            if cond: 
                if (deltas[c] <= D_MAX or NUM_CLUSTERS < K_INIT / 2) :
                    d = 0
                    # Obtener dos puntos "suficientemente separados", no es el óptimo, 
                    # pero son buenos candidatos a buen costo
                    count = 0
                    while (d < deltas[c] and count < 5000) : 
                        s1 = df[df["Cluster"]==c].sample(n=2)
                        d = distance_qual(s1.iloc[0], s1.iloc[1])
                        count += 1
                    if count < 5000:
                        to_eliminate.append(c)
                        centroids = centroids.append(s1)
                        NUM_CLUSTERS += 1
                else : 
                    # Si la distancia media en el cluster es mayor a D_MAX, se elimina 
                    # (no se divide)
                    to_eliminate.append(c)
                    NUM_CLUSTERS -= 1
                
            
    if len(to_eliminate) > 0 :
        if TALK : 
            print("Clusters a eliminar:", to_eliminate)
            print("")
        centroids.drop(to_eliminate, inplace=True)
        centroids = centroids.reset_index(drop=True)
        update_clusters()
        update_centroids()
        if TALK : 
            display(centroids)
            print("")
            
    return 

divide_clusters()    

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,3,"[0, 12, 11]",9,7,3,3,6,2,3,7,0.157121,"[8, 4, 5]",3,"[8, 7, 1]","[7, 5, 0, 3]",[0],0,0,1,"[18, 27, 14, 31, 3]","[14, 19]","[13, 14, 19]","[14, 22, 2]","[14, 2, 0]","[5, 1]","[5, 6]","[10, 18, 19]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 4]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,4,4,0,1,0,6,2,2,3,[2],3,2,[6],1,0,0,2,"[18, 14, 5, 31, 17, 3]","[9, 10, 6, 2, 1, 4, 5, 3, 8, 7]","[1, 2, 3, 10, 7, 4, 8, 5, 11, 9, 6]","[2, 1, 5, 3, 4]","[1, 6, 3, 2, 4, 7, 5]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,3,"[0, 12, 11]",7,7,3,3,6,2,1,7,0.149501,"[8, 4, 5]",3,"[8, 7, 5, 1]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 31, 1]","[14, 17, 19]","[17, 14, 13]","[14, 22, 2]","[14, 2, 0]","[5, 1]","[5, 6, 1]","[19, 15, 10, 17]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,3,2,2,10,2,2,4,2,2,5,2,3,3,3,3,4,5,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 17, 1, 27]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 7, 4, 11, 6, 9, 8, 5]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 5, 6, 7, 3]",
2,1,1,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,2,1,3,7,0.157374,"[8, 4, 5]",3,"[8, 1, 7, 5]","[7, 5, 8, 0]","[0, 4]",0,1,1,"[18, 14, 5, 27, 31, 1]","[14, 17, 19]","[13, 17, 14]","[14, 0, 2]","[14, 0, 2, 18]","[5, 1]","[5, 6, 1]","[19, 17, 6, 15]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[1, 6]",1,1,0,"[3, 2]",2,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,3,2,3,4,3,5,2,0,3,[2],3,2,[6],1,0,0,4,"[18, 14, 5, 31, 1, 17, 27]","[9, 10, 8, 1, 2, 3, 6, 4, 7, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,USA,0,0,1,6,8,"[0, 12, 11]",7,0,3,3,0,2,3,7,0.138102,"[5, 8]",3,"[8, 7, 5]","[5, 7, 8]",[0],0,2,1,"[18, 3, 14, 31]","[19, 14]","[19, 14]","[22, 14]","[22, 14]",[0],"[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 0]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,2,2,2,2,2,6,2,2,3,[0],3,1,[6],1,0,1,2,"[14, 18, 5, 31, 3, 17]","[9, 7, 6, 1, 2, 4, 10, 3, 8, 5]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 7, 2, 3, 6, 4]","[1, 4, 2, 3, 6, 7, 5]",
4,1,1,IND,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.07372,"[8, 5]",0,"[8, 7, 1]","[5, 7, 3]","[4, 0]",0,0,1,"[18, 27, 14, 5, 31, 17]","[14, 19, 17]","[14, 13, 19]","[14, 22, 2]","[14, 2, 22, 8]","[5, 1]","[5, 1, 6]","[10, 18, 19, 15]",3,2,"[0, 9]","[1, 4]",2,1,0,"[0, 6]",1,1,0,"[3, 2]",0,3,3,1,0,0,2,2,10,5,2,4,1,2,5,2,3,4,4,3,3,6,1,0,3,[0],0,1,[6],0,0,1,2,"[14, 5, 18, 31, 17, 27, 25]","[8, 9, 6, 2, 1, 3, 10, 4, 7, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 6, 7]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
5,1,0,IND,0,0,1,6,8,"[0, 12, 11]",7,11,5,5,6,1,3,7,0.011953,[8],3,[8],[5],[0],0,0,2,"[18, 27]",[14],[14],[14],[14],[5],[5],[10],3,2,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,4,4,4,6,2,0,3,[2],3,1,[6],1,0,1,0,"[14, 18, 5]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 9, 11, 4, 7, 6, 5]","[2, 1, 5, 4, 3]","[1, 4, 7, 2, 3, 6, 5]","[1, 4, 2, 3, 6, 7, 5]",
6,1,1,USA,0,0,1,6,8,"[0, 12, 11, 15]",7,0,3,3,6,2,3,7,0.116436,"[8, 4, 7]",3,"[8, 1, 7]","[7, 5, 3, 8]",[0],0,0,1,"[18, 27, 14, 5, 31, 1]","[14, 17, 19]","[17, 14, 13]","[14, 2, 22]","[14, 2, 0, 18]","[5, 1]","[5, 6]","[15, 19, 17]",2,1,"[0, 9]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",0,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,0,2,0,4,3,6,2,2,3,[0],3,1,[6],1,0,1,2,"[18, 14, 5, 31, 1, 17, 27]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[5, 1, 4, 3, 2]","[1, 6, 2, 3, 4, 7, 5]","[1, 4, 2, 5, 6, 7, 3]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.400809,0.698099,0.848107,0.439573,0.379579,0.723759,0.595341,0.882843,0.764452,0.80867,0.752138,0.852485,0.828454,0.733341,0.528594,0.800107,0.796493,0.37727,0.774685,0.752756,0.649906,0.656805,0.934963,0.628943,0.791099,0.796259,0.805879,0.823352,0.877917,0.83052,0.856107,0.882064,0.876488,0.792074,0.6194,0.615632,0.684089,0.628587,0.725814,0.422073,0.587472,0.814676,0.810739,0.823157,0.849094,0.729885,0.884107,0.852813,0.8044,0.52877,0.604648,0.699431,0.638636,0.578048,0.64617,0.825528,0.289748,0.748167,0.54009,0.803242,0.784364,0.688973,0.855537,0.854121,0.863442,0.847228,0.871596,0.841054,0.734355,0.832489,0.550671,0.844283,0.771683,0.837394,0.580843,0.622099,0.50297,0.649331,0.734735,0.699068,0.0136437,0.0136437,0.0136437,0.0136437,0.0136437,
1,0.389543,0.566197,0.869895,0.46735,0.438802,0.710284,0.719224,0.893902,0.723834,0.864969,0.833927,0.865058,0.832033,0.758203,0.653081,0.849341,0.809711,0.374957,0.793579,0.763716,0.662515,0.647613,0.856446,0.623901,0.818082,0.811326,0.766121,0.777742,0.837668,0.811762,0.826899,0.854288,0.807911,0.799635,0.763716,0.625811,0.679536,0.615004,0.616453,0.412951,0.563539,0.791313,0.792411,0.82525,0.83752,0.680614,0.822868,0.779926,0.68169,0.492065,0.673395,0.716597,0.643641,0.422353,0.504808,0.823709,0.330408,0.821885,0.614514,0.789881,0.73871,0.714284,0.777701,0.823476,0.819211,0.82015,0.80523,0.813694,0.634849,0.735263,0.548235,0.90171,0.794255,0.851513,0.635338,0.682028,0.517237,0.645254,0.755048,0.647203,0.00877429,0.00877429,0.00877429,0.00877429,0.00812342,
2,0.326405,0.623615,0.84736,0.392183,0.392343,0.721982,0.610084,0.861011,0.718178,0.888423,0.865141,0.782571,0.784645,0.819872,0.727073,0.776435,0.789729,0.392417,0.75822,0.778405,0.627974,0.622463,0.828339,0.721549,0.870436,0.811286,0.760636,0.781345,0.844671,0.813565,0.818342,0.798197,0.781061,0.795448,0.797802,0.729906,0.635879,0.59267,0.524205,0.499093,0.737744,0.834041,0.716636,0.736981,0.848871,0.717148,0.874592,0.7745,0.769275,0.478629,0.528777,0.74218,0.730719,0.375903,0.475352,0.803425,0.269193,0.825079,0.541685,0.851556,0.786396,0.612844,0.852656,0.850968,0.844885,0.825382,0.868962,0.826139,0.627862,0.748472,0.621707,0.891454,0.802647,0.86043,0.562968,0.676561,0.590979,0.649737,0.742222,0.629028,0.00790718,0.00790718,0.00790718,0.00790718,0.00597726,
3,0.442231,0.51512,0.865248,0.426647,0.490328,0.751467,0.73861,0.890185,0.772605,0.898015,0.871096,0.782315,0.753205,0.836954,0.597927,0.844959,0.747978,0.383859,0.834228,0.857432,0.682884,0.69732,0.953592,0.734327,0.854556,0.824265,0.841572,0.783829,0.892663,0.817747,0.87554,0.90739,0.886552,0.766727,0.541834,0.651852,0.734343,0.677304,0.688138,0.532885,0.750699,0.851742,0.797991,0.777432,0.839891,0.751978,0.834745,0.782709,0.717261,0.618109,0.670872,0.694759,0.65802,0.465207,0.539771,0.836494,0.365878,0.78374,0.699394,0.824498,0.536554,0.658429,0.803707,0.767222,0.82929,0.860254,0.872553,0.801359,0.758498,0.757686,0.513175,0.903682,0.77931,0.876731,0.540936,0.81996,0.610091,0.668747,0.820382,0.691081,0.00877125,0.00877125,0.00877125,0.00877125,0.0081206,
4,0.362657,0.694614,0.874907,0.694212,0.628585,0.723256,0.740312,0.81995,0.752883,0.770763,0.718735,0.863872,0.855856,0.800172,0.573898,0.831008,0.753037,0.303893,0.876757,0.852475,0.700247,0.739318,0.879163,0.676407,0.844682,0.827751,0.762859,0.767103,0.83702,0.790639,0.822777,0.860635,0.814247,0.787212,0.625317,0.722741,0.797609,0.681574,0.812761,0.63756,0.63756,0.731699,0.721838,0.806318,0.821312,0.702234,0.858247,0.818813,0.758091,0.511602,0.731325,0.720935,0.79257,0.536312,0.576489,0.831008,0.560598,0.895842,0.78939,0.849849,0.52348,0.623676,0.793627,0.876183,0.843137,0.836036,0.872134,0.859765,0.7636,0.730688,0.749689,0.825843,0.835033,0.858898,0.800065,0.78525,0.538219,0.598371,0.828088,0.663759,0.0136488,0.0136488,0.0136488,0.0136488,0.0136488,
5,0.208966,0.670197,0.882492,0.63095,0.614357,0.707351,0.742829,0.769472,0.799917,0.770642,0.704227,0.61551,0.67436,0.929772,0.623524,0.90966,0.970268,0.14919,0.985613,0.979353,0.972876,0.989255,0.994204,0.976668,0.983221,0.98546,0.954193,0.941678,0.97003,0.96796,0.975445,0.970482,0.971236,0.974849,0.911933,0.928679,0.943438,0.894476,0.922861,0.898486,0.934748,0.961177,0.95214,0.955797,0.957781,0.96094,0.977099,0.974664,0.969398,0.953749,0.961476,0.969972,0.973818,0.958921,0.893651,0.948368,0.851291,0.95956,0.941287,0.945068,0.901935,0.912038,0.976098,0.980038,0.976707,0.97757,0.978648,0.975018,0.954493,0.962592,0.949156,0.976871,0.964462,0.996351,0.997294,0.994657,0.993268,0.985557,0.994233,0.918288,0.00619174,0.00619174,0.00619174,0.00619174,0.00619174,
6,0.361562,0.657543,0.865228,0.502673,0.711345,0.739561,0.65968,0.724246,0.737021,0.884907,0.853824,0.809681,0.818726,0.815545,0.571726,0.798908,0.787438,0.364709,0.815847,0.900618,0.633358,0.647169,0.932625,0.639591,0.807605,0.799111,0.771617,0.787816,0.842476,0.828576,0.831641,0.854066,0.835547,0.826738,0.754638,0.714535,0.72542,0.622229,0.631582,0.431611,0.575785,0.81787,0.798026,0.818462,0.845795,0.725299,0.837042,0.781155,0.779142,0.506751,0.692207,0.672365,0.641788,0.461563,0.655398,0.761994,0.313251,0.743724,0.691581,0.755857,0.532492,0.719294,0.846179,0.806128,0.865103,0.82885,0.811685,0.850711,0.656966,0.752051,0.674696,0.909211,0.773421,0.807538,0.586939,0.743068,0.648503,0.707298,0.751042,0.648338,0.0104071,0.0104071,0.0104071,0.0104071,0.00879558,


Posibles clusters a dividir: [0, 1, 2, 3, 4, 5, 6]


In [None]:
def mix_clusters():
    global centroids, NUM_CLUSTERS
    
    # Matriz triangular superior de distancias entre centroides
    dist_lists = []
    for i, rc_i in centroids.iterrows():
        dist_lists.append([])
        for j, rc_j in centroids.iterrows():
            if j <= i:
                dist_lists[i].append(LARGER_DISTANCE)
            else:
                dist_lists[i].append(distance_qual(rc_i, rc_j))
    dist_matrix = np.array(dist_lists)
    
    to_eliminate = []
    # to_eliminate contendrá la mitad de los clusters unidos...
    while (dist_matrix.min() < LARGER_DISTANCE and len(to_eliminate) < P_MAX/2) :
        dist_min = dist_matrix.min()
        idx = (dist_matrix==dist_min).argmax()
        z1 = idx // len(centroids)
        z2 = idx % len(centroids)
        
        if dist_min < L_MIN:
            if TALK:
                print("Unificando clusters {} y {}".format(z1, z2))
                for i in range(NUM_CLUSTERS):
                    members = df[df["Cluster"]==i].count()["Cluster"]
                    print("El cluster ", i, " incluye ", members, "miembros.")
                print()

            # Modificar z1 para contener el centroide entre z1 y z2
            centroids.iloc[z1] = get_centroide(centroids.iloc[[z1, z2]]).loc[0]
            # Marcar puntos en z1 y z2 para reclasificar
            df.loc[df.Cluster == z1, 'Cluster'] = np.nan
            df.loc[df.Cluster == z2, 'Cluster'] = np.nan
            
            # Marcar z2 para eliminación
            to_eliminate.append(z2)
        
        dist_matrix[z1][z2] = LARGER_DISTANCE
        
    if len(to_eliminate) > 0:
        centroids.drop(to_eliminate, inplace=True)
        centroids = centroids.reset_index(drop=True)
        
        # Reetiquetar los registros afectados
        eliminated = 0
        for i in to_eliminate:
            i_e = i - eliminated
            # Recorrer las etiquetas para coincidir con los nuevos índices
            for cj in range(i_e + 1, NUM_CLUSTERS):
                df.loc[df.Cluster == cj, 'Cluster'] = cj - 1
            # Actualizar el número actual de centroides
            NUM_CLUSTERS -= 1
            eliminated += 1
            
        cluster_col_index = df.shape[1] - 1
        for index, row in df[pd.isnull(df["Cluster"])].iterrows():
            dists = []
            for i, r in centroids.iterrows():
                dists.append(distance_qual(row, r))
            df.iloc[index, cluster_col_index] = np.argmin(dists)
        update_centroids()
            
        if (TALK) : 
            # Contabilizar los elementos en cada cluster   
            for i in range(NUM_CLUSTERS):
                members = df[df["Cluster"]==i].count()["Cluster"]
                print("El cluster ", i, " incluye ", members, "miembros")
            print()

    return

#mix_clusters()

In [None]:
# Reproducido aquí para facilitar la ejecución
#iteration +=1 #usar si se está probando dividir/unir demostrativo

I_MAX_INT = 5 # Iteraciones permitidas en cada ciclo k-means

while iteration < I_MAX:
    if (iteration % 2 == 1 or NUM_CLUSTERS <= K_INIT / 2) :
        update_deltas()
        divide_clusters()
    elif (iteration % 2 == 0 or NUM_CLUSTERS > 2 * K_INIT) :
        mix_clusters()
        
    step = 0
    KEEP_WALKING = True
    while KEEP_WALKING and step < I_MAX_INT :
        KEEP_WALKING = update_clusters()
        update_centroids()
            
    iteration += 1
    
if TALK : 
    print ("No más cambios.")

Unificando clusters 3 y 6
El cluster  0  incluye  5803 miembros.
El cluster  1  incluye  13917 miembros.
El cluster  2  incluye  16881 miembros.
El cluster  3  incluye  14181 miembros.
El cluster  4  incluye  5816 miembros.
El cluster  5  incluye  31896 miembros.
El cluster  6  incluye  9949 miembros.

El cluster  0  incluye  9203 miembros
El cluster  1  incluye  18000 miembros
El cluster  2  incluye  20120 miembros
El cluster  3  incluye  7768 miembros
El cluster  4  incluye  9905 miembros
El cluster  5  incluye  33447 miembros

Actualizando clusters
El cluster  0  incluye  13242 miembros.
El cluster  1  incluye  18315 miembros.
El cluster  2  incluye  16915 miembros.
El cluster  3  incluye  16023 miembros.
El cluster  4  incluye  15719 miembros.
El cluster  5  incluye  18229 miembros.


El cluster  0  incluye  13242 miembros.
El cluster  1  incluye  18315 miembros.
El cluster  2  incluye  16915 miembros.
El cluster  3  incluye  16023 miembros.
El cluster  4  incluye  15719 miembros.


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,3,"[0, 12, 11]",9,7,3,3,6,2,3,7,0.158553,"[5, 4, 8]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,0,1,"[18, 3, 31, 14, 27]","[19, 14, 17]","[19, 13, 14]","[22, 14]","[14, 22, 2]","[0, 5]","[0, 5]","[18, 10, 19]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 4]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,4,2,0,1,0,6,2,2,3,[2],3,2,[6],1,0,0,2,"[18, 14, 31, 5, 3, 17]","[9, 10, 6, 2, 1, 4, 5, 3, 8, 7]","[1, 2, 3, 10, 9, 4, 7, 5, 11, 8, 6]","[2, 1, 5, 3, 4]","[1, 6, 3, 2, 4, 7, 5]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,3,"[0, 12, 11]",7,7,3,3,6,2,1,7,0.149083,"[8, 5, 4]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,1,1,"[18, 14, 27, 5, 31]","[14, 19, 17]","[13, 14, 17]","[14, 22, 2]","[14, 2, 0]","[5, 1]","[5, 6]","[10, 18, 19]",3,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,3,2,2,10,2,2,4,2,2,5,2,3,3,3,3,4,5,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 17, 1]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,2,1,3,7,0.141658,"[8, 4, 7]",3,"[8, 1, 7, 3]","[7, 5, 0, 3]","[0, 4]",0,1,1,"[18, 27, 14, 1, 5]","[14, 17, 13]","[17, 18, 13]","[14, 0, 15]","[14, 0, 18]","[5, 6]","[6, 5]","[17, 19, 6]",2,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",2,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,3,2,3,4,3,5,2,0,3,[2],3,2,[6],1,0,0,4,"[18, 14, 5, 1, 31, 27, 17]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[4, 1, 5, 3, 2]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,USA,0,0,1,6,8,"[0, 12, 11]",7,0,3,3,0,2,3,7,0.105104,"[8, 5]",3,"[8, 7]","[7, 5, 8]",[0],0,0,1,"[18, 14, 5, 27]","[14, 19]","[14, 17]","[14, 22]","[14, 22]",[5],"[5, 0]","[18, 10, 19]",3,1,"[0, 9]","[1, 4]",2,2,3,"[6, 2]",1,1,0,[3],0,3,3,1,1,0,2,2,10,2,2,4,1,2,5,2,2,2,2,2,2,6,2,2,3,[0],3,1,[6],1,0,1,2,"[18, 14, 5, 31, 17]","[9, 8, 7, 1, 2, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 9, 4, 8, 5, 11, 7, 6]","[2, 1, 5, 3, 4]","[1, 5, 7, 2, 3, 6, 4]","[1, 4, 2, 3, 6, 7, 5]",
4,1,1,IND,0,0,1,6,8,"[0, 12, 11, 19]",7,0,5,3,6,2,3,7,0.054096,"[8, 5]",0,"[8, 7, 1]","[5, 7]",[4],0,0,1,"[18, 27, 14, 5, 17]","[14, 19]","[14, 13, 19]","[14, 2, 22]","[14, 2, 8]","[5, 1]","[5, 6]","[10, 15, 19]",3,2,"[0, 9]","[1, 4]",2,1,0,[0],1,1,0,"[2, 3]",0,3,3,1,0,0,2,2,10,5,2,4,1,2,5,2,3,4,4,3,3,6,1,0,3,[0],0,1,[6],0,0,1,2,"[14, 18, 5, 31, 17, 25]","[8, 9, 7, 2, 3, 4, 10, 1, 6, 5]","[1, 2, 3, 10, 6, 9, 11, 4, 8, 5, 7]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 4, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
5,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,6,1,5,7,0.009008,[8],3,[8],[5],[0],0,0,2,"[27, 18, 14]",[14],[14],[14],"[14, 2]",[5],[5],"[10, 15]",3,1,[0],[1],2,2,3,[6],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,2,2,5,2,3,4,4,4,4,6,2,0,3,[2],3,1,[6],1,0,1,0,"[14, 5, 18]","[9, 8, 6, 1, 2, 3, 10, 4, 7, 5]","[1, 11, 2, 10, 8, 5, 7, 4, 9, 6, 3]","[2, 1, 5, 4, 3]","[1, 4, 7, 2, 5, 6, 3]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.396858,0.560602,0.850274,0.421038,0.368301,0.719738,0.635412,0.870297,0.712506,0.814835,0.816643,0.787411,0.786398,0.752667,0.572774,0.791048,0.810812,0.386517,0.787636,0.797124,0.694314,0.703828,0.926892,0.705187,0.823692,0.822627,0.791089,0.78475,0.860141,0.816337,0.84885,0.842636,0.838505,0.754552,0.602229,0.650788,0.695422,0.667558,0.662507,0.509792,0.681669,0.848273,0.79276,0.808645,0.861826,0.747608,0.849148,0.81209,0.77942,0.575967,0.693269,0.705244,0.707107,0.527626,0.563015,0.772845,0.439298,0.826592,0.612665,0.846656,0.789585,0.687146,0.865887,0.831689,0.846844,0.862011,0.8669,0.801016,0.687668,0.740059,0.641283,0.888291,0.798423,0.865473,0.615379,0.721619,0.650604,0.688942,0.798024,0.680508,0.00893,0.00893,0.00893,0.00893,0.00584605,
1,0.403707,0.559793,0.860004,0.422121,0.362378,0.696919,0.658478,0.875019,0.715116,0.848109,0.80067,0.789223,0.778489,0.756441,0.559292,0.811955,0.80911,0.377945,0.786656,0.791746,0.689911,0.710991,0.929286,0.702176,0.831523,0.823804,0.803359,0.793265,0.869082,0.834073,0.859898,0.854953,0.847954,0.806926,0.680998,0.658146,0.70198,0.663469,0.668529,0.518526,0.685013,0.847704,0.776445,0.79712,0.844612,0.740547,0.850751,0.814369,0.774194,0.57981,0.643834,0.698616,0.712878,0.503958,0.588876,0.755617,0.431034,0.807954,0.691307,0.806873,0.723337,0.710689,0.80273,0.815402,0.810226,0.815325,0.80432,0.798955,0.685195,0.754502,0.631815,0.874599,0.81712,0.838018,0.644878,0.717541,0.650431,0.69468,0.788986,0.691396,0.0078946,0.0078946,0.0078946,0.0078946,0.00516823,
2,0.29542,0.507255,0.823319,0.402088,0.418793,0.728639,0.660306,0.846693,0.726629,0.851051,0.839953,0.803128,0.783644,0.802828,0.748148,0.782915,0.797453,0.391941,0.768987,0.817785,0.655421,0.669912,0.854791,0.720379,0.841491,0.826418,0.810006,0.799602,0.850463,0.813063,0.840324,0.816816,0.824422,0.818757,0.688489,0.698334,0.674467,0.629819,0.564125,0.505235,0.668768,0.849701,0.797717,0.810285,0.870407,0.742012,0.847935,0.796472,0.782684,0.584803,0.638871,0.753515,0.714721,0.501051,0.598627,0.822515,0.425702,0.816791,0.613415,0.834529,0.796321,0.680846,0.836437,0.814542,0.853062,0.836437,0.835934,0.84267,0.697946,0.77597,0.648959,0.880493,0.829831,0.872167,0.621148,0.729423,0.668319,0.705105,0.775078,0.676925,0.00775543,0.00775543,0.00775543,0.00775543,0.00775543,
3,0.386857,0.557463,0.872103,0.481688,0.587392,0.752559,0.708358,0.828861,0.751038,0.860982,0.769658,0.8043,0.808743,0.80563,0.646215,0.799679,0.807112,0.359537,0.871416,0.867558,0.745845,0.752704,0.943814,0.770567,0.877751,0.850539,0.853855,0.835009,0.908541,0.830153,0.871362,0.915505,0.90029,0.838168,0.721492,0.792956,0.795004,0.721545,0.751848,0.550101,0.723373,0.867629,0.843147,0.856233,0.882944,0.793122,0.843245,0.825675,0.80859,0.666914,0.701405,0.760182,0.747342,0.600977,0.67355,0.808233,0.5373,0.835295,0.703574,0.804966,0.569088,0.794616,0.770674,0.753981,0.791396,0.812401,0.808029,0.8502,0.740478,0.782394,0.659275,0.906229,0.796738,0.874319,0.668155,0.824277,0.684888,0.704276,0.830747,0.756367,0.00907555,0.00907555,0.00907555,0.00907555,0.00907555,
4,0.325158,0.629333,0.842011,0.639048,0.666946,0.717916,0.705141,0.7706,0.744539,0.789998,0.699074,0.818373,0.855143,0.818745,0.628547,0.786959,0.834252,0.274834,0.906831,0.88547,0.773601,0.828093,0.949261,0.790287,0.859402,0.86755,0.820412,0.809558,0.867294,0.845103,0.874651,0.880077,0.86792,0.852653,0.738853,0.780944,0.809052,0.741236,0.785169,0.69804,0.69804,0.69804,0.814367,0.833796,0.855276,0.792486,0.884654,0.863772,0.799235,0.677357,0.754087,0.776254,0.8102,0.69902,0.68556,0.835664,0.535852,0.875449,0.786379,0.832015,0.666946,0.70298,0.849968,0.87597,0.84849,0.869433,0.874623,0.874362,0.80308,0.807001,0.757659,0.893827,0.837982,0.883406,0.845998,0.821018,0.716112,0.743216,0.851488,0.733439,0.00872141,0.00872141,0.00872141,0.00872141,0.00872141,
5,0.224497,0.618556,0.87242,0.677417,0.705856,0.723644,0.7963,0.663041,0.831809,0.722974,0.479214,0.347577,0.42713,0.964093,0.456363,0.852222,0.958548,0.129067,0.986536,0.979348,0.936401,0.966141,0.984689,0.938107,0.95554,0.945734,0.920377,0.912442,0.949422,0.94837,0.94294,0.961487,0.958479,0.940183,0.877373,0.854727,0.93297,0.846538,0.919385,0.820942,0.878412,0.945986,0.925319,0.930327,0.93683,0.928353,0.943048,0.950095,0.928148,0.896846,0.917149,0.92575,0.934667,0.897609,0.839144,0.92501,0.776288,0.937135,0.906489,0.909941,0.854193,0.876853,0.945674,0.943229,0.934637,0.933996,0.938988,0.949494,0.927595,0.927565,0.908248,0.960956,0.927257,0.964211,0.949764,0.954346,0.908311,0.925226,0.948082,0.89149,0.00755067,0.00755067,0.00755067,0.00755067,0.00755067,


Posibles clusters a dividir: [0, 1, 2, 3, 4, 5]
Clusters a eliminar: [3, 4]

Actualizando clusters
El cluster  0  incluye  22530 miembros.
El cluster  1  incluye  27204 miembros.
El cluster  2  incluye  22680 miembros.
El cluster  3  incluye  26029 miembros.


El cluster  0  incluye  22530 miembros.
El cluster  1  incluye  27204 miembros.
El cluster  2  incluye  22680 miembros.
El cluster  3  incluye  26029 miembros.



Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,3,"[0, 12, 11, 6]",9,0,3,3,6,2,3,7,0.14231,"[5, 8]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,0,1,"[18, 3, 31, 14, 27]","[19, 14]","[19, 14]","[22, 14]","[14, 22, 2]","[0, 5]","[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 4]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,2,2,0,1,0,6,2,2,3,[2],3,2,[6],1,0,0,2,"[18, 14, 31, 5, 3, 17]","[9, 7, 6, 2, 1, 4, 10, 3, 8, 5]","[1, 2, 3, 10, 9, 4, 7, 5, 11, 8, 6]","[2, 1, 5, 3, 4]","[1, 6, 3, 2, 4, 7, 5]","[1, 3, 2, 4, 6, 7, 5]",
1,1,0,USA,0,0,1,6,3,"[0, 12, 11]",7,0,3,3,6,2,1,7,0.130344,"[8, 5]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,1,1,"[18, 14, 27, 5, 31]","[14, 19, 17]","[13, 14, 17]","[14, 22]","[14, 2, 0]","[5, 1]","[5, 6]","[10, 18, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,3,2,2,10,2,2,4,2,2,5,2,3,3,3,3,4,5,2,0,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 17, 1]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 8, 4, 11, 5, 9, 7, 6]","[2, 1, 5, 3, 4]","[1, 4, 7, 2, 3, 6, 5]","[1, 5, 2, 4, 6, 7, 3]",
2,1,1,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,2,2,3,7,0.130506,"[8, 4, 7]",3,"[8, 1, 7]","[7, 5, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 1]","[14, 17, 13]","[17, 18, 13]","[14, 0, 15]","[14, 0, 18]","[5, 6]","[6, 5]","[17, 19, 6]",2,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",2,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,3,2,3,4,3,5,2,0,3,[2],3,2,[6],1,0,0,4,"[18, 14, 5, 1, 31, 27, 17]","[9, 8, 7, 1, 2, 3, 10, 4, 6, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[4, 1, 5, 3, 2]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
3,1,0,IND,0,0,1,6,8,"[0, 19, 12]",7,11,5,5,6,1,5,7,0.013241,[8],3,[8],[5],[0],0,0,2,"[18, 27, 14]","[14, 19]","[14, 13]","[14, 2]","[14, 2]",[5],[5],"[10, 15]",3,1,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,1,2,5,2,3,4,4,4,4,6,2,0,3,[2],3,1,[6],0,0,1,0,"[14, 5, 18, 17]","[9, 8, 6, 1, 2, 3, 10, 4, 7, 5]","[1, 11, 2, 10, 8, 5, 6, 4, 9, 7, 3]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 5, 6, 4]","[1, 4, 2, 3, 6, 7, 5]",



Actualizando clusters
El cluster  0  incluye  20918 miembros.
El cluster  1  incluye  25712 miembros.
El cluster  2  incluye  25496 miembros.
El cluster  3  incluye  26317 miembros.


El cluster  0  incluye  20918 miembros.
El cluster  1  incluye  25712 miembros.
El cluster  2  incluye  25496 miembros.
El cluster  3  incluye  26317 miembros.

Actualizando clusters
El cluster  0  incluye  20803 miembros.
El cluster  1  incluye  25680 miembros.
El cluster  2  incluye  25660 miembros.
El cluster  3  incluye  26300 miembros.


El cluster  0  incluye  20803 miembros.
El cluster  1  incluye  25680 miembros.
El cluster  2  incluye  25660 miembros.
El cluster  3  incluye  26300 miembros.

Actualizando clusters
El cluster  0  incluye  20654 miembros.
El cluster  1  incluye  25719 miembros.
El cluster  2  incluye  25765 miembros.
El cluster  3  incluye  26305 miembros.


El cluster  0  incluye  20654 miembros.
El cluster  1  incluye  25719 miembros.
El cluster  2  incluye  25765 miembros.
El cl

Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,1,0,USA,0,0,1,6,3,"[0, 12, 11]",7,0,3,3,6,2,3,7,0.138694,"[5, 8]",3,"[8, 7, 5]","[5, 7, 0]",[0],0,0,1,"[18, 31, 3, 14]","[19, 14]","[19, 14]","[22, 14]","[14, 22, 2]","[0, 5]","[0, 5]","[18, 10, 19]",3,2,"[0, 9]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",1,3,3,1,1,0,2,2,10,2,2,4,2,2,5,2,2,2,0,3,3,5,2,2,3,[0],3,1,[6],1,0,0,2,"[18, 14, 5, 31, 3, 17]","[9, 8, 7, 2, 1, 4, 10, 3, 6, 5]","[1, 2, 3, 10, 8, 4, 7, 5, 11, 9, 6]","[2, 1, 5, 3, 4]","[1, 6, 7, 2, 3, 5, 4]","[1, 5, 2, 4, 6, 7, 3]",
1,1,1,USA,0,0,1,6,4,"[0, 12, 11, 15]",9,7,3,3,2,2,3,7,0.13697,"[8, 4, 7]",3,"[8, 1, 7]","[7, 5, 3]","[0, 4]",0,1,1,"[18, 27, 14, 5, 1]","[14, 17, 13]","[17, 13, 18]","[14, 0, 2]","[14, 0, 2]","[5, 6]","[5, 6]","[17, 19, 15]",2,2,"[0, 9, 4]","[1, 4]",2,2,3,"[6, 2]",1,1,0,"[3, 2]",2,3,3,1,1,0,2,2,10,5,2,4,2,0,5,2,3,2,3,4,3,6,2,0,3,[2],3,2,[6],1,0,0,4,"[18, 14, 5, 31, 1, 27]","[9, 10, 8, 1, 2, 3, 6, 4, 7, 5]","[1, 2, 3, 10, 9, 4, 11, 5, 8, 7, 6]","[4, 1, 5, 3, 2]","[1, 5, 2, 3, 4, 7, 6]","[1, 4, 2, 3, 6, 7, 5]",
2,1,0,IND,0,0,1,6,8,"[0, 12, 19]",7,11,5,5,6,1,5,7,0.019417,[8],3,"[8, 7]","[5, 7]",[0],0,0,2,"[18, 27, 14, 5]","[14, 19]","[14, 13]","[14, 2]","[14, 2]",[5],"[5, 6]","[10, 15]",3,1,[0],[1],2,2,3,[0],1,1,0,[3],1,3,3,1,1,0,2,2,10,5,2,4,1,2,5,2,3,4,4,4,4,6,2,0,3,[2],3,1,[6],0,0,1,0,"[14, 5, 18, 17]","[9, 8, 6, 1, 2, 4, 10, 3, 7, 5]","[1, 11, 2, 10, 9, 7, 6, 3, 8, 5, 4]","[2, 1, 5, 4, 3]","[1, 3, 7, 2, 5, 6, 4]","[1, 4, 2, 3, 6, 7, 5]",


Unnamed: 0,Hobby,OpenSource,Country,Student,Employment,FormalEducation,UndergradMajor,CompanySize,DevType,YearsCoding,YearsCodingProf,JobSatisfaction,CareerSatisfaction,HopeFiveYears,JobSearchStatus,LastNewJob,UpdateCV,ConvertedSalary,CommunicationTools,TimeFullyProductive,EducationTypes,SelfTaughtTypes,HackathonReasons,AgreeDisagree1,AgreeDisagree2,AgreeDisagree3,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,FrameworkWorkedWith,FrameworkDesireNextYear,IDE,OperatingSystem,NumberMonitors,Methodology,VersionControl,CheckInCode,AdBlocker,AdBlockerDisable,AdBlockerReasons,AdsAgreeDisagree1,AdsAgreeDisagree2,AdsAgreeDisagree3,AdsActions,AIDangerous,AIInteresting,AIResponsible,AIFuture,EthicsChoice,EthicsReport,EthicsResponsible,EthicalImplications,StackOverflowRecommend,StackOverflowVisit,StackOverflowHasAccount,StackOverflowParticipate,StackOverflowJobs,StackOverflowDevStory,StackOverflowJobsRecommend,StackOverflowConsiderMember,HypotheticalTools1,HypotheticalTools2,HypotheticalTools3,HypotheticalTools4,HypotheticalTools5,WakeTime,HoursComputer,HoursOutside,SkipMeals,ErgonomicDevices,Exercise,EducationParents,RaceEthnicity,Age,Dependents,SurveyTooLong,SurveyEasy,LanguageWorkedWith,AssessJob,AssessBenefits,JobContactPriorities,JobEmailPriorities,AdsPriorities,Cluster
0,0.41072,0.529436,0.862359,0.455351,0.439989,0.714753,0.674645,0.89118,0.725253,0.86318,0.812073,0.794509,0.783342,0.777875,0.609793,0.809172,0.80646,0.379691,0.835808,0.825936,0.713147,0.732587,0.939806,0.727062,0.856831,0.828526,0.828445,0.789175,0.881632,0.824427,0.85859,0.866273,0.862053,0.772576,0.605605,0.689567,0.735293,0.701625,0.716778,0.578511,0.730003,0.86956,0.800427,0.81806,0.861125,0.765642,0.872736,0.820203,0.791483,0.617926,0.70664,0.768459,0.728412,0.561157,0.616099,0.798412,0.487895,0.826443,0.739864,0.829517,0.706457,0.739908,0.875868,0.844659,0.89313,0.849943,0.868797,0.821812,0.724266,0.785601,0.64738,0.886026,0.81981,0.864057,0.643699,0.774243,0.67302,0.738535,0.793901,0.7003,0.0056755,0.0056755,0.0056755,0.0056755,0.0056755,
1,0.313283,0.565948,0.851458,0.424773,0.437468,0.725827,0.652142,0.865414,0.722086,0.846308,0.807453,0.797508,0.77852,0.814911,0.628052,0.80552,0.803393,0.382124,0.781016,0.822857,0.674867,0.705232,0.869645,0.723722,0.849785,0.82261,0.814053,0.80625,0.859716,0.82769,0.846766,0.835556,0.831587,0.827103,0.754871,0.709989,0.689234,0.638946,0.594538,0.552187,0.698802,0.862149,0.79982,0.813579,0.865512,0.745885,0.869828,0.805142,0.78219,0.589874,0.667525,0.775816,0.720249,0.520922,0.601537,0.81489,0.427794,0.822095,0.637823,0.849565,0.782904,0.679126,0.836459,0.845567,0.853982,0.852075,0.873696,0.827498,0.709105,0.767847,0.660148,0.888561,0.830829,0.879476,0.654756,0.718765,0.665491,0.739151,0.809633,0.698777,0.00582153,0.00582153,0.00582153,0.00582153,0.00582153,
2,0.269662,0.63211,0.862817,0.667382,0.709623,0.727469,0.770996,0.683379,0.812002,0.746959,0.659293,0.512875,0.630295,0.926334,0.60719,0.900325,0.919851,0.18157,0.972715,0.960478,0.876791,0.90807,0.972861,0.889799,0.927079,0.921313,0.88267,0.875459,0.915425,0.910614,0.920578,0.944016,0.918852,0.918656,0.819617,0.805605,0.904428,0.780484,0.881894,0.803616,0.874563,0.906386,0.889206,0.899173,0.908851,0.896846,0.921609,0.921905,0.881585,0.823866,0.874272,0.885743,0.891984,0.829126,0.788098,0.897695,0.700101,0.913482,0.846945,0.871938,0.757021,0.826445,0.914596,0.917495,0.90331,0.915312,0.91392,0.923305,0.891637,0.886912,0.852681,0.94489,0.895687,0.936507,0.914187,0.897675,0.837898,0.857382,0.922891,0.83412,0.00603132,0.00603132,0.00603132,0.00603132,0.00603132,


Posibles clusters a dividir: [0, 1, 2]
Clusters a eliminar: [0]

Actualizando clusters


In [None]:
display(centroids)
for i in range(NUM_CLUSTERS):
    print("Cluster {}: {} elementos".format(i, df[df["Cluster"]==i].count()["Cluster"]))

std_devs = std_dev()
std_devs["Cluster"] = std_devs.mean(axis=1)
display(std_devs)

update_deltas()

dist_lists = []
for i, rc_i in centroids.iterrows():
    dist_lists.append([])
    for j, rc_j in centroids.iterrows():
        dist_lists[i].append(distance_qual(rc_i, rc_j))
display(np.array(dist_lists))