In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
 
%matplotlib inline

In [2]:
# Cargando datos
data = pd.read_csv("segmentacion_clientes.csv" , delimiter=";" , decimal="," , encoding='utf-8')
df=data

In [3]:
df.columns

Index(['PRODUCT_ID', 'SUBSCRIPTION_ID', 'SUBSCRIBER_ID', 'COMUNA', 'TI', 'TI2',
       'TIPO_REPARTO', 'QUEJAS', 'DAÑOS', 'CORTES', 'RECLAMOS',
       'FACTURACION_PROMEDIO', 'SALDO_PROMEDIO', 'ROTACION_CARTERA'],
      dtype='object')

In [4]:
## Se eliminan preduct_id mal formados (-)
df[df.PRODUCT_ID < 0] 
df = df[df.PRODUCT_ID > 0] 


## Convertimos las variables categoricas
#http://pbpython.com/categorical-encoding.html

df["COMUNA"] = df["COMUNA"].astype('category')
df["TIPO_REPARTO"] = df["TIPO_REPARTO"].astype('category')
df["TI"] = df["TI"].astype('category')
df["TI2"] = df["TI2"].astype('category')

# luego la encodificamos 
df["TI"] = df["TI"].cat.codes
df["TI2"] = df["TI2"].cat.codes
df["COMUNA_ENC"] = df["COMUNA"].cat.codes 

df[:3]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,PRODUCT_ID,SUBSCRIPTION_ID,SUBSCRIBER_ID,COMUNA,TI,TI2,TIPO_REPARTO,QUEJAS,DAÑOS,CORTES,RECLAMOS,FACTURACION_PROMEDIO,SALDO_PROMEDIO,ROTACION_CARTERA,COMUNA_ENC
0,122551,122551,13185036,128 - San Esteban ...,0,0,N - REPARTO NORMAL,0,0,0,0,18964.8333,33256.6667,1.753504,9
1,122581,122581,122581,128 - San Esteban ...,0,0,N - REPARTO NORMAL,0,0,0,0,28445.75,36605.0833,1.286793,9
2,109938,109938,3665521,145 - San Felipe ...,0,1,N - REPARTO NORMAL,0,0,1,0,135045.417,327823.5,2.427488,22


 ## Clustering por KMEANS

In [7]:
# Seleccion de variables 

sample=df[['QUEJAS', 'DAÑOS',
            'CORTES', 'RECLAMOS','FACTURACION_PROMEDIO',
            'SALDO_PROMEDIO', 'ROTACION_CARTERA' ,
           'COMUNA_ENC' ]].sample(frac=0.09 , random_state=42) 

In [9]:
## KMEANS
####################################################################
  
X = np.array(sample)

registros = sample.shape[0]

# Estandarización
#scaler = StandardScaler().fit(X)
#X = scaler.transform(X)

# Creando el modelo
k=5
kmeans = KMeans(n_clusters=k  ).fit(X)
 
# Prediciendo los clusters
labels = kmeans.predict(X)
  
# Obteniendo los centroides
C = kmeans.cluster_centers_

# Asignando los colores
colores=['red','cyan','yellow','orange','purple' , 'red','cyan','yellow','orange','purple'] ## misma cantidad de colores que numero de k
asignar=[]

for row in labels:
    asignar.append(colores[row])


## COORDENADAS PARALELAS 

data = [
    go.Parcoords(
        line = dict(color = labels   ,autocolorscale=True ),  
        dimensions = list([
            dict(#range = [0,8], 
                label = 'QUEJAS' ,      values = X[:, 0]),
            dict( range = [0,30],
                label = 'DAÑOS',        values = X[:, 1]),
            dict( #range = [0,8],
                label = 'CORTES',       values = X[:, 2]),
            dict( #range = [0,8],
                label = 'RECLAMOS',     values = X[:, 3]),
            dict( range = [0,500000],
                label = 'FACTURACION_PROMEDIO', values = X[:, 4]),
            dict( range = [0,500000],
                label = 'SALDO_PROMEDIO',   values = X[:, 5]),             
            dict( range = [0,3],
                label = 'ROTACION_CARTERA', values = X[:, 6]),  
            dict( #range = [0,8],
                label = 'COMUNA',        values = X[:, 7]), 
            dict(
                range = [0,k-1],#   5 CLUSTERS
                label = 'CLUSTER', values = labels)  
        ])
    )
]

layout = go.Layout(
    title='Segmentación de clientes Kmeans '+str(registros)+' Registros' ,
    plot_bgcolor = '#FFFFFF',
    paper_bgcolor = '#FFFFFF' 
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'kmeans'+str(registros))