### Grouping Similar Properties

<div class="alert alert-block alert-info">
**Goal:** Using a synthetic, low-dimensional data set, cluster similar properties.<br>

**Method:** 
1. Develop a three dimensional property data set for testing purposes. Intentionally incorporate similar and dissimilar properties. Use only three dimensions so that results can be graphically visualized.
2. Run a k-prototype clustering algorithm to cluster like properties. Test k values ranging from 2-6.
3. Visualize results using plotly.
4. When comfortable with this toy example, expand synthetic dataset to include more dimensions, and repeat analysis.

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
from kmodes.kprototypes import KPrototypes
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import pandas as pd

In [2]:
# Set up plotly's offline notebook mode so that iplot() renders figures inline.
# NOTE(review): connected=True is understood to load plotly.js from the CDN
# instead of embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)

### Import the dataset and format

In [3]:
# Source CSV; per the table printed below, its leading columns are
# Property, Manager, IRR and Location.
prop_file = '/Users/rebeccaward/Documents/professional/Levinson-consulting/toy-data/syn_prop_data.csv'

# DataFrame of the first four columns, kept only for display
data = pd.read_csv(prop_file, usecols=[0, 1, 2, 3])

# String-typed parse of the data rows; column 0 holds the property names
all_rows = np.genfromtxt(prop_file, dtype=str, delimiter=',', skip_header=1)
syms = all_rows[:, 0]

# Object-typed parse of the same four columns; dropping the name column
# leaves the clustering features [Manager, IRR, Location]
first_four = np.genfromtxt(
    prop_file,
    dtype=object,
    delimiter=',',
    skip_header=1,
    usecols=(0, 1, 2, 3),
)
X = first_four[:, 1:]

# IRR (feature column 1) is the only numeric feature: store it as floats
X[:, 1] = X[:, 1].astype(float)

# Last expression of the cell: show the table
data

Unnamed: 0,Property,Manager,IRR,Location
0,PropertyA,ManagerA,7.2,SW
1,PropertyB,ManagerA,7.3,SW
2,PropertyC,ManagerB,4.2,NE
3,PropertyD,ManagerA,4.2,NW
4,PropertyE,ManagerC,6.7,NW
5,PropertyF,ManagerC,6.5,NE
6,PropertyG,ManagerD,7.2,SW
7,PropertyH,ManagerA,6.5,SE
8,PropertyI,ManagerD,5.0,SE
9,PropertyJ,ManagerD,5.3,SE


### Plot raw data

In [4]:
# Integer codes for the categorical features so they can be placed on a 3-D
# axis.  The distinct values are sorted before numbering so that the
# category -> code mapping (and therefore the plot) is the same on every run;
# numbering a bare set() depends on its arbitrary iteration order.
managers = {name: code for code, name in enumerate(sorted(set(X[:, 0])))}
locations = {name: code for code, name in enumerate(sorted(set(X[:, 2])))}

# plot synthetic data: x = manager code, y = IRR, z = location code
trace = go.Scatter3d(
    x=[managers[k] for k in X[:, 0]],
    y=X[:, 1],
    z=[locations[k] for k in X[:, 2]],
    mode='markers',
    marker=dict(opacity=0.8),
)

# label the three scene axes with the feature each one carries
layout = go.Layout(
    scene=dict(
        xaxis=dict(title='Manager'),
        yaxis=dict(title='IRR'),
        zaxis=dict(title='Location'),
    ),
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig, filename='cluster')

### Perform k-prototype clustering

K-prototype clustering is an adaptation of the more common k-means clustering. This adaptation allows for mixed datasets containing both categorical and numerical data, which we can expect most of our datasets to be. It scales well to large datasets, although that will likely not be a factor for us.

In [5]:
training_stats = False  # set to True (or 1) for training stat output

# Cluster the mixed data: feature columns 0 (Manager) and 2 (Location) are
# categorical, column 1 (IRR) is numeric.  A fixed random_state is passed so
# that re-running the cell reproduces the same cluster assignment.
kproto = KPrototypes(n_clusters=4, init='Cao', verbose=0, random_state=0)
clusters = kproto.fit_predict(X, categorical=[0, 2])

if training_stats:
    # Print cluster centroids of the trained model.
    print(kproto.cluster_centroids_)
    # Print training statistics
    print(kproto.cost_)
    print(kproto.n_iter_)

# One line per property: its name and the cluster it was assigned to
for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))

Symbol: PropertyA, cluster:0
Symbol: PropertyB, cluster:0
Symbol: PropertyC, cluster:3
Symbol: PropertyD, cluster:3
Symbol: PropertyE, cluster:2
Symbol: PropertyF, cluster:2
Symbol: PropertyG, cluster:0
Symbol: PropertyH, cluster:1
Symbol: PropertyI, cluster:3
Symbol: PropertyJ, cluster:3
Symbol: PropertyK, cluster:2


### Plot clusters

In [6]:
# 3-D scatter of the properties, coloured by their assigned cluster label
# (x = manager code, y = IRR, z = location code)
trace = go.Scatter3d(
    x=[managers[m] for m in X[:, 0]],
    y=X[:, 1],
    z=[locations[loc] for loc in X[:, 2]],
    mode='markers',
    marker={'color': clusters, 'opacity': 0.8},
)

# axis captions for the single 3-D scene
layout = go.Layout(
    scene={
        'xaxis': {'title': 'Manager'},
        'yaxis': {'title': 'IRR'},
        'zaxis': {'title': 'Location'},
    },
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig, filename='cluster')

### Side-by-side with different number of clusters

In [7]:
# Re-fit the model for every k from 2 to 6; cluster_list[i] holds the labels
# for k = i + 2.  The seed is fixed so that the side-by-side comparison is
# reproducible when the cell is re-run.
cluster_list = []
for num_clusters in range(2, 7):
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao', verbose=0,
                         random_state=0)
    cluster_list.append(kproto.fit_predict(X, categorical=[0, 2]))
    


In [9]:
# plot clustered data: one 3-D panel per entry of cluster_list

# Shared look for all three axes of every panel (light grey walls, white grid)
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinecolor='rgb(255, 255, 255)',
    showbackground=True,
    backgroundcolor='rgb(230, 230,230)',
)
scene = dict(
    xaxis=dict(axis_style),
    yaxis=dict(axis_style),
    zaxis=dict(axis_style),
)

# The coordinates are identical in every panel; only the colouring changes
manager_codes = [managers[k] for k in X[:, 0]]
location_codes = [locations[k] for k in X[:, 2]]

# Two panels per row, with as many rows as the number of clusterings requires
n_cols = 2
n_rows = -(-len(cluster_list) // n_cols)  # ceiling division

fig = tools.make_subplots(
    rows=n_rows, cols=n_cols,
    specs=[[{'is_3d': True} for _ in range(n_cols)] for _ in range(n_rows)],
)

for idx, labels in enumerate(cluster_list):
    panel_trace = go.Scatter3d(
        x=manager_codes,
        y=X[:, 1],
        z=location_codes,
        mode='markers',
        marker=dict(color=labels, opacity=0.8),
    )
    # fill the grid row by row: (1,1), (1,2), (2,1), ...
    fig.append_trace(panel_trace, idx // n_cols + 1, idx % n_cols + 1)
    # The grid printout of make_subplots numbers the scenes scene1, scene2, ...
    # in the same row-major order; apply the shared axis styling to the scene
    # of this panel (the styling dict was previously defined but never used).
    fig['layout']['scene{}'.format(idx + 1)].update(scene)

# 400 px per row reproduces the 1200 px height used for the 3-row grid
fig['layout'].update(height=400 * n_rows, width=1000, title='X-axis: Manager, Y-axis: IRR, Z-axis: Location')
iplot(fig, filename='clusters-many')

This is the format of your plot grid:
[ (1,1) scene1 ]  [ (1,2) scene2 ]
[ (2,1) scene3 ]  [ (2,2) scene4 ]
[ (3,1) scene5 ]  [ (3,2) scene6 ]

