In [241]:
'''Load Packages'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings
from sklearn.cluster import KMeans 
import random

In [266]:
# function to transform a categorical column into randomly assigned integer codes
def categorical_transform(col, seed=None):
    """Encode the distinct values of ``col`` as a random permutation of 1..k.

    Parameters
    ----------
    col : array-like
        One-dimensional collection of category values (anything
        ``np.unique`` accepts).
    seed : int, optional
        When given, a private ``random.Random(seed)`` draws the permutation,
        so the encoding is reproducible and the global RNG is left untouched.
        When omitted, the module-level ``random`` generator is used, so a
        prior ``random.seed(...)`` call controls the result.

    Returns
    -------
    category_count : dict
        Maps each distinct value to its integer code in ``1..k``. Despite the
        name, the values are codes, not occurrence counts.
    factorval : numpy.ndarray
        Integer array with one entry per element of ``col`` holding that
        element's code.

    Notes
    -----
    The codes are drawn at random; they carry no ordering or frequency
    information about the categories.
    """
    # `inverse[i]` is the position of col[i] inside the sorted `classnames`.
    classnames, inverse = np.unique(col, return_inverse=True)
    n_classes = len(classnames)

    rng = random if seed is None else random.Random(seed)
    codes = rng.sample(range(1, n_classes + 1), n_classes)

    category_count = dict(zip(classnames, codes))

    # One vectorised lookup replaces a per-row scan over every category.
    factorval = np.asarray(codes, dtype=inverse.dtype)[inverse]

    return category_count, factorval

In [251]:
def cat_to_names(df, references):
    """Return a copy of ``df`` with encoded columns mapped back to their labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may hold integer codes.
    references : dict
        ``{column_name: {label: code}}``. Entries whose column is not present
        in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Values that have no entry
        in the column's mapping are left unchanged.
    """
    temp_df = df.copy()
    for key in df.columns[df.columns.isin(list(references))]:
        # Invert {label: code} -> {code: label}; if two labels share a code
        # the first one wins, as it did with one replace call per label.
        code_to_label = {}
        for label, code in references[key].items():
            code_to_label.setdefault(code, label)
        if not code_to_label:
            continue
        # A single dict-based replace maps all codes simultaneously, so a
        # restored label that happens to equal another code is not replaced
        # a second time, and the column is scanned once instead of per label.
        temp_df[key] = temp_df[key].replace(code_to_label)
    return temp_df

In [252]:
## load data set
# Both paths are relative to the notebook's working directory.
# Main case data; the clustering frame is derived from it in a later cell.
carmax_df = pd.read_csv('CaseDataRemodeled.csv')
# Supplementary file; the visible cells only use its 'job_assign' column.
new_df = pd.read_csv('newdata.csv')

In [253]:
# Copy the job assignment over from the supplementary file. Assigning a Series
# aligns on the index, so rows without a matching index label would get NaN.
# NOTE(review): assumes both files have the same rows in the same order - confirm.
carmax_df['job_assign'] = new_df['job_assign']

In [254]:
# Clustering frame: a copy of the case data without the columns listed below,
# which are left out of the clustering features.
cluster_df = carmax_df.copy().drop(
    columns=[
        'customer_age',
        'customer_previous_purchase',
        'customer_distance_to_dealer',
        'post_purchase_satisfaction',
        'vehicle_warranty_used',
        'subsequent_purchases',
        'purchase_price_UB',
    ]
)

In [255]:
## references of transformation for later
# categorical_transform draws its codes from the module-level `random`
# generator; seeding it here makes the category -> code assignment identical
# on every Restart & Run All.
random.seed(42)

# {column name: {label: code}}, used later to turn codes back into labels.
# NOTE: re-running this cell after the columns are already encoded resets this
# dict to empty, because no object-dtype columns remain to be transformed.
references = {}

# transform to numerical values
for i in cluster_df.columns:
    # Only object-dtype (string-like) columns are encoded; numeric ones pass through.
    if cluster_df[i].dtype == 'object':
        classes = cluster_df[i]
        references[i], cluster_df[i] = categorical_transform(classes)

In [259]:
from sklearn.preprocessing import StandardScaler

# Use the feature columns only: the clustering cell adds a "Labels" column to
# cluster_df, and re-running this cell afterwards would otherwise feed the
# previous cluster assignments back in as a feature.
X = cluster_df.drop(columns="Labels", errors="ignore").to_numpy()
# NOTE(review): missing values become 0 before scaling. The per-cluster counts
# show purchase_price_LB has a few missing entries, so those rows are treated
# as price 0 - confirm this is the intended imputation.
X = np.nan_to_num(X)
# Standardise every column to zero mean / unit variance so that no single
# feature dominates the Euclidean distances used by k-means.
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[ 1.18651596, -0.09022377,  0.99205841, ...,  0.80818504,
        -0.23298338, -1.12743222],
       [-0.13593213,  0.48405951, -1.01442395, ..., -0.29504912,
        -0.23298338,  0.05074849],
       [ 0.65753673, -0.50437037,  0.18946547, ...,  0.80818504,
        -2.02193906,  0.73059506],
       ...,
       [ 1.89182161, -1.08417561,  0.99205841, ...,  0.25656796,
         1.55597229, -0.57959469],
       [-0.66491136,  1.21847948, -2.61960984, ..., -1.94990037,
         0.66149446, -0.40138248],
       [ 0.48121031,  1.04177693,  0.99205841, ...,  0.80818504,
        -0.23298338,  1.23552965]])

In [260]:
num_clusters = 10

# A fixed random_state makes the k-means++ initialisations, and therefore the
# resulting clusters and their numbering, repeatable across runs.
k_means = KMeans(
    init="k-means++",
    n_clusters=num_clusters,
    n_init=50,
    algorithm="elkan",
    random_state=42,
)
k_means.fit(cluster_dataset)
labels = k_means.labels_
# Attach the cluster id of every row to the (encoded) clustering frame.
cluster_df["Labels"] = labels

In [261]:
# Most frequent value of every column within each cluster; when several values
# tie, the first one returned by mode() is taken.
# Alternative summary: cluster_df.groupby('Labels').mean()
cluster_df.groupby('Labels').agg(lambda x: x.mode()[0])

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,trade_in,vehicle_financing,customer_income,customer_gender,AgeBin,purchase_class,made_in,job_assign
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,23,19,2011,35001.0,1,1,5,2,8,6,4,246
1,25,496,2012,10001.0,0,1,8,2,5,7,6,246
2,18,395,2005,10001.0,0,1,8,2,5,7,6,642
3,18,104,2010,15001.0,1,1,3,2,11,7,6,529
4,6,104,2010,15001.0,0,0,5,2,8,7,7,642
5,12,104,2010,15001.0,1,1,8,2,11,7,7,397
6,37,222,2010,20001.0,0,1,3,2,8,7,4,246
7,12,104,2010,15001.0,0,1,8,2,5,7,7,618
8,38,73,2011,15001.0,0,1,8,3,5,7,8,642
9,8,268,2010,15001.0,0,1,8,2,11,2,7,246


In [262]:
# Turn the integer codes back into their original category labels, using the
# mappings recorded in `references`; cluster_df itself keeps the codes.
cluster_with_names = cat_to_names(cluster_df,references)

In [263]:
# Alternative view (most frequent label per cluster):
#cluster_with_names.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])
# Non-null entries per column in each cluster; a column whose count is lower
# than the others in the same row has missing values in that cluster.
cluster_with_names.groupby('Labels').count()

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,trade_in,vehicle_financing,customer_income,customer_gender,AgeBin,purchase_class,made_in,job_assign
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,17700,17700,17700,17700,17700,17700,17700,17700,17700,17700,17700,17700
1,42602,42602,42602,42602,42602,42602,42602,42602,42602,42602,42602,42602
2,36694,36694,36694,36694,36694,36694,36694,36694,36694,36694,36694,36694
3,37847,37847,37847,37847,37847,37847,37847,37847,37847,37847,37847,37847
4,51611,51611,51611,51611,51611,51611,51611,51611,51611,51611,51611,51611
5,39481,39481,39481,39480,39481,39481,39481,39481,39481,39481,39481,39481
6,33995,33995,33995,33993,33995,33995,33995,33995,33995,33995,33995,33995
7,35105,35105,35105,35105,35105,35105,35105,35105,35105,35105,35105,35105
8,21969,21969,21969,21969,21969,21969,21969,21969,21969,21969,21969,21969
9,38870,38870,38870,38870,38870,38870,38870,38870,38870,38870,38870,38870


In [264]:
# Rows assigned to cluster 0, shown with readable category labels.
cluster0 = cluster_with_names.loc[cluster_with_names['Labels'] == 0]
cluster0

Unnamed: 0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,trade_in,vehicle_financing,customer_income,customer_gender,AgeBin,purchase_class,made_in,job_assign,Labels
37,CHEVROLET,AVALANCHE 1500,2011,30001.0,1,1,60001 - 80000,M,Fifties,truck,UNITED STATES,Electrical and Electronic Engineering Technolo...,0
69,LAND ROVER,LR4,2010,35001.0,1,1,120001 - 140000,M,Forties,suv,UNITED KINGDOM,Podiatrists,0
79,BUICK,ENCLAVE,2012,30001.0,1,1,40001 - 60000,U,Forties,suv,UNITED STATES,"Arts, Design, Entertainment, Sports, and Media...",0
106,FORD,F250,2008,30001.0,1,1,200001+,M,Fifties,truck,UNITED STATES,"Physicians, All Other; and Ophthalmologists, E...",0
116,JEEP,GRAND CHEROKEE,2014,40001.0,1,0,80001 - 100000,M,Twenties,suv,UNITED STATES,Agricultural Engineers,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
355737,PORSCHE,PANAMERA,2011,60001.0,1,1,200001+,M,Thirties,sedan,GERMANY,General Internal Medicine Physicians,0
355760,BMW,550,2010,35001.0,0,1,20001 - 40000,U,Twenties,sedan,GERMANY,Log Graders and Scalers,0
355764,AUDI,S5,2010,40001.0,0,0,0 - 20000,U,UnderTwenty,convertible,GERMANY,Cashiers,0
355853,BMW,Z4,2011,35001.0,1,1,100001 - 120000,U,Thirties,convertible,GERMANY,Computer Hardware Engineers,0
