In [58]:
# Load relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings
from sklearn.cluster import KMeans 

In [68]:
# function to transform into numerical bins by normal/gaussian distribution
def categorical_transform(col):
    classnames, factorval, rank = np.unique(col, return_inverse = True,return_counts = True)
    
    temp = np.array(rank)
    temp.sort()
    temp = np.concatenate((temp[::2], temp[~np.isin(temp,temp[::2])][::-1]))

    category_count = {}
    for i in range(len(classnames)):
        category_count[classnames[i]] = rank[i]
    
    vals = [i * 5 for i in range(len(temp))]
    
    for i in range(len(temp)):
        for key, value in category_count.items():
            if value == temp[i]:
                category_count[key] = vals[i]
            
    check_list = list(col)
    for i in range(len(check_list)):
        for key, value in category_count.items():
            if check_list[i] == key:
                factorval[i] = value 
                break
    
    return category_count, factorval

In [73]:
## load data set
carmax_df = pd.read_csv('CaseDataRemodeled.csv')
top10_df = pd.read_csv('Top10ByIncome.csv')

In [74]:
top10_df.columns
top10_df = top10_df.drop('Unnamed: 0', axis=1)

In [75]:
cluster_df = carmax_df[carmax_df['purchase_model'].isin(list(set(top10_df['purchase_model'])))]
cluster_df = cluster_df.drop('insert_num', axis=1)

In [76]:
## references of transformation for later 
references = {}

for i in cluster_df.columns:
    if cluster_df[i].dtype == 'object':
        classes = cluster_df[i]
        references[i], cluster_df[i] = categorical_transform(classes)

In [77]:
cluster_df = cluster_df.drop('customer_age', axis=1)
cluster_df = cluster_df.drop('post_purchase_satisfaction', axis=1)

In [78]:
from sklearn.preprocessing import StandardScaler

X = cluster_df.values[:,:]
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[-0.52362348,  0.46550396, -1.30434939, ..., -0.44335392,
         1.27463107, -1.64014046],
       [ 2.24695256, -1.39124196,  0.0304836 , ..., -0.44335392,
        -0.86425427,  0.19713047],
       [ 0.39990186,  0.03702413, -0.41446073, ...,  0.39398434,
        -0.15129249,  0.19713047],
       ...,
       [ 0.86166454,  0.17985074, -1.30434939, ..., -0.44335392,
        -0.15129249,  1.11576594],
       [-0.06186081, -1.10558875,  0.47542793, ..., -0.44335392,
         1.27463107, -0.721505  ],
       [-0.98538616, -0.24862909,  0.92037227, ..., -0.44335392,
         0.56166929,  0.19713047]])

In [81]:
num_clusters = 4

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=50,algorithm = "elkan")
k_means.fit(cluster_dataset)
labels = k_means.labels_
cluster_df["Labels"] = labels

In [82]:
cluster_df.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])
#cluster_df.groupby('Labels').mean()

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,30,75,2010,15001.0,20000.0,0,1,30,5,0,5.0,0,0,25,15
1,20,90,2011,20001.0,25000.0,1,1,25,5,0,5.0,0,0,20,5
2,30,75,2010,15001.0,20000.0,0,1,30,5,0,6.0,1,0,25,15
3,20,55,2012,15001.0,20000.0,1,1,25,5,0,2392.0,0,0,25,15


In [98]:
cluster_df.columns

Index(['purchase_make', 'purchase_model', 'purchase_vehicle_year',
       'purchase_price_LB', 'purchase_price_UB', 'trade_in',
       'vehicle_financing', 'customer_income', 'customer_gender',
       'customer_previous_purchase', 'customer_distance_to_dealer',
       'vehicle_warranty_used', 'subsequent_purchases', 'AgeBin',
       'purchase_class', 'Labels'],
      dtype='object')

In [99]:
references.keys()

dict_keys(['purchase_make', 'purchase_model', 'customer_age', 'customer_income', 'customer_gender', 'post_purchase_satisfaction', 'AgeBin', 'purchase_class'])

In [110]:
cluster_df.columns[cluster_df.columns.isin(references.keys())].to_list()

['purchase_make',
 'purchase_model',
 'customer_income',
 'customer_gender',
 'AgeBin',
 'purchase_class']

In [117]:
def cat_to_names(df,references):
    ref = df.columns[df.columns.isin(references.keys())].to_list()
    temp_df = df.copy()
    for key in ref:
        vals = references[key]
        for label_key in vals.keys():
            temp_df[key] = temp_df[key].replace(vals[label_key],label_key)
    return temp_df

In [118]:
### Clusters by Gender and Puchase Model
cluster_with_names.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,FORD,F150,2011,20001.0,25000.0,1,1,40001 - 60000,M,0,5.0,0,0,Forties,truck
1,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,5.0,0,0,Twenties,sedan
2,FORD,F150,2012,15001.0,20000.0,1,1,40001 - 60000,M,0,2392.0,0,0,Twenties,sedan
3,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,6.0,1,0,Twenties,sedan


In [119]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index()
cluster1 = counts_of_puchase_models[counts_of_puchase_models.Labels==0]
cluster1[cluster1.customer_gender=='F'].sort_values(by='purchase_vehicle_year',ascending=False).head(10)

Unnamed: 0,Labels,customer_gender,purchase_model,purchase_make,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
2,0,F,ALTIMA,3564,3564,3564,3564,3564,3564,3564,3564,3518,3564,3564,3564,3564
5,0,F,CAMRY,2818,2818,2818,2818,2818,2818,2818,2818,2780,2818,2818,2818,2818
7,0,F,COROLLA,2391,2391,2391,2391,2391,2391,2391,2391,2363,2391,2391,2391,2391
1,0,F,ACCORD,2262,2262,2262,2262,2262,2262,2262,2262,2236,2262,2262,2262,2262
17,0,F,MALIBU,2200,2200,2200,2200,2200,2200,2200,2200,2155,2200,2200,2200,2200
6,0,F,CIVIC,2055,2055,2055,2055,2055,2055,2055,2055,2030,2055,2055,2055,2055
27,0,F,VERSA,1975,1975,1975,1975,1975,1975,1975,1975,1947,1975,1975,1975,1975
23,0,F,SENTRA,1781,1781,1781,1781,1781,1781,1781,1781,1755,1781,1781,1781,1781
25,0,F,SONATA,1643,1643,1643,1643,1643,1643,1643,1643,1621,1643,1643,1643,1643
22,0,F,ROGUE,1612,1612,1612,1612,1612,1612,1612,1612,1596,1612,1612,1612,1612


In [14]:
#### Let's Filter out to only keep top 10

In [120]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})

top10_dfs = []
for i in counts_of_puchase_models['Labels'].unique():
    temp_df=counts_of_puchase_models[counts_of_puchase_models.Labels==i]
    for j in counts_of_puchase_models['customer_gender'].unique():
        temp_df2=temp_df[temp_df.customer_gender==j].sort_values(by='count',ascending=False).head(10)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)
top10_dfs[['Labels','customer_gender','purchase_model','count']]

Unnamed: 0,Labels,customer_gender,purchase_model,count
2,0,F,ALTIMA,3564
5,0,F,CAMRY,2818
7,0,F,COROLLA,2391
1,0,F,ACCORD,2262
17,0,F,MALIBU,2200
...,...,...,...,...
311,3,U,FOCUS,1
312,3,U,IMPALA,1
314,3,U,RAV4,1
315,3,U,SENTRA,1


In [16]:
print("Simplified from",cluster_with_names.shape[0],"rows to ",top10_dfs.shape[0],"rows.")

Simplified from 137788 rows to  118 rows.


In [17]:
### Let's Just Look at the top within this segmentation

In [18]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})

top10_dfs = []
for i in counts_of_puchase_models['Labels'].unique():
    temp_df=counts_of_puchase_models[counts_of_puchase_models.Labels==i]
    for j in counts_of_puchase_models['customer_gender'].unique():
        temp_df2=temp_df[temp_df.customer_gender==j].sort_values(by='count',ascending=False).head(1)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)[['Labels','customer_gender','purchase_model','count']]
top10_dfs

Unnamed: 0,Labels,customer_gender,purchase_model,count
4,0,F,C300,1122
41,0,M,F150,2866
62,0,U,C300,350
88,1,F,ALTIMA,3574
118,1,M,ALTIMA,3179
148,1,U,ALTIMA,1079
191,2,F,MALIBU,3
218,2,M,WRANGLER,6
219,2,U,CIVIC,2
229,3,F,ALTIMA,1320


In [19]:
counts_of_income_models = cluster_with_names.groupby(['Labels','customer_income','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})
top10_dfs = []
for i in counts_of_income_models['Labels'].unique():
    temp_df=counts_of_income_models[counts_of_income_models.Labels==i]
    for j in counts_of_income_models['customer_income'].unique():
        temp_df2=ta emp_df[temp_df.customer_income==j].sort_values(by='count',ascending=False).head(1)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)[['Labels','customer_income','purchase_model','count']]
top10_dfs

Unnamed: 0,Labels,customer_income,purchase_model,count
12,0,0 - 20000,F150,238
40,0,100001 - 120000,F150,258
66,0,120001 - 140000,F150,131
91,0,140001 - 160000,F150,148
117,0,160001 - 180000,F150,66
142,0,180001 - 200000,F150,44
168,0,200001+,F150,217
187,0,20001 - 40000,C300,266
224,0,40001 - 60000,F150,561
253,0,60001 - 80000,F150,480


In [121]:
k_means.inertia_

1631905.6355266874

In [122]:
k_means.inertia_/len(cluster_df)

11.843597668350563

In [135]:
cluster_df.shape

(137788, 16)

In [130]:
cluster_df_2

Unnamed: 0,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class,Labels
1,90,2007,15001.0,20000.0,0,0,40,10,1,19.0,0,0,35,5,1
2,25,2010,25001.0,30000.0,1,1,20,10,1,21.0,0,0,20,15,2
5,75,2009,15001.0,20000.0,1,1,20,0,0,2.0,1,1,25,15,0
8,50,2010,15001.0,20000.0,1,1,25,0,1,14.0,1,0,20,10,0
10,45,2009,10001.0,15000.0,0,1,30,0,0,,0,1,15,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355854,100,2013,20001.0,25000.0,1,1,45,5,1,4.0,0,2,30,15,2
355860,95,2011,20001.0,25000.0,1,0,35,5,0,7.0,0,1,20,15,2
355864,80,2007,15001.0,20000.0,0,1,30,10,0,12.0,0,0,25,20,1
355866,35,2011,20001.0,25000.0,1,1,25,10,1,3.0,0,0,35,10,2


In [136]:
references_2 = {}
for i in cluster_df_2.columns:
    if cluster_df_2[i].dtype == 'object':
        classes = cluster_df_2[i]
        references_2[i], cluster_df_2[i] = categorical_transform(classes)

In [125]:
cluster_df_2 = cluster_df.copy()
cluster_df_2= cluster_df_2.drop('purchase_make', axis=1)

## references of transformation for later 
references_2 = {}

for i in cluster_df_2.columns:
    if cluster_df_2[i].dtype == 'object':
        classes = cluster_df_2[i]
        references_2[i], cluster_df_2[i] = categorical_transform(classes)
        
        
X = cluster_df_2.values[:,:]
X = np.nan_to_num(X)
cluster_dataset_2 = StandardScaler().fit_transform(X)
cluster_dataset_2



num_clusters = 4

k_means_2 = KMeans(init="k-means++", n_clusters=num_clusters, n_init=50,algorithm = "elkan")
k_means_2.fit(cluster_dataset_2)
labels_2 = k_means_2.labels_
cluster_df_2["Labels"] = labels_2

In [126]:
cluster_df_2.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])
#cluster_df.groupby('Labels').mean()

Unnamed: 0_level_0,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,75,2010,15001.0,20000.0,0,1,30,5,0,6.0,1,0,25,15
1,75,2010,15001.0,20000.0,0,1,30,5,0,5.0,0,0,25,15
2,90,2011,20001.0,25000.0,1,1,25,5,0,5.0,0,0,20,5
3,55,2012,15001.0,20000.0,1,1,25,5,0,2392.0,0,0,25,15


In [137]:
cluster_with_names_2 = cat_to_names(cluster_df_2,references)

In [138]:
### Clusters by Gender and Puchase Model
cluster_with_names_2.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])

Unnamed: 0_level_0,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,6.0,1,0,Twenties,sedan
1,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,5.0,0,0,Twenties,sedan
2,F150,2011,20001.0,25000.0,1,1,40001 - 60000,M,0,5.0,0,0,Forties,truck
3,F150,2012,15001.0,20000.0,1,1,40001 - 60000,M,0,2392.0,0,0,Twenties,sedan


In [144]:
k_means.inertia_/len(cluster_df.columns)

101994.10222041796

In [143]:
k_means_2.inertia_/len(cluster_df_2.columns)

100253.3242158036

In [139]:
k_means_2.inertia_

1503799.863237054

In [140]:
k_means.inertia_

1631905.6355266874

In [146]:
cluster_dataset[1].sum()

6.430084222761828

In [151]:
k_means.cluster_centers_

array([[ 1.69708886e-01, -2.29108230e-02, -5.49082725e-02,
        -4.47414903e-01, -4.47414903e-01, -1.15740967e-01,
         3.16246489e-02,  4.87908485e-02,  2.96015185e-02,
        -7.76794647e-02, -2.19453893e-02, -5.35167099e-01,
        -9.82812162e-03, -5.68984801e-03,  9.12125172e-02],
       [-4.96689544e-01,  9.54727234e-02,  4.51809752e-01,
         1.41761265e+00,  1.41761265e+00,  2.92957408e-01,
        -1.54865117e-01, -1.28844059e-01, -8.93001069e-02,
         2.36106390e-01, -1.23050806e-02, -3.01284060e-01,
         9.71536002e-03,  1.46637709e-02, -2.63362965e-01],
       [ 7.84437459e-02, -4.20073523e-02, -3.54879178e-01,
        -3.27793508e-01, -3.27793508e-01, -1.91276184e-03,
         8.48871508e-02,  6.97994951e-03,  1.65952035e-02,
        -4.52275020e-02, -4.60877720e-02,  1.86848822e+00,
         1.69759406e-02, -5.98764073e-04,  3.78595199e-02],
       [-1.99913568e-01, -1.85314200e-01,  7.63541512e-02,
         1.23400002e-01,  1.23400002e-01,  2.01553756

In [152]:
cluster_dataset

array([[-0.52362348,  0.46550396, -1.30434939, ..., -0.44335392,
         1.27463107, -1.64014046],
       [ 2.24695256, -1.39124196,  0.0304836 , ..., -0.44335392,
        -0.86425427,  0.19713047],
       [ 0.39990186,  0.03702413, -0.41446073, ...,  0.39398434,
        -0.15129249,  0.19713047],
       ...,
       [ 0.86166454,  0.17985074, -1.30434939, ..., -0.44335392,
        -0.15129249,  1.11576594],
       [-0.06186081, -1.10558875,  0.47542793, ..., -0.44335392,
         1.27463107, -0.721505  ],
       [-0.98538616, -0.24862909,  0.92037227, ..., -0.44335392,
         0.56166929,  0.19713047]])

In [153]:
from scipy.cluster.vq import whiten

In [157]:
help(whiten)

Help on function whiten in module scipy.cluster.vq:

whiten(obs, check_finite=True)
    Normalize a group of observations on a per feature basis.
    
    Before running k-means, it is beneficial to rescale each feature
    dimension of the observation set with whitening. Each feature is
    divided by its standard deviation across all observations to give
    it unit variance.
    
    Parameters
    ----------
    obs : ndarray
        Each row of the array is an observation.  The
        columns are the features seen during each observation.
    
        >>> #         f0    f1    f2
        >>> obs = [[  1.,   1.,   1.],  #o0
        ...        [  2.,   2.,   2.],  #o1
        ...        [  3.,   3.,   3.],  #o2
        ...        [  4.,   4.,   4.]]  #o3
    
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if th

In [158]:
help(StandardScaler)

Help on class StandardScaler in module sklearn.preprocessing._data:

class StandardScaler(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  StandardScaler(*, copy=True, with_mean=True, with_std=True)
 |  
 |  Standardize features by removing the mean and scaling to unit variance
 |  
 |  The standard score of a sample `x` is calculated as:
 |  
 |      z = (x - u) / s
 |  
 |  where `u` is the mean of the training samples or zero if `with_mean=False`,
 |  and `s` is the standard deviation of the training samples or one if
 |  `with_std=False`.
 |  
 |  Centering and scaling happen independently on each feature by computing
 |  the relevant statistics on the samples in the training set. Mean and
 |  standard deviation are then stored to be used on later data using
 |  :meth:`transform`.
 |  
 |  Standardization of a dataset is a common requirement for many
 |  machine learning estimators: they might behave badly if the
 |  individual features do not more or less look like s

In [160]:
cluster_df_2 = cluster_df.copy()

## references of transformation for later 
references_2 = {}

for i in cluster_df_2.columns:
    if cluster_df_2[i].dtype == 'object':
        classes = cluster_df_2[i]
        references_2[i], cluster_df_2[i] = categorical_transform(classes)
        
        
X = cluster_df_2.values[:,:]
X = np.nan_to_num(X)
cluster_dataset_2 = whiten(X)
cluster_dataset_2



num_clusters = 4

k_means_2 = KMeans(init="k-means++", n_clusters=num_clusters, n_init=50,algorithm = "elkan")
k_means_2.fit(cluster_dataset_2)
labels_2 = k_means_2.labels_
cluster_df_2["Labels"] = labels_2

In [161]:
cluster_with_names_2 = cat_to_names(cluster_df_2,references)
cluster_with_names_2.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,FORD,F150,2011,20001.0,25000.0,1,1,40001 - 60000,M,0,5.0,0,0,Forties,truck
1,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,5.0,0,0,Twenties,sedan
2,FORD,F150,2012,15001.0,20000.0,1,1,40001 - 60000,M,0,2392.0,0,0,Twenties,sedan
3,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,6.0,1,0,Twenties,sedan


In [162]:
cluster_with_names.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,5.0,0,0,Twenties,sedan
1,FORD,F150,2011,20001.0,25000.0,1,1,40001 - 60000,M,0,5.0,0,0,Forties,truck
2,NISSAN,ALTIMA,2010,15001.0,20000.0,0,1,20001 - 40000,M,0,6.0,1,0,Twenties,sedan
3,FORD,F150,2012,15001.0,20000.0,1,1,40001 - 60000,M,0,2392.0,0,0,Twenties,sedan


In [163]:
k_means_2.inertia_

1631904.6465359956

In [164]:
k_means.inertia_

1631905.6355266874