In [1]:
# Load relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings
from sklearn.cluster import KMeans 

In [2]:
## load data set
carmax_df = pd.read_csv('CaseDataRemodeled.csv')
top10_df = pd.read_csv('Top10ByIncome.csv')

In [3]:
top10_df.columns
top10_df = top10_df.drop('Unnamed: 0', axis=1)

In [4]:
cluster_df = carmax_df[carmax_df['purchase_model'].isin(list(set(top10_df['purchase_model'])))]
cluster_df = cluster_df.drop('insert_num', axis=1)

In [5]:
cluster_df = cluster_df.drop('customer_age', axis=1)
cluster_df = cluster_df.drop('post_purchase_satisfaction', axis=1)
cluster_df = cluster_df.drop('purchase_make', axis=1)

In [6]:
# function to transform into numerical bins by normal/gaussian distribution
def categorical_transform(col):
    classnames, factorval, rank = np.unique(col, return_inverse = True,return_counts = True)
    
    temp = np.array(rank)
    temp.sort()
    temp = np.concatenate((temp[::2], temp[~np.isin(temp,temp[::2])][::-1]))

    category_count = {}
    for i in range(len(classnames)):
        category_count[classnames[i]] = rank[i]
    
    vals = [i * 5 for i in range(len(temp))]
    
    for i in range(len(temp)):
        for key, value in category_count.items():
            if value == temp[i]:
                category_count[key] = vals[i]
            
    check_list = list(col)
    for i in range(len(check_list)):
        for key, value in category_count.items():
            if check_list[i] == key:
                factorval[i] = value 
                break
    
    return category_count, factorval

In [7]:
## references of transformation for later 
references = {}

for i in cluster_df.columns:
    if cluster_df[i].dtype == 'object':
        classes = cluster_df[i]
        references[i], cluster_df[i] = categorical_transform(classes)

In [8]:
from sklearn.preprocessing import StandardScaler

X = cluster_df.values[:,:]
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[ 0.46550396, -1.30434939, -0.09561966, ..., -0.44335392,
         1.27463107, -1.64014046],
       [-1.39124196,  0.0304836 ,  1.60397292, ..., -0.44335392,
        -0.86425427,  0.19713047],
       [ 0.03702413, -0.41446073, -0.09561966, ...,  0.39398434,
        -0.15129249,  0.19713047],
       ...,
       [ 0.17985074, -1.30434939, -0.09561966, ..., -0.44335392,
        -0.15129249,  1.11576594],
       [-1.10558875,  0.47542793,  0.75417663, ..., -0.44335392,
         1.27463107, -0.721505  ],
       [-0.24862909,  0.92037227, -0.09561966, ..., -0.44335392,
         0.56166929,  0.19713047]])

In [11]:
num_clusters = 4

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=50,algorithm = "elkan")
k_means.fit(cluster_dataset)
labels = k_means.labels_
cluster_df["Labels"] = labels

In [12]:
cluster_df.groupby('Labels').agg(lambda x: pd.Series.mode(x)[0])
#cluster_df.groupby('Labels').mean()

Unnamed: 0_level_0,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,90,2011,20001.0,25000.0,1,1,25,5,0,5.0,0,0,20,5
1,75,2010,15001.0,20000.0,0,1,30,5,0,5.0,0,0,25,15
2,75,2010,15001.0,20000.0,0,1,30,5,0,6.0,1,0,25,15
3,55,2012,15001.0,20000.0,1,1,25,5,0,2392.0,0,0,25,15


In [13]:
def cat_to_names(df,references):
    temp_df = df.copy()
    for key in references.keys():
        vals = references[key]
        for label_key in vals.keys():
            temp_df[key] = temp_df[key].replace(vals[label_key],label_key)
    return temp_df

In [14]:
### Clusters by Gender and Puchase Model


In [16]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index()
cluster1 = counts_of_puchase_models[counts_of_puchase_models.Labels==0]
cluster1[cluster1.customer_gender=='F'].sort_values(by='purchase_vehicle_year',ascending=False).head(10)

Unnamed: 0,Labels,customer_gender,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,purchase_class
10,0,F,EDGE,996,996,996,996,996,996,996,986,996,996,996,996
4,0,F,C300,905,905,905,905,905,905,905,892,905,905,905,905
0,0,F,328,781,781,781,781,781,781,781,771,781,781,781,781
14,0,F,G37,755,755,755,755,755,755,755,745,755,755,755,755
8,0,F,CR-V,586,586,586,586,586,586,586,581,586,586,586,586
21,0,F,RAV4,522,522,522,522,522,522,522,518,522,522,522,522
5,0,F,CAMRY,521,521,521,521,521,521,521,512,521,521,521,521
11,0,F,ESCAPE,517,517,517,517,517,517,517,511,517,517,517,517
2,0,F,ALTIMA,477,477,477,477,477,477,477,473,477,477,477,477
9,0,F,E350,443,443,443,443,443,443,443,434,443,443,443,443


In [None]:
#### Let's Filter out to only keep top 10

In [17]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})

top10_dfs = []
for i in counts_of_puchase_models['Labels'].unique():
    temp_df=counts_of_puchase_models[counts_of_puchase_models.Labels==i]
    for j in counts_of_puchase_models['customer_gender'].unique():
        temp_df2=temp_df[temp_df.customer_gender==j].sort_values(by='count',ascending=False).head(10)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)
top10_dfs[['Labels','customer_gender','purchase_model','count']]

Unnamed: 0,Labels,customer_gender,purchase_model,count
10,0,F,EDGE,996
4,0,F,C300,905
0,0,F,328,781
14,0,F,G37,755
8,0,F,CR-V,586
...,...,...,...,...
311,3,U,FOCUS,1
312,3,U,IMPALA,1
314,3,U,RAV4,1
315,3,U,SENTRA,1


In [None]:
print("Simplified from",cluster_with_names.shape[0],"rows to ",top10_dfs.shape[0],"rows.")

In [None]:
### Let's Just Look at the top within this segmentation

In [18]:
cluster_with_names = cat_to_names(cluster_df,references)
counts_of_puchase_models = cluster_with_names.groupby(['Labels','customer_gender','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})

top10_dfs = []
for i in counts_of_puchase_models['Labels'].unique():
    temp_df=counts_of_puchase_models[counts_of_puchase_models.Labels==i]
    for j in counts_of_puchase_models['customer_gender'].unique():
        temp_df2=temp_df[temp_df.customer_gender==j].sort_values(by='count',ascending=False).head(1)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)[['Labels','customer_gender','purchase_model','count']]
top10_dfs

Unnamed: 0,Labels,customer_gender,purchase_model,count
10,0,F,EDGE,996
41,0,M,F150,2854
62,0,U,C300,288
88,1,F,ALTIMA,3532
118,1,M,ALTIMA,3158
148,1,U,ALTIMA,1076
178,2,F,ALTIMA,1320
208,2,M,ALTIMA,1007
238,2,U,ALTIMA,430
281,3,F,MALIBU,3


In [19]:
counts_of_income_models = cluster_with_names.groupby(['Labels','customer_income','purchase_model']).count().reset_index().rename(columns={"purchase_vehicle_year":"count"})
top10_dfs = []
for i in counts_of_income_models['Labels'].unique():
    temp_df=counts_of_income_models[counts_of_income_models.Labels==i]
    for j in counts_of_income_models['customer_income'].unique():
        temp_df2=temp_df[temp_df.customer_income==j].sort_values(by='count',ascending=False).head(1)
        top10_dfs.append( temp_df2)
top10_dfs = pd.concat(top10_dfs)[['Labels','customer_income','purchase_model','count']]
top10_dfs

Unnamed: 0,Labels,customer_income,purchase_model,count
12,0,0 - 20000,F150,236
40,0,100001 - 120000,F150,253
66,0,120001 - 140000,F150,131
91,0,140001 - 160000,F150,148
116,0,160001 - 180000,F150,65
141,0,180001 - 200000,F150,44
166,0,200001+,F150,217
193,0,20001 - 40000,F150,261
222,0,40001 - 60000,F150,559
251,0,60001 - 80000,F150,480


In [20]:
test_df = pd.read_csv('CaseDataRemodeled.csv')

In [22]:
test_df = test_df.drop('customer_age', axis=1)
test_df = test_df.drop('post_purchase_satisfaction', axis=1)

In [24]:
test_df.dtypes

insert_num                       int64
purchase_make                    int64
purchase_model                   int64
purchase_vehicle_year            int64
purchase_price_LB              float64
purchase_price_UB              float64
trade_in                         int64
vehicle_financing                int64
customer_income                  int64
customer_gender                  int64
customer_previous_purchase       int64
customer_distance_to_dealer    float64
vehicle_warranty_used            int64
subsequent_purchases             int64
AgeBin                           int64
purchase_class                  object
dtype: object

In [23]:
## references of transformation for later 
test_references = {}
X\d = np.nan_to_num(X)
for i in test_df.columns:
    if test_df[i].dtype == 'object':
        classes = test_df[i]
        test_references[i], test_df[i] = categorical_transform(classes)

TypeError: '<' not supported between instances of 'str' and 'float'