In [1]:
# Load relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings
from sklearn.cluster import KMeans 

In [2]:
## Load the data sets (paths are relative to the notebook's working directory)
# carmax_df: the main data set; it is filtered further down to build cluster_df.
carmax_df = pd.read_csv('CaseDataRemodeled.csv')
# top10_df: supplies the purchase_model values used for that filter.
top10_df = pd.read_csv('Top10ByIncome.csv')

## Clustering - starting with k-means




In general, we can cluster observations on the basis of the features in order to identify subgroups among the observations, or we can cluster features on the basis of the observations in order to discover subgroups among the features. In what follows, for simplicity we will discuss clustering observations on the basis of the features, though the converse can be performed by simply transposing the data matrix.

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index that was written into the
# CSV - confirm). errors='ignore' keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
top10_df = top10_df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Preview the purchase_model values that are used below to filter carmax_df.
top10_df['purchase_model']

0        CAMRY
1       ALTIMA
2      COROLLA
3       MALIBU
4       ACCORD
        ...   
115     ACCORD
116    COROLLA
117      CIVIC
118       CR-V
119      FOCUS
Name: purchase_model, Length: 120, dtype: object

In [5]:
# Keep only the purchases whose model appears in top10_df, then discard the
# insert_num column.
wanted_models = list(set(top10_df['purchase_model']))
is_wanted = carmax_df['purchase_model'].isin(wanted_models)
cluster_df = carmax_df[is_wanted].drop(columns='insert_num')

In [6]:
# Display the filtered frame to check which rows and columns remain.
cluster_df

Unnamed: 0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_age,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,post_purchase_satisfaction,vehicle_warranty_used,subsequent_purchases,AgeBin
1,FORD,F150,2007,15001.0,20000.0,0,0,51 - 60,0 - 20000,F,1,19.0,?,0,0,Fifties
2,BMW,328,2010,25001.0,30000.0,1,1,41 - 50,60001 - 80000,F,1,21.0,?,0,0,Forties
5,NISSAN,ALTIMA,2009,15001.0,20000.0,1,1,21 - 30,60001 - 80000,U,0,2.0,?,1,1,Twenties
8,FORD,ESCAPE,2010,15001.0,20000.0,1,1,41 - 50,40001 - 60000,U,1,14.0,?,1,0,Forties
10,NISSAN,SENTRA,2009,10001.0,15000.0,0,1,61 - 70,20001 - 40000,U,0,,?,0,1,Sixties
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355854,FORD,FOCUS,2013,20001.0,25000.0,1,1,31 - 40,100001 - 120000,M,1,4.0,?,0,2,Thirties
355860,HYUNDAI,SONATA,2011,20001.0,25000.0,1,0,41 - 50,?,M,0,7.0,?,0,1,Forties
355864,HONDA,ACCORD,2007,15001.0,20000.0,0,1,21 - 30,20001 - 40000,F,0,12.0,?,0,0,Twenties
355866,TOYOTA,RAV4,2011,20001.0,25000.0,1,1,51 - 60,40001 - 60000,F,1,3.0,?,0,0,Fifties


In [7]:
# Number of distinct models used for the filter above.
len(set(top10_df['purchase_model']))

30

In [8]:
# Check the column dtypes; object columns cannot be fed to the scaler as-is.
cluster_df.dtypes

purchase_make                   object
purchase_model                  object
purchase_vehicle_year            int64
purchase_price_LB              float64
purchase_price_UB              float64
trade_in                         int64
vehicle_financing                int64
customer_age                    object
customer_income                 object
customer_gender                 object
customer_previous_purchase       int64
customer_distance_to_dealer    float64
post_purchase_satisfaction      object
vehicle_warranty_used            int64
subsequent_purchases             int64
AgeBin                          object
dtype: object

In [9]:
# Columns left out of the clustering. NOTE(review): customer_age looks redundant
# with AgeBin, and post_purchase_satisfaction shows only '?' in the previewed
# rows - confirm both reasons against the full data.
for unused_column in ('customer_age', 'post_purchase_satisfaction'):
    cluster_df = cluster_df.drop(columns=unused_column)

In [None]:
# function to transform into numerical bins by normal/gaussian distribution
def categorical_transform(col, step=5):
    """Encode a categorical column as integer codes laid out in a bell shape.

    Categories are ranked by how often they occur (ascending). Ranks
    0, 2, 4, ... receive the lowest codes in ascending order and ranks
    1, 3, 5, ... receive the highest codes in descending order, so the most
    frequent categories end up in the middle of the code range and the rarest
    at both ends. Codes are ``0, step, 2 * step, ...``.

    Categories with equal counts are ranked in the sorted order returned by
    ``np.unique``, so every category always gets its own distinct code.

    Parameters
    ----------
    col : array-like
        One-dimensional collection of category values that ``np.unique`` can
        sort (e.g. a pandas Series or a list of strings).
    step : int, default 5
        Spacing between consecutive codes.

    Returns
    -------
    category_count : dict
        Maps every distinct category to its integer code.
    factorval : numpy.ndarray
        The code of each element of ``col``, in the original order.
    """
    classnames, inverse, counts = np.unique(col, return_inverse=True, return_counts=True)

    # Rank category *positions* instead of matching on count values: matching
    # on values gives tied categories the same code, and can also re-match a
    # code that was already assigned when it happens to equal a later count.
    ascending = np.argsort(counts, kind="stable")
    layout = np.concatenate((ascending[::2], ascending[1::2][::-1]))

    # codes[k] is the code of classnames[k].
    codes = np.empty(len(classnames), dtype=np.int64)
    codes[layout] = np.arange(len(layout), dtype=np.int64) * step

    category_count = dict(zip(classnames.tolist(), codes.tolist()))
    # inverse holds, for each row, the index of its category in classnames.
    factorval = codes[inverse]

    return category_count, factorval

In [23]:
## Keep each column's category -> code mapping so codes can be decoded later
references = {}

# Columns that still hold Python objects (strings) are the ones to encode.
object_columns = [name for name in cluster_df.columns if cluster_df[name].dtype == 'object']

for name in object_columns:
    mapping, codes = categorical_transform(cluster_df[name])
    references[name] = mapping
    cluster_df[name] = codes

In [12]:
# List the column dtypes again; any column still reported as object has not
# been converted to numeric codes.
cluster_df.dtypes

purchase_make                   object
purchase_model                  object
purchase_vehicle_year            int64
purchase_price_LB              float64
purchase_price_UB              float64
trade_in                         int64
vehicle_financing                int64
customer_income                 object
customer_gender                 object
customer_previous_purchase       int64
customer_distance_to_dealer    float64
vehicle_warranty_used            int64
subsequent_purchases             int64
AgeBin                          object
dtype: object

In [32]:
# Display the frame to confirm the categorical columns now hold numeric codes.
cluster_df

Unnamed: 0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin
1,20,90,2007,15001.0,20000.0,0,0,40,10,1,19.0,0,0,35
2,50,25,2010,25001.0,30000.0,1,1,20,10,1,21.0,0,0,20
5,30,75,2009,15001.0,20000.0,1,1,20,0,0,2.0,1,1,25
8,20,50,2010,15001.0,20000.0,1,1,25,0,1,14.0,1,0,20
10,30,45,2009,10001.0,15000.0,0,1,30,0,0,,0,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355854,20,100,2013,20001.0,25000.0,1,1,45,5,1,4.0,0,2,30
355860,45,95,2011,20001.0,25000.0,1,0,35,5,0,7.0,0,1,20
355864,35,80,2007,15001.0,20000.0,0,1,30,10,0,12.0,0,0,25
355866,25,35,2011,20001.0,25000.0,1,1,25,10,1,3.0,0,0,35


In [35]:
from sklearn.preprocessing import StandardScaler

# Leave out any previously attached 'Labels' column so that re-running this
# cell after the clustering cells does not feed old cluster ids back in as a
# feature.
feature_df = cluster_df.drop(columns='Labels', errors='ignore')

# NOTE(review): [:, 1:] skips the first column. Per the dtypes listing that
# column is purchase_make (the frame has no id column) - confirm that leaving
# the make out of the clustering is intended.
X = feature_df.values[:, 1:]
# Missing values (e.g. in customer_distance_to_dealer) are replaced by 0
# before scaling.
X = np.nan_to_num(X)
# Standardise every feature to zero mean and unit variance for k-means.
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[ 0.46550396, -1.30434939, -0.09561966, ..., -0.5351671 ,
        -0.44335392,  1.27463107],
       [-1.39124196,  0.0304836 ,  1.60397292, ..., -0.5351671 ,
        -0.44335392, -0.86425427],
       [ 0.03702413, -0.41446073, -0.09561966, ...,  1.86857526,
         0.39398434, -0.15129249],
       ...,
       [ 0.17985074, -1.30434939, -0.09561966, ..., -0.5351671 ,
        -0.44335392, -0.15129249],
       [-1.10558875,  0.47542793,  0.75417663, ..., -0.5351671 ,
        -0.44335392,  1.27463107],
       [-0.24862909,  0.92037227, -0.09561966, ..., -0.5351671 ,
        -0.44335392,  0.56166929]])

Let's fit the clustering model.

In [36]:
num_clusters = 5
# Fixed seed: k-means++ initialisation is random, so without it the cluster ids
# and assignments change from run to run.
random_seed = 42

k_means = KMeans(
    init="k-means++",
    n_clusters=num_clusters,
    n_init=50,  # number of restarts; the best run is kept
    algorithm="elkan",
    random_state=random_seed,
)
k_means.fit(cluster_dataset)
# One cluster id per row of cluster_dataset.
labels = k_means.labels_

print(labels)

[4 0 1 ... 2 0 2]


In [37]:
# Store each row's fitted cluster id alongside its features and preview the result.
cluster_df["Labels"] = labels
cluster_df.head()

Unnamed: 0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin,Labels
1,20,90,2007,15001.0,20000.0,0,0,40,10,1,19.0,0,0,35,4
2,50,25,2010,25001.0,30000.0,1,1,20,10,1,21.0,0,0,20,0
5,30,75,2009,15001.0,20000.0,1,1,20,0,0,2.0,1,1,25,1
8,20,50,2010,15001.0,20000.0,1,1,25,0,1,14.0,1,0,20,1
10,30,45,2009,10001.0,15000.0,0,1,30,0,0,,0,1,15,2


In [57]:
# Summarise every cluster by the most frequent value of each column.
# The aggregation is easy to swap out - open question: is the mode the best summary?
cluster_df.groupby('Labels').agg(lambda values: values.mode()[0])

Unnamed: 0_level_0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,20,90,2011,20001.0,25000.0,1,1,25,5,0,6.0,0,0,20
1,30,75,2010,10001.0,15000.0,0,1,30,5,0,6.0,1,0,25
2,30,75,2010,15001.0,20000.0,0,1,30,5,0,5.0,0,0,25
3,20,55,2012,15001.0,20000.0,1,1,25,5,0,2392.0,0,0,25
4,25,75,2010,15001.0,20000.0,0,0,35,5,0,5.0,0,0,20


#### Questions To Ask:
    How many clusters do we actually want?
    
    Are certain columns negatively influencing or contradicting one another?
        e.g. cluster one's typical make is a Ford, yet its typical model is the Nissan Altima
    

In [48]:
# Inspect the purchase_make column after encoding.
cluster_df['purchase_make']

1         20
2         50
5         30
8         20
10        30
          ..
355854    20
355860    45
355864    35
355866    25
355873    15
Name: purchase_make, Length: 137788, dtype: int64

In [39]:
# Look up the category -> code mapping recorded for purchase_model.
references['purchase_model']

{'328': 25,
 'ACCORD': 80,
 'ALTIMA': 75,
 'AVENGER': 125,
 'C300': 20,
 'CAMRY': 70,
 'CIVIC': 60,
 'COROLLA': 85,
 'CR-V': 130,
 'E350': 145,
 'EDGE': 120,
 'ESCAPE': 50,
 'F150': 90,
 'FOCUS': 100,
 'G37': 135,
 'GRAND CHEROKEE': 5,
 'IMPALA': 55,
 'MALIBU': 65,
 'MUSTANG': 105,
 'PRIUS': 15,
 'RAM 1500': 30,
 'RAV4': 35,
 'ROGUE': 110,
 'SENTRA': 45,
 'SILVERADO 1500': 115,
 'SONATA': 95,
 'TUNDRA': 140,
 'VERSA': 40,
 'WRANGLER': 10,
 'X5': 0}

## Raw Work / Testing 

In [10]:
classes = cluster_df['AgeBin']
# With both return_inverse and return_counts set, np.unique returns three
# arrays; unpacking them into only two names raises ValueError.
classnames, factorval, class_counts = np.unique(classes, return_inverse = True,return_counts = True)
# Spread the per-row category indices out in steps of 5.
factorval=[val*5 for val in factorval]

In [91]:
# Distinct values, per-row index into them, and per-value counts for purchase_model.
# NOTE(review): when the notebook is run top to bottom this column has already
# been replaced by integer codes above; the outputs recorded below (string keys)
# appear to come from the raw column - confirm the intended execution order.
classnames, factorval, rank = np.unique(cluster_df['purchase_model'], return_inverse = True,return_counts = True)

In [92]:
# Per-row index into classnames, as returned by np.unique.
factorval

array([12,  0,  2, ...,  1, 21, 17], dtype=int64)

In [93]:
# Sorted copy of the counts (rank itself is left untouched); show every other
# element, starting with the smallest.
temp = np.sort(rank)
temp[::2]

array([ 916, 2108, 2776, 3093, 3552, 3660, 3937, 4076, 4331, 4496, 4846,
       5429, 6123, 7100, 9462], dtype=int64)

In [94]:
# The counts not picked by temp[::2], largest first.
# Assumes temp still holds the ascending-sorted counts from the cell above.
temp[~np.isin(temp,temp[::2])][::-1]

array([11560,  7771,  6270,  5934,  5163,  4636,  4382,  4177,  3951,
        3762,  3601,  3478,  2934,  2393,  1871], dtype=int64)

In [95]:
# Rebuild from rank instead of from temp itself: the original temp = f(temp)
# form produced a different ordering every time the cell was re-run.
sorted_counts = np.sort(rank)
rising = sorted_counts[::2]
# Remaining counts, largest first, so the biggest counts sit mid-sequence.
falling = sorted_counts[~np.isin(sorted_counts, rising)][::-1]
temp = np.concatenate((rising, falling))
temp

array([  916,  2108,  2776,  3093,  3552,  3660,  3937,  4076,  4331,
        4496,  4846,  5429,  6123,  7100,  9462, 11560,  7771,  6270,
        5934,  5163,  4636,  4382,  4177,  3951,  3762,  3601,  3478,
        2934,  2393,  1871], dtype=int64)

In [98]:
# Pair every distinct category with its occurrence count.
category_count = dict(zip(classnames, rank))

In [107]:
# Inspect the element at position 15 of the rearranged counts.
temp[15]

11560

In [105]:
# One code per position of temp: 0, 5, 10, ...
vals = list(range(0, 5 * len(temp), 5))
vals

[0,
 5,
 10,
 15,
 20,
 25,
 30,
 35,
 40,
 45,
 50,
 55,
 60,
 65,
 70,
 75,
 80,
 85,
 90,
 95,
 100,
 105,
 110,
 115,
 120,
 125,
 130,
 135,
 140,
 145]

In [121]:
# Replace every category's count with the code of that count's position in temp.
# Caveat: the dict is rewritten while it is still being compared against counts,
# so a code written earlier is matched again if it equals a later temp[i], and
# categories with equal counts all receive the same code.
for i in range(len(temp)):
    for key, value in category_count.items():
        # value is a raw count until this key has been remapped, a code afterwards
        if value == temp[i]:
            category_count[key] = vals[i]

In [124]:
# Show the mapping now that the counts have been replaced by codes.
category_count

{'328': 25,
 'ACCORD': 80,
 'ALTIMA': 75,
 'AVENGER': 125,
 'C300': 20,
 'CAMRY': 70,
 'CIVIC': 60,
 'COROLLA': 85,
 'CR-V': 130,
 'E350': 145,
 'EDGE': 120,
 'ESCAPE': 50,
 'F150': 90,
 'FOCUS': 100,
 'G37': 135,
 'GRAND CHEROKEE': 5,
 'IMPALA': 55,
 'MALIBU': 65,
 'MUSTANG': 105,
 'PRIUS': 15,
 'RAM 1500': 30,
 'RAV4': 35,
 'ROGUE': 110,
 'SENTRA': 45,
 'SILVERADO 1500': 115,
 'SONATA': 95,
 'TUNDRA': 140,
 'VERSA': 40,
 'WRANGLER': 10,
 'X5': 0}

In [112]:
# Spot-check a single entry.
# NOTE(review): the recorded output (916) looks like a raw count, which suggests
# this cell was executed before the remap loop replaced counts with codes.
category_count['X5']

916

In [144]:
# Write each row's code into factorval by scanning the mapping for its model.
check_list = list(cluster_df['purchase_model'])

for position, model in enumerate(check_list):
    for name, code in category_count.items():
        if model == name:
            factorval[position] = code
            break

In [145]:
# Per-row values after the lookup loop above has overwritten the matching entries.
factorval

array([90, 25, 75, ..., 80, 35, 65], dtype=int64)

In [29]:
# Display the frame once more to compare against the scratch results.
cluster_df

Unnamed: 0,purchase_make,purchase_model,purchase_vehicle_year,purchase_price_LB,purchase_price_UB,trade_in,vehicle_financing,customer_income,customer_gender,customer_previous_purchase,customer_distance_to_dealer,vehicle_warranty_used,subsequent_purchases,AgeBin
1,20,90,2007,15001.0,20000.0,0,0,40,10,1,19.0,0,0,35
2,50,25,2010,25001.0,30000.0,1,1,20,10,1,21.0,0,0,20
5,30,75,2009,15001.0,20000.0,1,1,20,0,0,2.0,1,1,25
8,20,50,2010,15001.0,20000.0,1,1,25,0,1,14.0,1,0,20
10,30,45,2009,10001.0,15000.0,0,1,30,0,0,,0,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355854,20,100,2013,20001.0,25000.0,1,1,45,5,1,4.0,0,2,30
355860,45,95,2011,20001.0,25000.0,1,0,35,5,0,7.0,0,1,20
355864,35,80,2007,15001.0,20000.0,0,1,30,10,0,12.0,0,0,25
355866,25,35,2011,20001.0,25000.0,1,1,25,10,1,3.0,0,0,35
