In [12]:
'''Load Packages'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import warnings
from sklearn.cluster import KMeans 
import random
from sklearn import metrics 
from scipy.spatial.distance import cdist
from kmodes.kmodes import KModes

### Utilities

In [26]:
def top10cluster(cluster_name,
                 class_name,
                 p_make,
                 p_model,
                 t_in,
                 vehicleFinance,
                 cust_income,
                 gender,
                 sub_purch,
                 Age,
                 madeIn,
                 purch_price):
    '''
    Return the ten most common make/model pairs in a customer's predicted cluster.

    The customer profile is scored with the fitted model stored under
    ``ref[class_name]`` (``ref`` is a notebook-level dict looked up at call
    time). The rows of ``cluster_name`` that carry the predicted label are then
    grouped by (purchase_make, purchase_model) and ranked by group frequency.

    Parameters
    ----------
    cluster_name : DataFrame
        Labelled frame for one vehicle class. Must contain the columns
        ``cluster_label``, ``purchase_make``, ``purchase_model`` and ``trade_in``.
    class_name : hashable
        Key into ``ref`` selecting the model used for the prediction.
    p_make, p_model, t_in, vehicleFinance, cust_income, gender, sub_purch, Age, madeIn, purch_price
        The ten feature values of the profile, handed to ``predict`` as a
        single row in exactly this order.

    Returns
    -------
    DataFrame
        Columns ``purchase_make`` and ``purchase_model``, at most ten rows,
        most frequent pair first.
    '''
    # predict() takes a 2-D batch; wrap the single profile in an outer list.
    profile = [[p_make, p_model, t_in, vehicleFinance, cust_income, gender,
                sub_purch, Age, madeIn, purch_price]]
    cluster_number = ref[class_name].predict(profile)[0]

    members = cluster_name[cluster_name.cluster_label == cluster_number]
    pair_counts = members.groupby(['purchase_make', 'purchase_model']).count()

    # The ranking key is the per-pair count of non-null 'trade_in' values.
    ranked = pair_counts.sort_values(by='trade_in', ascending=False).reset_index()
    return ranked[['purchase_make', 'purchase_model']].head(10)
def highlight_max(s):
    '''
    Build per-cell CSS that gives the maximum of a Series a green background.

    Returns a list with one entry per element of ``s``:
    'background-color: green' where the element equals ``s.max()`` (every tied
    maximum is marked) and an empty string everywhere else.
    '''
    style_for = {True: 'background-color: green', False: ''}
    at_peak = s == s.max()
    return [style_for[bool(flag)] for flag in at_peak]


In [13]:
# Load the two input tables from the working directory.
# carmax_df is the main table; new_df supplies the columns that the next cell
# copies into carmax_df.
carmax_df = pd.read_csv('CaseDataRemodeled.csv')
new_df = pd.read_csv('newdata.csv')

In [14]:
# Copy columns from new_df into carmax_df. Series assignment aligns on the row
# index, so both frames are expected to share the same index.
# Mapping: destination column in carmax_df -> source column in new_df.
replacement_columns = {
    'job_assign': 'job_assign',
    'customer_income': 'new_income',
    'customer_age': 'new_ages',
    'purchase_price': 'purchase_price',
}
for target_col, source_col in replacement_columns.items():
    carmax_df[target_col] = new_df[source_col]

In [15]:
# Columns that are not used as clustering features. 'purchase_class' is listed
# here too; the following cell adds it back from carmax_df.
excluded_columns = [
    'customer_age', 'customer_previous_purchase', 'customer_distance_to_dealer',
    'post_purchase_satisfaction', 'vehicle_warranty_used', 'purchase_price_LB',
    'purchase_price_UB', 'purchase_class', 'job_assign', 'purchase_vehicle_year',
]
cluster_df = carmax_df.copy().drop(columns=excluded_columns)

In [16]:
# Add purchase_class back from carmax_df; as a newly assigned column it ends up
# last in cluster_df. The per-class split further down filters on it.
cluster_df['purchase_class'] = carmax_df['purchase_class']
cluster_df

Unnamed: 0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price,purchase_class
0,DODGE,CHARGER,1,1,40001 - 60000,U,1,Twenties,UNITED STATES,15001 - 20000,sedan
1,FORD,F150,0,0,0 - 20000,F,0,Fifties,UNITED STATES,15001 - 20000,truck
2,BMW,328,1,1,60001 - 80000,F,0,Forties,GERMANY,25001 - 30000,sedan
3,LEXUS,GS 300,0,1,20001 - 40000,M,0,Twenties,JAPAN,10001 - 15000,sedan
4,CHEVROLET,CRUZE,1,1,120001 - 140000,M,1,Thirties,UNITED STATES,15001 - 20000,sedan
5,NISSAN,ALTIMA,1,1,60001 - 80000,U,1,Twenties,JAPAN,15001 - 20000,sedan
6,NISSAN,FRONTIER,0,1,20001 - 40000,M,1,Twenties,JAPAN,20001 - 25000,truck
7,NISSAN,PATHFINDER,1,1,80001 - 100000,M,0,Sixties,JAPAN,35001 - 40000,suv
8,FORD,ESCAPE,1,1,40001 - 60000,U,0,Forties,UNITED STATES,15001 - 20000,suv
9,FORD,FUSION,0,1,140001 - 160000,M,1,Fifties,UNITED STATES,15001 - 20000,sedan


In [18]:
def _rows_for_class(class_label):
    '''
    Return an independent copy of the cluster_df rows whose purchase_class
    equals ``class_label``, re-indexed from 0 and without the purchase_class
    column.
    '''
    selected = cluster_df[cluster_df.purchase_class == class_label].copy()
    return selected.reset_index(drop=True).drop('purchase_class', axis=1)


# One feature frame per vehicle class.
sedan = _rows_for_class('sedan')
truck = _rows_for_class('truck')
suv = _rows_for_class('suv')
wagon = _rows_for_class('wagon')
van = _rows_for_class('van')
coupe = _rows_for_class('coupe')
convertible = _rows_for_class('convertible')

vehicle_classes = [sedan, truck, suv, wagon, van, coupe, convertible]

### Go through each vehicle class and run a KModes clustering algorithm on it

Store the resulting KModes object and add the labels to the corresponding dataframe

In [19]:
# ref maps a vehicle-class name to the KModes model fitted on that class.
ref = {}
vehicle_classes_name = ['sedan','truck','suv','wagon','van','coupe','convertible']

# The names are paired with the frames by position, so the lists must line up.
assert len(vehicle_classes_name) == len(vehicle_classes)

for class_label, group in zip(vehicle_classes_name, vehicle_classes):
    # Fit on the feature columns only. This cell writes 'cluster_label' into
    # each frame below, so on a re-run the column already exists; without
    # dropping it the previous labels would be clustered as an extra feature.
    features = group.drop(columns='cluster_label', errors='ignore')

    # Fixed random_state keeps the cluster assignment reproducible.
    k_modes = KModes(n_clusters=7, init='Huang', n_init=5, verbose=0, random_state=2013)
    ref[class_label] = k_modes.fit(features.values)

    # Attach each row's cluster id to the class frame (mutates it in place).
    labels = k_modes.labels_
    group["cluster_label"] = labels

In [21]:
# Profile each sedan cluster by the most frequent value of every column
# (when several values tie, the first mode returned is taken).
sedan.groupby('cluster_label').agg(lambda column: column.mode().iloc[0])

Unnamed: 0_level_0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,FORD,FUSION,1,0,20001 - 40000,M,0,Thirties,UNITED STATES,15001 - 20000
1,NISSAN,ALTIMA,1,1,20001 - 40000,F,0,Twenties,JAPAN,10001 - 15000
2,CHEVROLET,MALIBU,0,1,20001 - 40000,M,0,Twenties,UNITED STATES,15001 - 20000
3,TOYOTA,CAMRY,0,1,60001 - 80000,F,0,Thirties,JAPAN,15001 - 20000
4,NISSAN,ALTIMA,0,1,40001 - 60000,M,0,Thirties,JAPAN,15001 - 20000
5,TOYOTA,COROLLA,0,1,20001 - 40000,M,0,Thirties,JAPAN,10001 - 15000
6,BMW,328,0,1,0 - 20000,M,0,Forties,GERMANY,10001 - 15000


In [9]:
# Profile each truck cluster by the most frequent value of every column
# (when several values tie, the first mode returned is taken).
truck.groupby('cluster_label').agg(lambda column: column.mode().iloc[0])

Unnamed: 0_level_0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,CHEVROLET,SILVERADO 1500,0,0,40001 - 60000,M,0,Forties,UNITED STATES,10001 - 15000
1,FORD,F150,0,1,60001 - 80000,M,0,Fifties,UNITED STATES,15001 - 20000
2,CHEVROLET,SILVERADO 1500,1,1,60001 - 80000,M,0,Twenties,UNITED STATES,15001 - 20000
3,TOYOTA,TACOMA,1,1,40001 - 60000,M,0,Forties,JAPAN,20001 - 25000
4,DODGE,RAM 1500,1,1,20001 - 40000,M,0,Twenties,UNITED STATES,20001 - 25000
5,GMC,SIERRA 1500,1,1,40001 - 60000,M,0,Fifties,UNITED STATES,25001 - 30000
6,FORD,F150,1,1,20001 - 40000,M,0,Forties,UNITED STATES,10001 - 15000


In [10]:
# Predict the cluster of one hand-written profile (ten features, same column
# order as the training frames).
# NOTE(review): `k_modes` is the loop variable of the fitting cell, so on a
# top-to-bottom run it is the model fitted last (the final entry of
# vehicle_classes), not necessarily the class displayed just above. If a
# specific class is intended, use ref['<class>'] instead - confirm.
# NOTE(review): the empty strings presumably stand for "not specified"
# (make, model, made_in) - confirm how the model treats unseen values.
k_modes.predict([['','','0','0','40001 - 60000','M','0','Forties','','25001 - 30000']])

array([3], dtype=uint16)

In [11]:
# Show the centroids of the fitted SUV model: one row of feature values per cluster.
ref['suv'].cluster_centroids_

array([['JEEP', 'PATRIOT', '0', '1', '40001 - 60000', 'F', '0',
        'Twenties', 'UNITED STATES', '15001 - 20000'],
       ['FORD', 'ESCAPE', '0', '1', '20001 - 40000', 'M', '0', 'Fifties',
        'UNITED STATES', '15001 - 20000'],
       ['MERCEDES-BENZ', 'ML350', '1', '1', '20001 - 40000', 'F', '0',
        'Thirties', 'GERMANY', '20001 - 25000'],
       ['HONDA', 'CR-V', '0', '1', '60001 - 80000', 'F', '0', 'Fifties',
        'JAPAN', '20001 - 25000'],
       ['CHEVROLET', 'EQUINOX', '1', '1', '40001 - 60000', 'F', '0',
        'Forties', 'UNITED STATES', '10001 - 15000'],
       ['TOYOTA', 'RAV4', '1', '1', '40001 - 60000', 'M', '1',
        'Thirties', 'JAPAN', '15001 - 20000'],
       ['NISSAN', 'ROGUE', '1', '1', '60001 - 80000', 'M', '0',
        'Thirties', 'JAPAN', '20001 - 25000']], dtype='<U21')

In [38]:
# Preview the first rows of the sedan frame, including the cluster_label column.
sedan.head()

Unnamed: 0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price,cluster_label
0,DODGE,CHARGER,1,1,40001 - 60000,U,1,Twenties,UNITED STATES,15001 - 20000,2
1,BMW,328,1,1,60001 - 80000,F,0,Forties,GERMANY,25001 - 30000,6
2,LEXUS,GS 300,0,1,20001 - 40000,M,0,Twenties,JAPAN,10001 - 15000,5
3,CHEVROLET,CRUZE,1,1,120001 - 140000,M,1,Thirties,UNITED STATES,15001 - 20000,0
4,NISSAN,ALTIMA,1,1,60001 - 80000,U,1,Twenties,JAPAN,15001 - 20000,1


In [36]:
from sklearn.datasets import load_iris
from sklearn import tree

In [37]:
# Decision-tree classifier with a fixed seed.
# NOTE(review): `clf` is not referenced by any of the cells visible here -
# confirm whether it is still needed.
clf = tree.DecisionTreeClassifier(random_state=0)

In [39]:
! pip install apyori

Collecting apyori
  Downloading https://files.pythonhosted.org/packages/5e/62/5ffde5c473ea4b033490617ec5caa80d59804875ad3c3c57c0976533a21a/apyori-1.1.2.tar.gz
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/musaup/.cache/pip/wheels/5d/92/bb/474bbadbc8c0062b9eb168f69982a0443263f8ab1711a8cad0
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [41]:
from apyori import apriori

In [50]:
# List the column names available on the sedan frame.
sedan.columns

Index(['purchase_make', 'purchase_model', 'trade_in', 'vehicle_financing',
       'customer_income', 'customer_gender', 'subsequent_purchases', 'AgeBin',
       'made_in', 'purchase_price', 'cluster_label'],
      dtype='object')

In [99]:
# Display the sedan frame with its cluster labels.
# NOTE(review): a bare frame renders the whole table; sedan.head(10) may be
# enough here - confirm.
sedan

Unnamed: 0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price,cluster_label
0,DODGE,CHARGER,1,1,40001 - 60000,U,1,Twenties,UNITED STATES,15001 - 20000,2
1,BMW,328,1,1,60001 - 80000,F,0,Forties,GERMANY,25001 - 30000,6
2,LEXUS,GS 300,0,1,20001 - 40000,M,0,Twenties,JAPAN,10001 - 15000,5
3,CHEVROLET,CRUZE,1,1,120001 - 140000,M,1,Thirties,UNITED STATES,15001 - 20000,0
4,NISSAN,ALTIMA,1,1,60001 - 80000,U,1,Twenties,JAPAN,15001 - 20000,1
5,FORD,FUSION,0,1,140001 - 160000,M,1,Fifties,UNITED STATES,15001 - 20000,0
6,NISSAN,SENTRA,0,1,20001 - 40000,U,1,Sixties,JAPAN,10001 - 15000,1
7,NISSAN,VERSA,0,0,140001 - 160000,F,1,UnderTwenty,JAPAN,10001 - 15000,1
8,KIA,FORTE,1,0,40001 - 60000,M,0,Sixties,SOUTH KOREA,10001 - 15000,0
9,TOYOTA,COROLLA,1,1,60001 - 80000,F,0,Thirties,JAPAN,15001 - 20000,3


In [105]:
# Build one transaction per sedan row: a tuple holding the string form of the
# selected attributes. Rows are addressed by label 0..n-1.
sedan_item_columns = ['purchase_make',
                      'customer_income', 'customer_gender', 'AgeBin', 'purchase_price', 'cluster_label']
trans = [
    tuple(sedan.loc[row_id][sedan_item_columns].to_numpy().astype(str))
    for row_id in range(sedan.shape[0])
]

### Association Mining Sedans

In [106]:
# This import rebinds `apriori`, replacing the one imported from apyori earlier.
from efficient_apriori import apriori

# Mine rules over the sedan transactions and print each one on its own line.
itemsets, rules = apriori(trans, min_support=0.05, min_confidence=0.7)
for rule in rules:
    print(rule)

{5} -> {10001 - 15000} (conf: 0.884, supp: 0.064, lift: 2.309, conv: 5.306)
{1} -> {F} (conf: 0.702, supp: 0.179, lift: 1.740, conv: 2.002)
{CHEVROLET} -> {2} (conf: 0.831, supp: 0.098, lift: 4.104, conv: 4.711)
{4} -> {M} (conf: 0.777, supp: 0.090, lift: 1.611, conv: 2.322)
{10001 - 15000, NISSAN} -> {1} (conf: 0.782, supp: 0.056, lift: 3.060, conv: 3.413)
{1, 10001 - 15000} -> {F} (conf: 0.716, supp: 0.104, lift: 1.775, conv: 2.101)
{F, NISSAN} -> {1} (conf: 0.802, supp: 0.060, lift: 3.140, conv: 3.767)
{1, 20001 - 40000} -> {F} (conf: 0.714, supp: 0.081, lift: 1.770, conv: 2.086)
{10001 - 15000, CHEVROLET} -> {2} (conf: 0.791, supp: 0.054, lift: 3.907, conv: 3.813)
{10001 - 15000, 2} -> {CHEVROLET} (conf: 0.715, supp: 0.054, lift: 6.063, conv: 3.097)
{CHEVROLET, M} -> {2} (conf: 0.900, supp: 0.050, lift: 4.446, conv: 7.967)
{1, 10001 - 15000, 20001 - 40000} -> {F} (conf: 0.715, supp: 0.053, lift: 1.774, conv: 2.096)


In [104]:
# Most frequent value of every column within each sedan cluster, shown again
# next to the mined rules (when several values tie, the first mode is taken).
sedan.groupby('cluster_label').agg(lambda column: column.mode().iloc[0])

Unnamed: 0_level_0,purchase_make,purchase_model,trade_in,vehicle_financing,customer_income,customer_gender,subsequent_purchases,AgeBin,made_in,purchase_price
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,FORD,FUSION,1,0,20001 - 40000,M,0,Thirties,UNITED STATES,15001 - 20000
1,NISSAN,ALTIMA,1,1,20001 - 40000,F,0,Twenties,JAPAN,10001 - 15000
2,CHEVROLET,MALIBU,0,1,20001 - 40000,M,0,Twenties,UNITED STATES,15001 - 20000
3,TOYOTA,CAMRY,0,1,60001 - 80000,F,0,Thirties,JAPAN,15001 - 20000
4,NISSAN,ALTIMA,0,1,40001 - 60000,M,0,Thirties,JAPAN,15001 - 20000
5,TOYOTA,COROLLA,0,1,20001 - 40000,M,0,Thirties,JAPAN,10001 - 15000
6,BMW,328,0,1,0 - 20000,M,0,Forties,GERMANY,10001 - 15000


In [108]:

# Build one transaction per SUV row: a tuple holding the string form of the
# selected attributes. The list is created here so the cell runs on a fresh
# kernel and does not accumulate duplicates when re-run.
suv_trans = []
for i in range(suv.shape[0]):
    suv_trans.append(tuple(suv.loc[i][['purchase_make',
       'customer_income', 'customer_gender','AgeBin','purchase_price', 'cluster_label']].to_numpy().astype(str)))

### Association Rules Suv

In [109]:
from efficient_apriori import apriori

# Mine rules over the SUV transactions and print each one on its own line.
itemsets, rules = apriori(suv_trans, min_support=0.05, min_confidence=0.7)
for rule in rules:
    print(rule)

{6} -> {M} (conf: 0.724, supp: 0.074, lift: 1.514, conv: 1.890)
{2} -> {F} (conf: 0.714, supp: 0.066, lift: 1.668, conv: 1.998)
{1} -> {M} (conf: 0.799, supp: 0.176, lift: 1.672, conv: 2.598)
{3} -> {F} (conf: 0.704, supp: 0.079, lift: 1.645, conv: 1.931)
{1, 15001 - 20000} -> {M} (conf: 0.729, supp: 0.062, lift: 1.526, conv: 1.927)
{FORD, M} -> {1} (conf: 0.874, supp: 0.069, lift: 3.968, conv: 6.201)
{1, Fifties} -> {M} (conf: 0.729, supp: 0.052, lift: 1.526, conv: 1.927)


### All of the Categories

In [146]:
# Mine association rules separately for every vehicle class.
vehicle_classes = [sedan, truck, suv, wagon, van, coupe, convertible]
class_item_columns = ['purchase_make',
                      'customer_income', 'customer_gender', 'AgeBin', 'purchase_price', 'cluster_label']

# rule_gens[k] holds the rules for vehicle_classes[k].
rule_gens = []
for class_df in vehicle_classes:
    # One transaction per row: the selected attributes as strings.
    trans = [
        tuple(class_df.loc[row_id][class_item_columns].to_numpy().astype(str))
        for row_id in range(class_df.shape[0])
    ]
    itemsets, rules = apriori(trans, min_support=0.10, min_confidence=0.7)
    rule_gens.append(list(rules))


In [147]:
# Show the mined rules: one list per entry of vehicle_classes, in the same order.
rule_gens

[[{1} -> {F}, {1, 10001 - 15000} -> {F}],
 [{20001 - 25000} -> {M},
  {20001 - 40000} -> {M},
  {4} -> {M},
  {Twenties} -> {M},
  {3} -> {M},
  {TOYOTA} -> {3},
  {3} -> {TOYOTA},
  {60001 - 80000} -> {M},
  {TOYOTA} -> {M},
  {Thirties} -> {M},
  {CHEVROLET} -> {M},
  {Fifties} -> {M},
  {10001 - 15000} -> {M},
  {4} -> {DODGE},
  {40001 - 60000} -> {M},
  {DODGE} -> {M},
  {25001 - 30000} -> {M},
  {FORD} -> {M},
  {1} -> {M},
  {15001 - 20000} -> {M},
  {0} -> {M},
  {Forties} -> {M},
  {M, TOYOTA} -> {3},
  {3, TOYOTA} -> {M},
  {3, M} -> {TOYOTA},
  {TOYOTA} -> {3, M},
  {1, FORD} -> {M}],
 [{1} -> {M}],
 [],
 [{6} -> {20001 - 25000},
  {6} -> {CHRYSLER},
  {DODGE} -> {1},
  {6} -> {M},
  {15001 - 20000, DODGE} -> {1},
  {DODGE, M} -> {1}],
 [{15001 - 20000} -> {HONDA},
  {10001 - 15000} -> {HONDA},
  {2} -> {HONDA},
  {1} -> {HONDA},
  {MERCEDES-BENZ} -> {0},
  {0} -> {MERCEDES-BENZ},
  {3} -> {10001 - 15000},
  {3} -> {HONDA},
  {15001 - 20000, F} -> {HONDA},
  {10001 - 15000, 