In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

## Load Brand CSV File and Merge to Customer Master

In [22]:
# Read the customer master file. ZipCode is read as a string column
# (not parsed as a number) so it can later be joined on as text.
file_path = "data/Customers.csv"
df_customers = pd.read_csv(
    filepath_or_buffer=file_path,
    dtype={"ZipCode": "string"},
)
df_customers

Unnamed: 0,CustomerID,ZipCode,State
0,10012,49649,MI
1,24012,68713,NE
2,36012,54460,WI
3,48012,91331,CA
4,49012,52070,IA
...,...,...,...
141456,375213012,68701,NE
141457,375214012,70094,LA
141458,375215012,77990,TX
141459,375216012,42437,KY


In [23]:
# Read the per-customer brand totals file.
file_path = "data/Customer_brands.csv"
df_customer_brands = pd.read_csv(filepath_or_buffer=file_path)
df_customer_brands

Unnamed: 0,CustomerID,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,Other-Brand_Tot,Polaris_Tot,Suzuki_Tot,Yamaha_Tot
0,10012,0.0,0.0,121.98,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00
1,24012,0.0,0.0,590.97,139.99,0.00,0.00,0.0,0.00,402.68,0.00,18.93
2,36012,0.0,0.0,0.00,0.00,0.00,124.99,0.0,19.99,123.85,0.00,0.00
3,48012,0.0,0.0,499.93,1595.25,49.99,669.89,0.0,0.00,2629.71,44.99,1525.10
4,49012,0.0,0.0,0.00,0.00,0.00,655.94,0.0,0.00,1664.53,199.99,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
141456,375213012,0.0,0.0,0.00,0.00,0.00,0.00,0.0,25.00,0.00,0.00,0.00
141457,375214012,0.0,0.0,0.00,60.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00
141458,375215012,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,16.99,0.00
141459,375216012,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,24.94,0.00,0.00


In [24]:
# Attach the brand totals to the customer master, matching rows on CustomerID.
# The join stays an inner join (the pandas default). validate="one_to_one"
# makes pandas raise a MergeError if CustomerID is duplicated on either side,
# instead of silently multiplying customer rows.
df_customer_data = pd.merge(
    df_customers,
    df_customer_brands,
    on="CustomerID",
    how="inner",
    validate="one_to_one",
)
df_customer_data

Unnamed: 0,CustomerID,ZipCode,State,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,Other-Brand_Tot,Polaris_Tot,Suzuki_Tot,Yamaha_Tot
0,10012,49649,MI,0.0,0.0,121.98,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00
1,24012,68713,NE,0.0,0.0,590.97,139.99,0.00,0.00,0.0,0.00,402.68,0.00,18.93
2,36012,54460,WI,0.0,0.0,0.00,0.00,0.00,124.99,0.0,19.99,123.85,0.00,0.00
3,48012,91331,CA,0.0,0.0,499.93,1595.25,49.99,669.89,0.0,0.00,2629.71,44.99,1525.10
4,49012,52070,IA,0.0,0.0,0.00,0.00,0.00,655.94,0.0,0.00,1664.53,199.99,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141456,375213012,68701,NE,0.0,0.0,0.00,0.00,0.00,0.00,0.0,25.00,0.00,0.00,0.00
141457,375214012,70094,LA,0.0,0.0,0.00,60.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00
141458,375215012,77990,TX,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,16.99,0.00
141459,375216012,42437,KY,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,24.94,0.00,0.00


## Load LOB CSV File and Merge to Customer Master

In [25]:
# Read the per-customer line-of-business (LOB) totals file.
file_path = "data/Customer_lob.csv"
df_customer_lobs = pd.read_csv(filepath_or_buffer=file_path)
df_customer_lobs

Unnamed: 0,CustomerID,LOB_Misc_Tot,LOB_NOS-Rebuilt-Part_Tot,LOB_Rebuilt-Engines-Tot,LOB_Salvage_Tot
0,10012,0.0,0.00,0.0,121.98
1,24012,0.0,0.00,0.0,1152.57
2,36012,0.0,14.94,0.0,253.89
3,48012,0.0,119.98,0.0,6894.88
4,49012,0.0,49.99,0.0,2470.47
...,...,...,...,...,...
141456,375213012,0.0,0.00,0.0,25.00
141457,375214012,0.0,0.00,0.0,60.00
141458,375215012,0.0,0.00,0.0,16.99
141459,375216012,0.0,24.94,0.0,0.00


In [26]:
# Attach the LOB totals to the running customer frame, matching on CustomerID.
# The join stays an inner join (the pandas default). validate="one_to_one"
# makes pandas raise a MergeError if CustomerID is duplicated on either side,
# instead of silently multiplying customer rows.
df_customer_data = pd.merge(
    df_customer_data,
    df_customer_lobs,
    on="CustomerID",
    how="inner",
    validate="one_to_one",
)
df_customer_data

Unnamed: 0,CustomerID,ZipCode,State,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,Other-Brand_Tot,Polaris_Tot,Suzuki_Tot,Yamaha_Tot,LOB_Misc_Tot,LOB_NOS-Rebuilt-Part_Tot,LOB_Rebuilt-Engines-Tot,LOB_Salvage_Tot
0,10012,49649,MI,0.0,0.0,121.98,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.0,121.98
1,24012,68713,NE,0.0,0.0,590.97,139.99,0.00,0.00,0.0,0.00,402.68,0.00,18.93,0.0,0.00,0.0,1152.57
2,36012,54460,WI,0.0,0.0,0.00,0.00,0.00,124.99,0.0,19.99,123.85,0.00,0.00,0.0,14.94,0.0,253.89
3,48012,91331,CA,0.0,0.0,499.93,1595.25,49.99,669.89,0.0,0.00,2629.71,44.99,1525.10,0.0,119.98,0.0,6894.88
4,49012,52070,IA,0.0,0.0,0.00,0.00,0.00,655.94,0.0,0.00,1664.53,199.99,0.00,0.0,49.99,0.0,2470.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141456,375213012,68701,NE,0.0,0.0,0.00,0.00,0.00,0.00,0.0,25.00,0.00,0.00,0.00,0.0,0.00,0.0,25.00
141457,375214012,70094,LA,0.0,0.0,0.00,60.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.0,60.00
141458,375215012,77990,TX,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,16.99,0.00,0.0,0.00,0.0,16.99
141459,375216012,42437,KY,0.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,24.94,0.00,0.00,0.0,24.94,0.0,0.00


## Load Market CSV File and Merge to Customer Master

In [27]:
# Read the per-customer market totals file.
file_path = "data/Customer_market.csv"
df_customer_market = pd.read_csv(filepath_or_buffer=file_path)
df_customer_market

Unnamed: 0,CustomerID,Mkt-Counter_Tot,Mkt-Other_Tot,Mkt-Website_Tot,Mkt-eBay_Tot
0,10012,0.0,0.0,0.00,121.98
1,24012,0.0,0.0,1152.57,0.00
2,36012,0.0,0.0,0.00,268.83
3,48012,0.0,0.0,0.00,7014.86
4,49012,0.0,0.0,1496.84,1023.62
...,...,...,...,...,...
141456,375213012,0.0,0.0,25.00,0.00
141457,375214012,0.0,0.0,0.00,60.00
141458,375215012,0.0,0.0,0.00,16.99
141459,375216012,0.0,0.0,24.94,0.00


In [28]:
# Attach the market totals to the running customer frame, matching on CustomerID.
# The join stays an inner join (the pandas default). validate="one_to_one"
# makes pandas raise a MergeError if CustomerID is duplicated on either side,
# instead of silently multiplying customer rows.
df_customer_data = pd.merge(
    df_customer_data,
    df_customer_market,
    on="CustomerID",
    how="inner",
    validate="one_to_one",
)
df_customer_data

Unnamed: 0,CustomerID,ZipCode,State,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,...,Suzuki_Tot,Yamaha_Tot,LOB_Misc_Tot,LOB_NOS-Rebuilt-Part_Tot,LOB_Rebuilt-Engines-Tot,LOB_Salvage_Tot,Mkt-Counter_Tot,Mkt-Other_Tot,Mkt-Website_Tot,Mkt-eBay_Tot
0,10012,49649,MI,0.0,0.0,121.98,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.00,0.0,121.98,0.0,0.0,0.00,121.98
1,24012,68713,NE,0.0,0.0,590.97,139.99,0.00,0.00,0.0,...,0.00,18.93,0.0,0.00,0.0,1152.57,0.0,0.0,1152.57,0.00
2,36012,54460,WI,0.0,0.0,0.00,0.00,0.00,124.99,0.0,...,0.00,0.00,0.0,14.94,0.0,253.89,0.0,0.0,0.00,268.83
3,48012,91331,CA,0.0,0.0,499.93,1595.25,49.99,669.89,0.0,...,44.99,1525.10,0.0,119.98,0.0,6894.88,0.0,0.0,0.00,7014.86
4,49012,52070,IA,0.0,0.0,0.00,0.00,0.00,655.94,0.0,...,199.99,0.00,0.0,49.99,0.0,2470.47,0.0,0.0,1496.84,1023.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141456,375213012,68701,NE,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.00,0.0,25.00,0.0,0.0,25.00,0.00
141457,375214012,70094,LA,0.0,0.0,0.00,60.00,0.00,0.00,0.0,...,0.00,0.00,0.0,0.00,0.0,60.00,0.0,0.0,0.00,60.00
141458,375215012,77990,TX,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,16.99,0.00,0.0,0.00,0.0,16.99,0.0,0.0,0.00,16.99
141459,375216012,42437,KY,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,0.00,0.0,24.94,0.0,0.00,0.0,0.0,24.94,0.00


## Load RFM CSV File and Merge to Customer Master

In [29]:
# Read the per-customer RFM (recency / frequency / monetary) file.
file_path = "data/Customer_rfm.csv"
df_customer_rfm = pd.read_csv(filepath_or_buffer=file_path)
df_customer_rfm

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M,RFM Score,Segment
0,10012,636,3,121.98,1,5,4,154,can't loose
1,24012,50,11,1152.57,5,5,5,555,champions
2,36012,48,7,268.83,5,5,5,555,champions
3,48012,15,42,7014.86,5,5,5,555,champions
4,49012,43,14,2460.47,5,5,5,555,champions
...,...,...,...,...,...,...,...,...,...
141456,375213012,1,1,25.00,5,1,2,512,new customers
141457,375214012,1,1,60.00,5,1,3,513,new customers
141458,375215012,1,1,16.99,5,1,1,511,new customers
141459,375216012,1,1,24.94,5,1,2,512,new customers


In [30]:
# Attach the RFM columns to the running customer frame, matching on CustomerID.
# The join stays an inner join (the pandas default). validate="one_to_one"
# makes pandas raise a MergeError if CustomerID is duplicated on either side,
# instead of silently multiplying customer rows.
df_customer_data = pd.merge(
    df_customer_data,
    df_customer_rfm,
    on="CustomerID",
    how="inner",
    validate="one_to_one",
)
df_customer_data

Unnamed: 0,CustomerID,ZipCode,State,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,...,Mkt-Website_Tot,Mkt-eBay_Tot,Recency,Frequency,Monetary,R,F,M,RFM Score,Segment
0,10012,49649,MI,0.0,0.0,121.98,0.00,0.00,0.00,0.0,...,0.00,121.98,636,3,121.98,1,5,4,154,can't loose
1,24012,68713,NE,0.0,0.0,590.97,139.99,0.00,0.00,0.0,...,1152.57,0.00,50,11,1152.57,5,5,5,555,champions
2,36012,54460,WI,0.0,0.0,0.00,0.00,0.00,124.99,0.0,...,0.00,268.83,48,7,268.83,5,5,5,555,champions
3,48012,91331,CA,0.0,0.0,499.93,1595.25,49.99,669.89,0.0,...,0.00,7014.86,15,42,7014.86,5,5,5,555,champions
4,49012,52070,IA,0.0,0.0,0.00,0.00,0.00,655.94,0.0,...,1496.84,1023.62,43,14,2460.47,5,5,5,555,champions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141456,375213012,68701,NE,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,25.00,0.00,1,1,25.00,5,1,2,512,new customers
141457,375214012,70094,LA,0.0,0.0,0.00,60.00,0.00,0.00,0.0,...,0.00,60.00,1,1,60.00,5,1,3,513,new customers
141458,375215012,77990,TX,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,0.00,16.99,1,1,16.99,5,1,1,511,new customers
141459,375216012,42437,KY,0.0,0.0,0.00,0.00,0.00,0.00,0.0,...,24.94,0.00,1,1,24.94,5,1,2,512,new customers


## Load Zip Code Demographics and Merge to Customer Master

In [19]:
# Read the zip code demographics file. ZipCode is read as a string column
# (not parsed as a number), matching how it is read for the customer file.
file_path = "data/zipcode_demo.csv"
df_zipcodes = pd.read_csv(
    filepath_or_buffer=file_path,
    dtype={"ZipCode": "string"},
)
df_zipcodes

Unnamed: 0,ZipCode,lat,lng,city,state_id,state_name,zcta,population,density,county_fips
0,00601,18.18004,-66.75218,Adjuntas,PR,Puerto Rico,True,17242,111.4,72001
1,00602,18.36073,-67.17517,Aguada,PR,Puerto Rico,True,38442,523.5,72003
2,00603,18.45439,-67.12202,Aguadilla,PR,Puerto Rico,True,48814,667.9,72005
3,00606,18.16724,-66.93828,Maricao,PR,Puerto Rico,True,6437,60.4,72093
4,00610,18.29032,-67.12243,Anasco,PR,Puerto Rico,True,27073,312.0,72011
...,...,...,...,...,...,...,...,...,...,...
33094,99923,55.97796,-130.03671,Hyder,AK,Alaska,True,15,2.1,2198
33095,99925,55.55767,-132.97627,Klawock,AK,Alaska,True,927,5.7,2198
33096,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,1635,4.2,2198
33097,99927,56.25100,-133.37571,Point Baker,AK,Alaska,True,38,0.2,2198


In [31]:
# Attach the zip code demographics to each customer, matching on ZipCode.
# Many customers can share one zip code, so validate="many_to_one" makes
# pandas raise a MergeError if a ZipCode appears more than once in
# df_zipcodes, which would otherwise silently duplicate customer rows.
# NOTE: this is an inner join (the pandas default), so customers whose
# ZipCode has no match in df_zipcodes are dropped from the result.
df_customer_data = pd.merge(
    df_customer_data,
    df_zipcodes,
    on="ZipCode",
    how="inner",
    validate="many_to_one",
)
df_customer_data

Unnamed: 0,CustomerID,ZipCode,State,ArticCat_Tot,Bobcat_Tot,Can-Am_Tot,Honda_Tot,John-Deere_Tot,Kawasaki_Tot,Kubota_Tot,...,Segment,lat,lng,city,state_id,state_name,zcta,population,density,county_fips
0,10012,49649,MI,0.00,0.0,121.98,0.00,0.0,0.0,0.0,...,can't loose,44.56776,-85.53483,Kingsley,MI,Michigan,True,7406,28.9,26055
1,240859012,49649,MI,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,hibernating,44.56776,-85.53483,Kingsley,MI,Michigan,True,7406,28.9,26055
2,244631012,49649,MI,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,hibernating,44.56776,-85.53483,Kingsley,MI,Michigan,True,7406,28.9,26055
3,268475012,49649,MI,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,hibernating,44.56776,-85.53483,Kingsley,MI,Michigan,True,7406,28.9,26055
4,290120012,49649,MI,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,hibernating,44.56776,-85.53483,Kingsley,MI,Michigan,True,7406,28.9,26055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140528,375095012,70812,LA,0.00,0.0,0.00,58.99,0.0,0.0,0.0,...,new customers,30.50073,-91.11044,Baton Rouge,LA,Louisiana,True,11503,994.2,22033
140529,375106012,97231,OR,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,new customers,45.67952,-122.82747,Portland,OR,Oregon,True,4019,25.7,41051
140530,375135012,58746,ND,74.94,0.0,0.00,0.00,0.0,0.0,0.0,...,new customers,48.71825,-102.08516,Kenmare,ND,North Dakota,True,1348,1.4,38101
140531,375189012,83272,ID,0.00,0.0,0.00,0.00,0.0,0.0,0.0,...,new customers,42.10497,-111.36387,Saint Charles,ID,Idaho,True,265,0.9,16007


## Write Out Merged Customer Data File
> **TODO:** Additional demographics still need to be merged into this file. The file contains a single row per customer, with roll-up data in columns.

In [32]:
# Save the merged customer frame as CSV, leaving out the DataFrame index.
df_customer_data.to_csv(
    path_or_buf='data/Customer_data_merged.csv',
    index=False,
)