# Customer Segmentation - Step 4: Subclustering at Class Level

In [1]:
# BigQuery dataset that holds the input table read below and receives the
# output tables written at the end of the notebook.
output_dataset = "USER_SXM4690"

# Input table: class-level penetration for the dominating department of each cluster.
cs_class_pen_table = "cs_temp_class_penetration"

# Output tables: the subcluster index is written to both of these names.
cs_output_subcluster_class_table = "cs_output_subcluster_class"
cs_output_subcluster_class_table_copy = "cs_par_pasa_output_subcluster_class"

In [2]:
import google.datalab.bigquery as bq
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_score
from operator import itemgetter

def load_to_BQ_table(table_name, dataframe):
  """Replace a BigQuery table in ``output_dataset`` with the rows of a dataframe.

  Before uploading, missing values are swapped for ``None`` and the column
  names are sanitised: every run of whitespace and every '/', '(' or ')'
  character becomes an underscore. The frame passed in by the caller is not
  modified; the sanitised frame is a new object.

  Args:
    table_name: name of the target table inside ``output_dataset``.
    dataframe: pandas DataFrame to upload.

  Returns:
    ``DataFrame.head()`` of the sanitised frame that was uploaded.
  """
  import re

  # This is the legacy `datalab` client, not the `google.datalab` module
  # imported at notebook level -- presumably needed for `insert_data`
  # (TODO confirm before consolidating the two imports).
  import datalab.bigquery as bq

  # `where` returns a new frame, so renaming its columns below leaves the
  # caller's dataframe untouched.
  dataframe = dataframe.where(pd.notnull(dataframe), None)

  # One explicit regex pass instead of chained `.str.replace` calls, whose
  # regex-vs-literal default differs between pandas versions.
  dataframe.columns = [re.sub(r'\s+|[/()]', '_', str(name)) for name in dataframe.columns]

  bq_table = bq.Table(output_dataset + '.' + table_name)
  bq_table.delete()

  # create schema
  if not bq_table.exists():
    table_schema = bq.Schema.from_dataframe(dataframe)
    bq_table.create(schema = table_schema, overwrite = True)

  # load data
  bq_table.insert_data(dataframe)

  return(dataframe.head())


def cal_cluster_silhouette(dataset, clusters):
    """Fit k-means with ``clusters`` centres and report the average silhouette.

    The silhouette is computed on a random sample of at most 10000 rows; both
    the k-means fit and the sampling use ``random_state=5``. The result is
    also printed as ``"<clusters>: <score>"``.

    Args:
        dataset: feature matrix to cluster (DataFrame or array-like with ``len``).
        clusters: number of k-means clusters to fit.

    Returns:
        Tuple ``(clusters, silhouette_avg)``.
    """
    # `n_jobs` is not passed: 1 was the default, and scikit-learn removed the
    # argument from KMeans in 1.0, so passing it fails on current releases.
    clusterer = KMeans(init='k-means++', n_clusters=clusters, n_init=10, random_state=5)
    cluster_labels = clusterer.fit_predict(dataset)

    # len(dataset) works for DataFrames and plain arrays alike.
    silhouette_avg = silhouette_score(dataset, cluster_labels,
                                      sample_size=min(10000, len(dataset)),
                                      random_state=5)

    print(str(clusters) + ': ' + str(silhouette_avg))
    return (clusters, silhouette_avg)
  
def auto_clustering(dataset, input_df_scaled, min_clusters, max_clusters, cluster_col):
  """Pick the k with the best silhouette, fit k-means, and label ``dataset``.

  Every k from ``min_clusters`` up to ``max_clusters`` (capped at
  ``len(input_df_scaled) - 1``) is scored with ``cal_cluster_silhouette``.
  The k with the highest average silhouette wins; on a tie the smallest k is
  kept. The winning model's labels are then renumbered by cluster size, so
  the cluster with the most customers gets index 1.

  Args:
    dataset: DataFrame with a ``cust_id`` column; rows align with ``input_df_scaled``.
    input_df_scaled: feature matrix handed to k-means.
    min_clusters: smallest number of clusters to try.
    max_clusters: largest number of clusters to try.
    cluster_col: name of the output column that receives the cluster index.

  Returns:
    ``dataset`` with an extra ``cluster_col`` column holding the size-ranked
    cluster index (as produced by ``Series.rank``, i.e. floats).

  Raises:
    ValueError: if no candidate cluster count is left to evaluate.
  """
  # The silhouette is only defined for at most n_samples - 1 labels, so larger
  # candidates would fail inside silhouette_score.
  largest_k = min(max_clusters, len(input_df_scaled) - 1)
  if largest_k < min_clusters:
    raise ValueError('no valid cluster count between %s and %s for %s rows'
                     % (min_clusters, max_clusters, len(input_df_scaled)))

  # (number of clusters, average silhouette) for every candidate
  avg_silhouette = [cal_cluster_silhouette(input_df_scaled, k)
                    for k in range(min_clusters, largest_k + 1)]

  # recommend num of clusters with highest silhouette
  recommended_clusters = max(avg_silhouette, key=itemgetter(1))[0]

  # `n_jobs` is not passed: 1 was the default and scikit-learn 1.0 removed it.
  final_model = KMeans(init='k-means++', n_clusters=recommended_clusters, n_init=10, random_state=5)
  cluster_labels = final_model.fit_predict(input_df_scaled)

  dataset = dataset.assign(cluster_temp = cluster_labels + 1)

  # reorder the cluster index by number of customers (largest cluster -> 1)
  cluster_size = dataset.groupby('cluster_temp').agg({'cust_id': 'count'}).reset_index()
  cluster_size['cluster_new'] = cluster_size['cust_id'].rank(ascending=False, method='first')

  # swap the temporary label for the size-ranked index
  dataset = dataset.merge(cluster_size[['cluster_temp', 'cluster_new']], on='cluster_temp')
  dataset = dataset.drop(['cluster_temp'], axis=1).rename(columns = {'cluster_new': cluster_col})

  return (dataset)
  
def clustering(dataset, min_clusters, max_clusters, cluster_col):
  """Cluster customers at the level selected by ``cluster_col``.

  * ``'cluster'``: standardise every column from position 4 onward and hand
    the result to ``auto_clustering``.
  * ``'subcluster'`` / ``'subcluster_class'``: first split the customers in
    two with a 2-means fit -- on standardised ``tot_r12_sls_amt`` and
    ``tot_r12_txn_cnt`` for ``'subcluster'``, on raw ``dept_r12_sls_amt`` for
    ``'subcluster_class'``. The smaller group is kept as a single segment;
    the larger group is sub-clustered with ``auto_clustering`` and the
    smaller group then receives the next free index (max index + 1).

  Args:
    dataset: DataFrame with a ``cust_id`` column plus the columns named above.
    min_clusters: smallest number of clusters ``auto_clustering`` may pick.
    max_clusters: largest number of clusters ``auto_clustering`` may pick.
    cluster_col: ``'cluster'``, ``'subcluster'`` or ``'subcluster_class'``;
      also the name of the output column.

  Returns:
    DataFrame with the input columns plus ``cluster_col``.

  Raises:
    ValueError: if ``cluster_col`` is not one of the supported values.
  """
  if cluster_col == 'cluster':
    input_df_scaled = dataset.iloc[:, 4:]
    input_df_scaled = pd.DataFrame(StandardScaler().fit_transform(input_df_scaled))
    return auto_clustering(dataset, input_df_scaled, min_clusters, max_clusters, cluster_col)

  if cluster_col not in ('subcluster', 'subcluster_class'):
    raise ValueError('unsupported cluster_col: %r' % (cluster_col,))

  # `n_jobs` is not passed: 1 was the default and scikit-learn 1.0 removed it.
  splitter = KMeans(init='k-means++', n_clusters=2, n_init=10, random_state=5)

  if cluster_col == 'subcluster':
    clustering_sales = StandardScaler().fit_transform(dataset[['tot_r12_sls_amt', 'tot_r12_txn_cnt']])
  else:
    # Series.reshape no longer exists in pandas; reshape the underlying array.
    clustering_sales = dataset['dept_r12_sls_amt'].values.reshape(-1, 1)

  cluster_labels = splitter.fit_predict(clustering_sales)
  dataset = dataset.assign(cluster_temp = cluster_labels + 1)

  # rank the two groups by number of customers (larger group -> 1)
  cluster_size = dataset.groupby('cluster_temp').agg({'cust_id': 'count'}).reset_index()
  cluster_size['cluster_r1'] = cluster_size['cust_id'].rank(ascending=False, method='first')

  # replace the temporary label with the size rank
  dataset = dataset.merge(cluster_size[['cluster_temp', 'cluster_r1']], on='cluster_temp').drop(['cluster_temp'], axis=1)

  # smaller group: kept whole; larger group: sub-clustered below
  dataset_r1 = dataset[dataset.cluster_r1 == 2]
  dataset_r2 = dataset[dataset.cluster_r1 == 1].drop(['cluster_r1'], axis=1)

  # Note: unlike the 'cluster' branch, these features are passed on unscaled.
  if cluster_col == 'subcluster':
    features = dataset_r2.iloc[:, 4:]
  else:
    # everything between the first column and the last one
    features = dataset_r2.iloc[:, 1:(len(dataset_r2.columns) - 1)]

  dataset_r2_output = auto_clustering(dataset_r2, features, min_clusters, max_clusters, cluster_col)

  # the set-aside group takes the next free index after the sub-clusters
  next_index = dataset_r2_output[cluster_col].max() + 1
  dataset_r1_output = dataset_r1.assign(cluster_r1 = next_index).rename(columns = {'cluster_r1': cluster_col})

  # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
  return pd.concat([dataset_r2_output, dataset_r1_output])


In [3]:
# Read the class-penetration table from BigQuery into a pandas dataframe.
class_subcluster = bq.Table('.'.join([output_dataset, cs_class_pen_table])).to_dataframe()

# Add a class label of the form <dept_nbr>_<item_class_cd>_<item_class_desc>.
class_subcluster = class_subcluster.assign(
  class_nm = lambda frame: (frame['dept_nbr'].astype(str) + '_' +
                            frame['item_class_cd'].astype(str) + '_' +
                            frame['item_class_desc'])
)

In [4]:
# Sub-cluster the customers of every cluster separately, in ascending cluster order.
cluster_list = sorted(class_subcluster['cluster'].drop_duplicates().tolist())

# One long-format result frame per cluster; concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0 and re-copies the data on every call).
subcluster_frames = []

for i in cluster_list:
  print('Cluster ' + str(i))
  subcluster_data = class_subcluster[class_subcluster.cluster == i]

  # department spend per customer, re-attached after the reshape
  dept_spending = subcluster_data[['cust_id', 'dept_r12_sls_amt']].drop_duplicates()

  # class labels ordered by department and class code -> column order of the wide table
  col_index = (subcluster_data[['dept_nbr', 'item_class_cd', 'class_nm']]
               .drop_duplicates()
               .sort_values(['dept_nbr', 'item_class_cd']))

  # reshape to one row per customer and one penetration column per class
  subcluster_data = subcluster_data.pivot(index='cust_id', columns='class_nm', values='class_penetration').reset_index()
  # reindex(columns=...) replaces reindex_axis, which was removed in pandas 1.0
  subcluster_data = subcluster_data.reindex(columns=['cust_id'] + list(col_index.class_nm))
  subcluster_data = subcluster_data.fillna(0)

  # long-format copy for the output; var_name pins the label column to 'class_nm'
  subcluster_data_melt = pd.melt(subcluster_data, id_vars=['cust_id'],
                                 value_vars=list(subcluster_data)[1:], var_name='class_nm')
  subcluster_data = subcluster_data.merge(dept_spending, on='cust_id')

  # subclustering: let auto_clustering choose between 2 and 10 sub-clusters
  subcluster = clustering(subcluster_data, 2, 10, 'subcluster_class')

  subcluster_data_melt = subcluster_data_melt.merge(dept_spending, on='cust_id')
  subcluster_data_melt = subcluster_data_melt.merge(subcluster[['cust_id', 'subcluster_class']], on='cust_id')

  # save result
  subcluster_frames.append(subcluster_data_melt)

cs_subcluster_class = pd.concat(subcluster_frames, ignore_index=True)

Cluster 2.0




2: 0.16834441466695702
3: 0.15756136000880297
4: 0.1552823329182826
5: 0.13652401471527556
6: 0.13135377709330992
7: 0.14080181102753556
8: 0.142025477738313
9: 0.1487692413439231
10: 0.15135548697614007
Cluster 3.0
2: 0.32265749561854384
3: 0.3678316932485892
4: 0.40782098213893647
5: 0.38272074938032336
6: 0.39041387879940526
7: 0.40226386826862953
8: 0.3832556528181117
9: 0.40012343240671333
10: 0.37748334971471087
Cluster 4.0
2: 0.2591059255451502
3: 0.28065569003327445
4: 0.32234648402461913
5: 0.36214403504134424
6: 0.40834590001577176
7: 0.4317952458676078
8: 0.42590783976567587
9: 0.39451849528167765
10: 0.40829252235051516
Cluster 5.0
2: 0.3107521970157391
3: 0.32593884644370885
4: 0.3222457717093858
5: 0.3394526624852517
6: 0.33740224888157516
7: 0.27089979562604255
8: 0.27742053766882774
9: 0.2746981687097859
10: 0.261554075820793
Cluster 6.0
2: 0.6223600354506872
3: 0.7339697172033567
4: 0.7807744429643344
5: 0.7694275947551898
6: 0.7268646639719817
7: 0.7370655704267898
8:

In [5]:
# Upload the sub-cluster assignments to the table named by cs_output_subcluster_class_table.
load_to_BQ_table(cs_output_subcluster_class_table, cs_subcluster_class)

Unnamed: 0,cust_id,class_nm,value,dept_r12_sls_amt,subcluster_class
0,285,025H_3_FASTENERS,0.098707,172905.3,1.0
1,285,025H_4_BUILDER'S HARDWARE,0.047503,172905.3,1.0
2,285,025H_5_SECURITY/SAFETY,0.108354,172905.3,1.0
3,285,025H_10_DOOR LOCKS,0.045492,172905.3,1.0
4,285,025H_12_PNEUMATIC FASTENERS,0.045341,172905.3,1.0


In [None]:
# Upload the same frame to the table named by cs_output_subcluster_class_table_copy.
load_to_BQ_table(cs_output_subcluster_class_table_copy, cs_subcluster_class)