**The following code performs customer segmentation using food delivery data. In the code you will find the following:**

* EDA and Feature Creation
* PCA for dimensionality reduction
* Elbow method to select the optimal number of clusters

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from scipy.spatial import ConvexHull
import datetime
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Load the customer order data from the Kaggle input directory and preview the first rows.
df = pd.read_csv(r'/kaggle/input/customer-order-data/SampleAssessment.csv')
df.head()

In [None]:
# Show the raw column names (they are replaced positionally in the rename cell further below).
df.columns

**EDA**

In [None]:
# Number of distinct customers in the data
print('Number of Customers:', df['customer_id'].nunique())

In [None]:
df.describe()
# Observations recorded from the summary statistics:
# - Amount in last 7 days and Amount in last 4 weeks have missing values / a minimum of 0
#   (NOTE(review): a minimum of 0 alone does not show missing data; compare the `count` row - confirm)
# - Avg_DistanceFromResturant is negative in a few cases

In [None]:
df.dtypes
# The date columns still need converting to a datetime dtype (done in the next cell).

In [None]:
# Parse the first/most-recent order timestamps and truncate them to midnight so
# that later date differences come out as whole days.
# `dt.normalize()` drops the time-of-day directly; it replaces the previous
# datetime -> '%m/%d/%Y' string -> datetime round trip, which relied on the
# parser re-inferring the month/day order of the formatted strings.
df['First_Order_Time'] = pd.to_datetime(df['First Time']).dt.normalize()
df['Recent_Order_Time'] = pd.to_datetime(df['Recent Time']).dt.normalize()

In [None]:
# Preview the frame now that the parsed First_Order_Time / Recent_Order_Time columns exist.
df.head()

**Feature Creation**

In [None]:
# Drop the raw timestamp columns; the parsed *_Order_Time columns replace them.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this statement
# safe to re-run after the columns have already been removed.
df = df.drop(columns=['First Time', 'Recent Time'], errors='ignore')

# Assume the current date is the day after the latest transaction.
# Series.max() is vectorised and skips NaT, whereas the builtin max() compares
# element by element and can return NaT depending on row order.
df['current_date'] = df['Recent_Order_Time'].max() + datetime.timedelta(days=1)

In [None]:
# Recency and tenure, stored as timedeltas measured back from the assumed current date.
for new_col, order_col in (
    ('num_days_since_last_order', 'Recent_Order_Time'),
    ('num_days_since_first_order', 'First_Order_Time'),
):
    df[new_col] = df['current_date'] - df[order_col]

In [None]:

# Rename columns positionally: the names below are applied in order, so the
# list must line up with the frame's current column order.
df.columns = pd.Index([
    'customer_id',
    'num_of_Orders',
    'num_of_Orders_in_last_7_days',
    'num_of_Orders_in_last_4_weeks',
    'Amount',
    'Amount_in_last_7_days',
    'Amount_in_last_4_weeks',
    'Avg_DistanceFromResturant',
    'Avg_DeliveryTime',
    'First_Order_Time',
    'Recent_Order_Time',
    'current_date',
    'num_days_since_last_order',
    'num_days_since_first_order',
])

In [None]:
df.head()

In [None]:
# Rows where the order count is null for the last 7 days / last 4 weeks
null_orders_in_last_7_days = df[df['num_of_Orders_in_last_7_days'].isna()]
null_orders_in_last_4_weeks = df[df['num_of_Orders_in_last_4_weeks'].isna()]

# Smallest recency among those rows: if it lies outside the window, the nulls
# mean "no orders in the window" rather than missing data.
print(null_orders_in_last_7_days['num_days_since_last_order'].min())
print(null_orders_in_last_4_weeks['num_days_since_last_order'].min())

# Treat those nulls as zero orders.
# Assign the result back instead of calling fillna(inplace=True) on an
# attribute-accessed column: that form is chained assignment, which pandas
# deprecates and which does not update `df` under copy-on-write.
recent_count_cols = ['num_of_Orders_in_last_7_days', 'num_of_Orders_in_last_4_weeks']
df[recent_count_cols] = df[recent_count_cols].fillna(0)

In [None]:
# Replace negative average distances with 0; all other values are left untouched.
df.loc[df['Avg_DistanceFromResturant'] < 0, 'Avg_DistanceFromResturant'] = 0

In [None]:
# Re-check the summary statistics after filling null order counts and zeroing negative distances.
df.describe()

In [None]:
def average_order_value(amount, num_orders):
    """Return the rounded average amount per order, row by row.

    Parameters
    ----------
    amount : pandas.Series
        Total amount spent in the period.
    num_orders : pandas.Series
        Number of orders in the same period.

    Returns
    -------
    pandas.Series
        ``amount / num_orders`` rounded to 0 decimals. Rows with zero orders
        give 0 instead of a division-by-zero result; rows where either input
        is NaN stay NaN.
    """
    has_orders = num_orders != 0
    # Mask the zero counts before dividing so no inf/NaN is produced for them,
    # then put an explicit 0 back in those rows.
    per_order = amount / num_orders.where(has_orders)
    return per_order.round(0).where(has_orders, 0)


# avg value of one order for a customer (all time)
df['AOV'] = average_order_value(df['Amount'], df['num_of_Orders'])

# avg value of one order for a customer in last 7 days
df['AOV_last_7_days'] = average_order_value(df['Amount_in_last_7_days'],
                                            df['num_of_Orders_in_last_7_days'])

# avg value of one order for a customer in last 4 weeks
df['AOV_last_4_weeks'] = average_order_value(df['Amount_in_last_4_weeks'],
                                             df['num_of_Orders_in_last_4_weeks'])

In [None]:
# Share of customers with at least one order in each recent window.
# The mean of a boolean mask is the fraction of True rows; it is formatted
# with `%` so the printed value matches the "% of users" label (previously a
# raw fraction such as 0.12 was printed under a percentage label).
share_last_7_days = (df['num_of_Orders_in_last_7_days'] != 0).mean()
print('% of users transacted in last 7 days:', f'{share_last_7_days:.2%}')

share_last_4_weeks = (df['num_of_Orders_in_last_4_weeks'] != 0).mean()
print('% of users transacted in last 4 weeks:', f'{share_last_4_weeks:.2%}')

In [None]:
# Keep only the columns used for clustering (customer_id first, then the features).
# .copy() makes df_mod an independent frame, so the column assignments below
# modify df_mod itself rather than a slice of `df` (avoids SettingWithCopyWarning
# and ambiguous write-through behaviour).
df_mod = df[['customer_id', 'num_of_Orders', 'AOV', 'AOV_last_7_days', 'AOV_last_4_weeks',
             'Avg_DistanceFromResturant', 'Avg_DeliveryTime',
             'num_days_since_last_order', 'num_days_since_first_order']].copy()

# Convert the timedelta columns to whole days as integers
df_mod['num_days_since_last_order'] = df_mod['num_days_since_last_order'].dt.days.astype(int)
df_mod['num_days_since_first_order'] = df_mod['num_days_since_first_order'].dt.days.astype(int)

**PCA and K-means Clustering**

In [None]:
# L2-normalise every row of the feature matrix (all columns except the first,
# which holds customer_id) so that each sample has unit Euclidean norm.
X_normalized = preprocessing.normalize(df_mod.iloc[:, 1:], norm='l2')

In [None]:
# Fit PCA once with every component retained; the running sums then give the
# variance captured by the first k components for each k. This replaces
# refitting PCA for every k (starting at the meaningless k=0) and summing
# already-rounded values, which accumulated rounding error.
pca_all_components = PCA().fit(X_normalized)
cumulative_variance = np.cumsum(pca_all_components.explained_variance_)
cumulative_ratio = np.cumsum(pca_all_components.explained_variance_ratio_)

for n_kept, (variance, ratio) in enumerate(zip(cumulative_variance, cumulative_ratio), start=1):
    print (n_kept,"explained variance : ",round(variance, 2), "|","explained variance ratio : ",round(ratio, 2))
    print ('')

In [None]:
# Project the normalised features onto the first 4 principal components.
# random_state is fixed so the projection is reproducible even when
# scikit-learn's 'auto' solver selects a randomized SVD for this data shape
# (the seed is ignored by the deterministic solvers).
pca = PCA(n_components=4, random_state=100)
pca_result = pca.fit_transform(X_normalized)

In [None]:
# Evaluate K-means for several candidate cluster counts, recording the inertia
# (used by the elbow plot) and the average silhouette score for each k.
distortions = []
silhouette_avg_list = []
labels_temp = [2,3,4,5]  # candidate numbers of clusters
for k in labels_temp:
    kmeanModel = KMeans(n_clusters=k,random_state=100)
    # fit_predict both fits the model and returns the labels; the separate
    # fit() call that used to precede it only repeated the same work.
    cluster_labels = kmeanModel.fit_predict(pca_result)
    silhouette_avg = silhouette_score(pca_result, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)
    distortions.append(kmeanModel.inertia_)
    silhouette_avg_list.append(silhouette_avg)

In [None]:
# Elbow curve: K-means inertia against the number of clusters tried
fig, ax = plt.subplots()
ax.plot(labels_temp, distortions)
ax.set_xlabel('number of clusters(k)')
ax.set_ylabel('Distortions')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Make clusters: fit the final K-means model on the PCA projection
cluster_size = 4
k = int(cluster_size)

kmeans = KMeans(n_clusters=k, random_state=100)
kmeans.fit(pca_result)

# Cluster label of every sample, as a plain list
cluster_labels = list(kmeans.predict(pca_result))

In [None]:
# Build a frame holding the PCA coordinates (columns 0..n-1) plus each sample's
# cluster label. Reuse `pca_result`, the exact projection K-means was fitted on,
# instead of refitting PCA (on `X_normalized.data`, a raw memory buffer):
# a refit is wasted work and is not guaranteed to reproduce the same
# coordinates that the cluster centres live in.
projected = pd.DataFrame(pca_result)
projected['Labels'] = cluster_labels

# Number of samples per cluster
print ("cluster distribution")
print (projected['Labels'].value_counts().sort_index())

In [None]:
fig, ax = plt.subplots(1, figsize=(20,10))
colors = ['#DF2020', '#81DF20', '#2095DF','#ffff99']


def cluster_color(label):
    """Return the palette colour for a cluster label, cycling when there are more clusters than colours."""
    return colors[int(label) % len(colors)]


# plot data: points use the same palette as the centroids and hulls, so a
# cluster has one colour throughout the figure
ax.scatter(projected[0].values, projected[1].values,
           c=[cluster_color(label) for label in projected['Labels']],
           alpha=0.6, s=10)

# plot centers (first two principal components of each cluster centre)
centroids = kmeans.cluster_centers_
cen_x = centroids[:, 0]
cen_y = centroids[:, 1]
ax.scatter(cen_x, cen_y, marker='^',
           c=[cluster_color(idx) for idx in range(len(centroids))],
           edgecolors='black', s=70)

# draw enclosure: shaded convex hull around every cluster
for label in sorted(projected['Labels'].unique()):
    points = projected.loc[projected['Labels'] == label, [0, 1]].values
    if len(points) < 3:
        # a 2-D convex hull needs at least three points
        continue
    hull = ConvexHull(points)
    # repeat the first hull vertex at the end to close the polygon
    outline = np.append(hull.vertices, hull.vertices[0])
    ax.fill(points[outline, 0], points[outline, 1], alpha=0.3, color=cluster_color(label))

ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('Customer clusters in PCA space')
plt.show()