# Customer Segmentation and Analysis

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the mall-customer CSV into a DataFrame and preview its first rows.
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()

### Checking for null values

In [None]:
# Count missing values per column (isna() flags each cell, sum() totals the flags column-wise).
data.isna().sum()

### Checking data info

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

## EDA 

In [None]:
# Number of customers per distinct value of the 'Gender' column.
data['Gender'].value_counts()

In [None]:
# Bar chart of customer counts per gender.
# The column is passed by keyword: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, not `x`, so a positional Series no longer
# produces one bar per category.
sns.countplot(data=data, x='Gender', palette='Blues_r')

#### From the counts above we can see the gender split of all 200 customers: 44% are male and 56% are female, so female customers are the majority.

In [None]:
# One count plot per numeric feature, stacked vertically in a single figure.
# Columns are passed by keyword (`x=`) because seaborn >= 0.12 interprets the
# first positional argument as `data`, which breaks the original positional calls.
plt.figure(figsize=(20,15))
count_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for row, column in enumerate(count_columns, start=1):
    plt.subplot(len(count_columns), 1, row)
    sns.countplot(data=data, x=column, palette='icefire')

#### Inspecting the plots above, there appear to be more customers aged 40-70 with a higher annual income, and the same group shows a higher spending score; the distribution plots below help verify this analysis.

In [None]:
# Distribution of each numeric feature: density-normalised histogram with a KDE
# overlay and a rug of the individual observations, one panel per feature.
# `sns.distplot` is deprecated and scheduled for removal, so the same picture is
# built from its documented replacements, `histplot` and `rugplot`.
plt.figure(figsize=(20,12))
panels = [
    ('Age', None),  # None -> seaborn's default colour, as in the original call
    ('Annual Income (k$)', 'Pink'),
    ('Spending Score (1-100)', 'Purple'),
]
for position, (column, colour) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    # stat='density' matches distplot, which normalised the histogram whenever a KDE was drawn.
    sns.histplot(data=data, x=column, kde=True, stat='density', color=colour)
    sns.rugplot(data=data, x=column, color=colour)
# A single layout pass once all axes exist replaces the two repeated calls.
plt.tight_layout()

#### Checking for correlation between features

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heat map.
# Numeric dtypes are selected first so the string-valued 'Gender' column never
# reaches DataFrame.corr(), which raises on non-numeric data in pandas >= 2.0
# (older releases excluded such columns automatically).
numeric_corr = data.select_dtypes(include='number').corr()
# heatmap has no `color` parameter; the colour scheme is chosen with `cmap`.
sns.heatmap(numeric_corr, annot=True, cmap='Blues')

In [None]:
# Violin plot of spending score split by gender.
# x / y are passed by keyword: in seaborn >= 0.12 the only positional parameter
# of catplot is `data`, so positional column names clash with `data=data`.
sns.catplot(data=data, x='Gender', y='Spending Score (1-100)', kind='violin', palette='Blues')


In [None]:
# Box plot of annual income split by gender.
# x / y are passed by keyword: in seaborn >= 0.12 the only positional parameter
# of catplot is `data`, so positional column names clash with `data=data`.
sns.catplot(data=data, x='Gender', y='Annual Income (k$)', kind='box', palette='cividis')

In [None]:
# Scatter of annual income against spending score, coloured by gender.
# fit_reg=False turns off the regression line so only the points are drawn.
sns.lmplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Gender',
    fit_reg=False,
)

#### Annual income and spending score show visible grouping in the chart, which indicates a high possibility of grouping and segmenting customers based on their annual income and spending pattern.

In [None]:
# Scatter of age against spending score, coloured by gender (no regression line).
sns.lmplot(
    data=data,
    x='Age',
    y='Spending Score (1-100)',
    hue='Gender',
    fit_reg=False,
)

#### Customers in the age group up to 40 have higher spending scores, which suggests they are frequent shoppers who follow trends and shop on a low-to-average annual income, as can be seen from the annual income distribution and the age vs. annual income relationship below.

In [None]:
# Scatter of age against annual income, coloured by gender (no regression line).
sns.lmplot(
    data=data,
    x='Age',
    y='Annual Income (k$)',
    hue='Gender',
    fit_reg=False,
)

### Creating dummy variables for the categorical feature

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of each so the remaining indicator columns are not redundant.
df = pd.get_dummies(data=data, drop_first=True)
df.head()

In [None]:
# Remove the identifier column in place so it is not treated as a feature.
df.drop(columns=['CustomerID'], inplace=True)
df.head()

### Performing scaling of features

In [None]:
# Standardise every column of df with StandardScaler (zero mean, unit variance).
scaler = StandardScaler()
# fit_transform returns a bare array, so the result is wrapped back into a
# DataFrame. Labels and index are taken from df itself rather than from a
# hand-typed list, which would silently mislabel the data if df's column
# order ever changed.
dfscaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
# Remove the gender indicator column from the scaled table.
dfscaled.drop('Gender_Male', inplace=True, axis=1)
dfscaled.head()

### Clustering on age and spending score

### K-Means clustering to determine the various customer segments based on age and spending score

In [None]:
# Keep only the two scaled features used for the age-based clustering.
age_spending_cols = ['Age', 'Spending Score (1-100)']
kdata = dfscaled[age_spending_cols]
kdata.head()

#### Plotting the sum of squared distances to get the elbow curve, so that we can pick the optimal cluster count beyond which adding clusters doesn't make any significant improvement

In [None]:
# Elbow curve: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances), then plot inertia against k.
kssd = []
cluster_counts = range(1, 11)
for i in cluster_counts:
    km = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    # fit() returns the estimator itself, so inertia_ can be read off directly.
    kssd.append(km.fit(kdata).inertia_)
plt.plot(cluster_counts, kssd)

#### Plotting the clusters and assigning labels based on spending score and age to find customer segments

In [None]:
# Fit K-Means with 4 clusters on the scaled (Age, Spending Score) pairs, then
# draw one scatter series per cluster plus the fitted centroids.
km = KMeans(n_clusters = 4, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(kdata)
kdata = np.array(kdata)  # boolean-mask / positional indexing below needs an ndarray

# (colour, legend label) for cluster ids 0..3, in that order.
# NOTE(review): the segment names are tied to raw cluster indices, which are
# only stable for this data / random_state / scikit-learn version -- re-check
# the mapping against the plot if any of those change.
segments = [
    ('pink', 'Normal shopper'),
    ('yellow', 'Young Frequent'),
    ('cyan', 'Old frequent'),
    ('magenta', 'Privilege'),
]

# The style sheet is applied *before* the figure is created so that it affects
# the whole figure, and via a context manager so that it does not leak into
# figures drawn by other cells.
with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(20,12))
    for cluster_id, (colour, segment) in enumerate(segments):
        members = kdata[y_means == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = segment)

    plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:, 1], s = 50, c = 'blue' , label = 'centroid')

    plt.title('K Means Clustering - based on age group', fontsize = 20)
    # Both axes show the standardised values held in kdata.
    plt.xlabel('Age')
    plt.ylabel('Spending Score')
    plt.legend()
    plt.grid()
    plt.show()


### Determining the various customer segments based on annual income and spending score, i.e. spending habits

In [None]:
# Keep only the two scaled features used for the spending-habit clustering.
kdata = dfscaled.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]
kdata.head()

#### Plotting the sum of squared distances to get the elbow curve, so that we can pick the optimal cluster count beyond which adding clusters doesn't make any significant improvement

In [None]:
# Elbow curve for the income / spending features: inertia of K-Means for k = 1..10.
kssd = []
candidate_ks = range(1, 11)
kmeans_options = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)
for i in candidate_ks:
    km = KMeans(n_clusters=i, **kmeans_options)
    km.fit(kdata)
    kssd.append(km.inertia_)
plt.plot(candidate_ks, kssd)

#### According to the elbow chart, 5 is the optimal number of clusters to form


#### Plotting the clusters and assigning labels based on annual income and spending score

In [None]:
# Fit K-Means with 5 clusters on the scaled (Annual Income, Spending Score)
# pairs, then draw one scatter series per cluster plus the fitted centroids.
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
y_means = km.fit_predict(kdata)
kdata = np.array(kdata)  # boolean-mask / positional indexing below needs an ndarray

# (colour, legend label) for cluster ids 0..4, in that order.
# NOTE(review): the segment names are tied to raw cluster indices, which are
# only stable for this data / random_state / scikit-learn version -- re-check
# the mapping against the plot if any of those change.
segments = [
    ('pink', 'penny-pincher'),
    ('yellow', 'spendthrift'),
    ('cyan', 'general customer'),
    ('magenta', 'target customer'),
    ('orange', 'careful spender'),
]

# The style sheet is applied *before* the figure is created so that it affects
# the whole figure, and via a context manager so that it does not leak into
# figures drawn by other cells.
with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(20,12))
    for cluster_id, (colour, segment) in enumerate(segments):
        members = kdata[y_means == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = segment)
    plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:, 1], s = 50, c = 'blue' , label = 'centroid')

    # Title typo ("-mbased") and legend label ("centeroid") corrected.
    plt.title('K Means Clustering - based on spending habits', fontsize = 20)
    # Both axes show the standardised values held in kdata.
    plt.xlabel('Annual income')
    plt.ylabel('Spending Score')
    plt.legend()
    plt.grid()
    plt.show()