In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

---
### The data science methodology followed for this project has been outlined by John Rollins, IBM

- Business Understanding
- Analytical Approach
- Data requirements
- Data collection
- Data Understanding
- Data Preparation
- Modeling
- Evaluation

## 1. Business Understanding
Customer segmentation is the practice of dividing a company's customers into groups that reflect similarity among customers in each group. The goal of this project is to divide customers into groups based on common characteristics in order to maximize the value of each customer to the business. 

## 2. Analytical Approach 
Clustering customers based on similar characteristics is an unsupervised learning problem, because we do not have a target variable for any observation. 
For this project I will use two Machine Learning models
- I will use KMeans Clustering Algorithm which aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean
- I will also use Hierarchical clustering which is an algorithm that groups similar objects into groups called clusters. The endpoint is a set of clusters, where each cluster is distinct from each other cluster, and the objects within each cluster are broadly similar to each other.

## 3,4. Data Requirements and Data Collection
We require a dataset that gives us information about the customers of a market. For this project, the dataset has been provided to us on Kaggle. This dataset was created solely for learning customer segmentation concepts, which are sometimes loosely described as market basket analysis (strictly speaking, market basket analysis refers to association-rule mining over purchase baskets, which is a different technique).

---

## Libraries Used

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

## Import the dataset

In [None]:
# Read the mall-customer CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
# Last expression of the cell: preview the first few rows.
df.head()

## 5. Data Understanding / Exploratory Data Analysis

In [None]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show the exact column labels (needed for the rename and column look-ups in later cells).
df.columns

In [None]:
# Count the missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

---
Dataset Understanding:
- There are a total of 200 observations, each having 5 variables.
- The columns of the dataset are CustomerID, Gender, Age, Annual Income, and Spending Score.
- There are no missing values (Good day for us XD)
- There is one categorical variable - Gender
---

In [None]:
# Summary statistics for the dataset; with no arguments, describe() covers
# the numeric columns only.
df.describe()

In [None]:
# Summary of the object-dtype (categorical) columns, transposed so that
# each described column becomes one row.
df.describe(include=['O']).transpose()

In [None]:
# Give the two verbose columns short names that are easier to use in later cells.
short_names = {
    "Annual Income (k$)": "Income",
    "Spending Score (1-100)": "Score",
}
df.rename(columns=short_names, inplace=True)

## Visualisation

In [None]:
# Distribution of customer age: histogram with a kernel density estimate overlaid (kde=True).
sns.displot(x='Age', data=df, kde=True)

In [None]:
# Distinct spending-score values present in the data
# ('Score' is the short name assigned by the rename cell).
df['Score'].unique()

In [None]:
# Distribution of spending score: histogram with a kernel density estimate overlaid (kde=True).
sns.displot(x='Score', data=df, kde=True)

In [None]:
# Distribution of annual income: histogram with a kernel density estimate overlaid (kde=True).
sns.displot(x='Income', data=df, kde=True)

In [None]:
# Bar chart of the number of customers in each Gender category.
sns.countplot(x='Gender', data=df)

In [None]:
# Exact number of customers per Gender value (the figures behind the count plot).
df['Gender'].value_counts()

## Correlation of the Data

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap.
# Gender is still a string column at this point, so restrict the frame to numeric
# dtypes first: pandas >= 2.0 raises on DataFrame.corr() when non-numeric columns
# are present (older versions silently dropped them).
numeric_cols = df.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), annot=True)

## 6. Data Cleaning

### Feature Engineering

In [None]:
# Drop CustomerID as it is not useful for clustering.
# Re-assign rather than dropping in place, and tolerate a missing column, so the
# cell can be re-run without a KeyError once CustomerID has already been removed.
df = df.drop(columns='CustomerID', errors='ignore')

In [None]:
# Check the frame after removing CustomerID.
df.head()

In [None]:
# One-hot encode the remaining non-numeric columns (Gender is the only categorical
# variable in this dataset) and rebuild a fresh 0..n-1 index.
df = pd.get_dummies(df).reset_index(drop=True)

In [None]:
# Inspect the encoded frame that the clustering models will be fitted on.
df.head()

## 7. Modeling

### K Means Clustering
In K-means, objects are assigned to a cluster based on the Euclidean distance between the object and the center of the cluster, also referred to as the cluster centroid.

But we do not know in advance how many clusters there are, and we do not know what the clusters will look like. That is why we work in two steps.
- First, we determine the optimal number of clusters, and then
- We determine starting values for each cluster.

In [None]:
# Feature matrix for clustering: every column of the prepared frame, in column order
# (column i of X corresponds to df.columns[i]).
# Request a float array explicitly: with pandas >= 2.0 get_dummies yields bool columns,
# and mixing those with the integer columns would otherwise produce an object array.
X = df.to_numpy(dtype=float)

In [None]:
# Using the elbow method to find the optimal number of clusters:
# fit K-Means for k = 1..10 and record each model's within-cluster sum of squares.
cluster_range = range(1, 11)
wcss = []
for i in cluster_range:
    # n_init is pinned to 10 (the historical default). Left unset, scikit-learn
    # 1.2-1.3 emit a FutureWarning and 1.4+ switch to 'auto', which changes the
    # number of restarts and therefore the resulting curve.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(cluster_range, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fitting K-Means to the dataset with k = 5 (presumably the elbow read off the
# WCSS curve -- confirm against the plot).
# n_init is pinned to 10 (the historical default) so the result does not depend on
# the scikit-learn version: 1.2-1.3 warn when it is unset and 1.4+ default to 'auto'.
kmeans = KMeans(n_clusters = 5, init = 'k-means++', n_init = 10, random_state = 42)
# Cluster label (0-4) for every row of X.
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Visualising the K-Means clusters in the Annual Income / Spending Score plane.
# X holds every column of df in order, so positions 0 and 1 are Age and Income --
# not the Income and Score named on the axes. Look the plotted columns up by name
# so that the points, the centroids and the axis labels all agree.
income_col = df.columns.get_loc('Income')
score_col = df.columns.get_loc('Score')
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = y_kmeans == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X[members, income_col], X[members, score_col], s = 100, c = colour, label = f'Cluster {cluster_id + 1}')
# Centroids live in the same feature space as X, so the same column positions apply.
plt.scatter(kmeans.cluster_centers_[:, income_col], kmeans.cluster_centers_[:, score_col], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

---

### Hierarchical clustering
Hierarchical clustering, also known as hierarchical cluster analysis, is an algorithm that groups similar objects into groups called clusters. The endpoint is a set of clusters, where each cluster is distinct from each other cluster, and the objects within each cluster are broadly similar to each other.

In [None]:
# Agglomerative (bottom-up) hierarchical clustering into 5 groups with Ward linkage.
# The distance argument is left at its default (Euclidean, the only metric Ward
# supports): the `affinity` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4 in favour of `metric`, so omitting it works on both old and new versions.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
# Cluster label (0-4) for every row of X.
y_hc = hc.fit_predict(X)

In [None]:
# Visualising the hierarchical clusters in the Annual Income / Spending Score plane.
# X holds every column of df in order, so positions 0 and 1 are Age and Income --
# not the Income and Score named on the axes. Look the plotted columns up by name
# so that the points and the axis labels agree.
income_col = df.columns.get_loc('Income')
score_col = df.columns.get_loc('Score')
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    members = y_hc == cluster_id  # boolean mask of the rows in this cluster
    plt.scatter(X[members, income_col], X[members, score_col], s = 100, c = colour, label = f'Cluster {cluster_id + 1}')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

## That brings us to the end of this project. Please do consider giving an upvote if you find this notebook to be useful. 
## I regularly share content related to Data Science on Twitter; you can connect with me here: [@PiyalBanik](https://twitter.com/PiyalBanik)