# CUSTOMER SEGMENTATION

### Loading data

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Read the mall-customers CSV into the working frame `df`.
data_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded frame to see which columns it has.
df.head()

In [None]:
# Remove the CustomerID column from the working frame.
df = df.drop(columns = 'CustomerID')

In [None]:
# Preview the frame again to confirm CustomerID is no longer present.
df.head()

### Data Preprocessing

In [None]:
# List the dtype pandas inferred for each column.
df.dtypes

##### Conclusion:

All column datatypes match expectations.

In [None]:
# Build a boolean mask with the same shape as df: True wherever a value is missing.
missing_df = df.isna()
missing_df.head()

In [None]:
# For each column print its name, the True/False tally of the null mask,
# and a blank separator line.
for column in missing_df.columns:
    counts = missing_df[column].value_counts()
    print(column, counts, '', sep = '\n')

##### Conclusion:

No missing values in the dataset

In [None]:
# Pairwise scatter plots with a fitted regression line in each panel.
sns.pairplot(data = df, kind = 'reg')

##### Conclusion:

There is some correlation between Age and Spending score.

In [None]:
# Bin edges for age: observed minimum, 35, 55, observed maximum.
# Series.min()/max() are used instead of the Python builtins: they are
# vectorised and skip NaN rather than giving an order-dependent result.
age = df['Age']
bins = np.array([age.min(), 35, 55, age.max()])
bins

In [None]:
# Labels for the three age bins, in ascending order of age.
group_names = ['Young', 'Adult', 'Elder']

In [None]:
# Assign each customer to an age group and store it as a new column.
# include_lowest=True keeps the youngest customer (the first bin edge) in 'Young'.
age_groups = pd.cut(df['Age'], bins = bins, labels = group_names, include_lowest = True)
df['Age-binned'] = age_groups
df.head()

In [None]:
# Bar chart of how many customers fall into each age group.
sns.set(style = "whitegrid")
sns.countplot(data = df, x = 'Age-binned')

##### Conclusion

The mall is frequented mostly by young people followed by adults and hardly by elders.

In [None]:
# Four equally spaced edges between the lowest and highest income give three
# equal-width bins.
# Series.min()/max() are used instead of the Python builtins: they are
# vectorised and skip NaN rather than giving an order-dependent result.
income = df['Annual Income (k$)']
bins = np.linspace(income.min(), income.max(), 4)
bins

In [None]:
# Labels for the three income bins, in ascending order of income.
group_names = ['Low', 'Medium', 'High']

In [None]:
# Assign each customer to an income group and store it as a new column.
# include_lowest=True keeps the lowest earner (the first bin edge) in 'Low'.
income_groups = pd.cut(df['Annual Income (k$)'], bins = bins, labels = group_names, include_lowest = True)
df['Income-binned'] = income_groups
df.head()

In [None]:
# Bar chart of how many customers fall into each income group.
sns.countplot(data = df, x = 'Income-binned')

##### Conclusion:

The mall is frequented mostly by low- and medium-income customers and rarely by high-income customers.

### Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
df.describe()

In [None]:
# Pairwise correlations between the numeric columns.
# At this point df still holds Gender and the two binned columns, none of which
# are numeric. pandas >= 2.0 no longer drops such columns silently in
# DataFrame.corr() and raises instead, so select the numeric columns explicitly.
df.select_dtypes(include = 'number').corr()

##### Conclusion:

There seems to be some correlation between age and spending score.

In [None]:
# Bar chart of the number of customers per gender.
sns.countplot(data = df, x = 'Gender')

##### Conclusion:
More females frequent the mall than males

In [None]:
# Spending score distribution per income group, split by gender.
ax = sns.boxplot(data = df, x = 'Income-binned', y = 'Spending Score (1-100)', hue = 'Gender')
# Move the legend to the upper-left corner of the axes.
ax.legend(loc = 'upper left')

##### Conclusion:

High income males tend to spend more in the mall.

In [None]:
# Spending score distribution per age group, split by gender.
sns.boxplot(data = df, x = 'Age-binned', y = 'Spending Score (1-100)', hue = 'Gender')

##### Conclusion:

Young people tend to spend more in the mall. Especially females.

In [None]:
# Annual income distribution per age group, split by gender.
sns.boxplot(data = df, x = 'Age-binned', y = 'Annual Income (k$)', hue = 'Gender')

##### Conclusion:

There is no clear relationship between income and age bracket. On average, males tend to have a higher income than females.

### Kmeans Clustering

Now we segment customers based on their spending habits using KMeans.

In [None]:
# The binned columns were only needed for the plots above; remove them
# before clustering.
df = df.drop(columns = ['Age-binned', 'Income-binned'])

In [None]:
# One-hot encode Gender and append the indicator columns to the frame.
# dtype=int is requested explicitly: pandas >= 2.0 returns bool indicators by
# default, and the frame should hold plain 0/1 numbers on every pandas version.
gender_dummies = pd.get_dummies(df['Gender'], dtype = int)
df = pd.concat([df, gender_dummies], axis = 1)

In [None]:
# Preview the frame with the gender indicator columns appended.
df.head()

In [None]:
# The original Gender column is redundant now that its indicator columns
# exist; remove it.
df = df.drop(columns = ['Gender'])

In [None]:
# Preview the frame after the Gender column has been dropped.
df.head()

In [None]:
# Rescale every row (customer) to unit L2 norm.
# NOTE(review): this is per-sample normalisation (axis=1), not per-column
# feature scaling, and the final KMeans model further down is fit on the
# unscaled `df` rather than on `normalized_df` - confirm both are intended.
from sklearn import preprocessing
normalized_df = preprocessing.normalize(df, norm = 'l2', axis = 1)

In [None]:
# Fit KMeans for k = 1..14 on the normalised data and record each model's
# inertia (within-cluster sum of squared distances) for the elbow plot.
from sklearn.cluster import KMeans
Kmeans_inertia = []
K = range(1,15)
for k in K:
    # Fixed seed so the inertia curve is identical on every run; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(normalized_df)
    Kmeans_inertia.append(km.inertia_)

In [None]:
# Plot inertia against the number of clusters to locate the knee.
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data
# vectors in lineplot.
ax = sns.lineplot(x = list(K), y = Kmeans_inertia)
ax.set(xlabel='clusters', ylabel='Sum of squared distances')

##### Conclusion:

Optimal number of clusters is five. Therefore there are broadly five types of customers.

In [None]:
# Fit the five-cluster model and plot the segments on income vs. spending score.
# NOTE(review): the elbow analysis above used `normalized_df`, but this model is
# fit on the unscaled `df` - confirm which input is intended.
# Fixed seed so cluster assignments are reproducible; n_init is set explicitly
# because its default differs between scikit-learn versions.
km = KMeans(n_clusters = 5, n_init = 10, random_state = 42)
cluster = km.fit_predict(df)
# Build the legend text from the labels themselves instead of relabelling the
# legend afterwards: ax.legend([...]) matches labels to artists by implicit
# order and can mislabel or drop entries. Cluster numbering is arbitrary.
cluster_names = ['cluster %d' % (c + 1) for c in cluster]
cluster_order = ['cluster %d' % i for i in range(1, 6)]
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
ax = sns.scatterplot(x = df['Annual Income (k$)'], y = df['Spending Score (1-100)'],
                     hue = cluster_names, hue_order = cluster_order)
ax.set_title('Customer segments (k = 5)')

# Conclusions:

1) High income males are to be targeted.

2) Low-income young males and females should be targeted with advertising.

3) There are five types of customers: high-income high spenders, high-income low spenders, average-income average spenders, low-income high spenders, and low-income low spenders.