![Image](https://images.unsplash.com/photo-1519567241046-7f570eee3ce6?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1000&q=80)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
sns.set(rc={'figure.figsize':(11.7,8.27)})
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found (directory joined with file name). The middle element yielded by
# os.walk (the sub-directory list) is not needed, hence the `_` placeholder.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the mall-customer records from the Kaggle input directory and preview
# the first rows as the cell output.
data = pd.read_csv(
    '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
)
data.head()

In [None]:
# Distribution of customer ages.
# NOTE(review): distplot is deprecated in newer seaborn releases; histplot /
# displot are the documented successors if the environment is upgraded.
age_values = data['Age']
sns.distplot(age_values)

In [None]:
# Annual income plotted against age as a line chart.
sns.lineplot(
    data=data,
    x='Age',
    y='Annual Income (k$)',
)

In [None]:
# Scatter plot of annual income against age (regression fit disabled).
# x and y are passed by keyword: recent seaborn releases make every argument
# after `data` keyword-only, so the positional form raises a TypeError there,
# while the keyword form works on both old and new versions.
sns.lmplot(
    x='Age',
    y='Annual Income (k$)',
    data=data,
    fit_reg=False,
)

In [None]:
# Mean annual income and mean spending score for each gender, flattened so
# that 'Gender' is an ordinary column rather than the index.
mean_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
gender_spends_df = data.groupby('Gender')[mean_columns].mean().reset_index()
gender_spends_df

In [None]:
# Overlay the spending-score distributions of male (blue) and female (red)
# customers on the same axes.
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_scores = data.loc[data['Gender'] == gender, 'Spending Score (1-100)']
    sns.distplot(gender_scores, color=colour, label=gender)
plt.legend()

### Observation: females tend to have higher spending scores than males

In [None]:
# Overlay the annual-income distributions of male (blue) and female (red)
# customers on the same axes.
gender_colours = {'Male': 'b', 'Female': 'r'}
for gender, colour in gender_colours.items():
    gender_income = data.loc[data['Gender'] == gender, 'Annual Income (k$)']
    sns.distplot(gender_income, color=colour, label=gender)
plt.legend()

### Observation: males tend to earn more than females

In [None]:
# Bar chart of the mean annual income per gender, taken from the
# pre-aggregated gender_spends_df frame.
sns.barplot(
    data=gender_spends_df,
    x='Gender',
    y='Annual Income (k$)',
)

In [None]:
# Bar chart of the mean spending score per gender, taken from the
# pre-aggregated gender_spends_df frame.
sns.barplot(
    data=gender_spends_df,
    x='Gender',
    y='Spending Score (1-100)',
)

### Standardising the data (zero mean, unit variance per feature)

In [None]:
# Standardise the three numeric features used for clustering.
# NOTE(review): despite its name, scaled_df is presumably a NumPy array
# (StandardScaler's default output), not a DataFrame - confirm before
# indexing it by column name.
clustering_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaler = StandardScaler()
scaled_df = scaler.fit_transform(data[clustering_features])

### Choosing the number of clusters with the elbow method

In [None]:
# Elbow method: fit K-Means for k = 1..9 on the scaled features and record
# each model's inertia so the "elbow" in the curve can guide the choice of k.
cluster_range = range(1, 10)
cluster_errors = []
for num_clusters in cluster_range:
    # A fixed seed makes the curve reproducible from run to run and matches
    # the seed (42) used for the final model fitted in this notebook.
    clusters = KMeans(n_clusters=num_clusters, random_state=42)
    clusters.fit(scaled_df)
    cluster_errors.append(clusters.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(cluster_range, cluster_errors, marker="o")
# Label the figure so it can be read on its own.
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow curve')
plt.show()

In [None]:
# Fit the final K-Means model with k clusters (seeded for reproducibility)
# and store each customer's cluster label on the frame.
k = 4
clusters = KMeans(n_clusters=k, random_state=42).fit(scaled_df)
data['cluster_id'] = clusters.labels_
# Show five random rows to spot-check the assignments.
data.sample(5)

In [None]:
# Per-cluster mean of age, annual income and spending score.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of names was deprecated and raises on pandas >= 2.0. The aggregation is
# likewise given as a list (instead of a set literal), which keeps the same
# two-level column layout in the output.
summary_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
data.groupby(['cluster_id'])[summary_columns].agg(['mean']).reset_index()

# 1st Cluster

In [None]:
# Customers assigned to cluster 0, plus the per-gender mean spending score
# and mean annual income within that cluster.
in_cluster_0 = data['cluster_id'] == 0
cluster_0 = data[in_cluster_0]
cluster_0_df = (
    cluster_0
    .groupby('Gender')
    .agg({'Spending Score (1-100)': 'mean', 'Annual Income (k$)': 'mean'})
    .reset_index()
)

In [None]:
# Cluster 0: spending-score distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_scores = cluster_0.loc[cluster_0['Gender'] == gender, 'Spending Score (1-100)']
    sns.distplot(gender_scores, color=colour, label=gender)
plt.legend()

In [None]:
# Cluster 0: annual-income distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_income = cluster_0.loc[cluster_0['Gender'] == gender, 'Annual Income (k$)']
    sns.distplot(gender_income, color=colour, label=gender)
plt.legend()

### Cluster 0 - Observations
1. Females and males earn almost the same.
2. Women have higher spending scores.
3. The average customer age is 39 and the average annual income is $86K; spending is very low compared to income.

# 2nd Cluster

In [None]:
# Customers assigned to cluster 1, plus the per-gender mean spending score
# and mean annual income within that cluster.
in_cluster_1 = data['cluster_id'] == 1
cluster_1 = data[in_cluster_1]
cluster_1_df = (
    cluster_1
    .groupby('Gender')
    .agg({'Spending Score (1-100)': 'mean', 'Annual Income (k$)': 'mean'})
    .reset_index()
)

In [None]:
# Cluster 1: spending-score distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_scores = cluster_1.loc[cluster_1['Gender'] == gender, 'Spending Score (1-100)']
    sns.distplot(gender_scores, color=colour, label=gender)
plt.legend()

In [None]:
# Cluster 1: annual-income distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_income = cluster_1.loc[cluster_1['Gender'] == gender, 'Annual Income (k$)']
    sns.distplot(gender_income, color=colour, label=gender)
plt.legend()

## Observations:
1. Spending and income in this group are both similar for males and females.
2. The average age is 25 years.
3. Income is lower than spending.
4. Male income and spending are slightly above those of females.

# 3rd Cluster

In [None]:
# Customers assigned to cluster 2, plus the per-gender mean spending score
# and mean annual income within that cluster.
in_cluster_2 = data['cluster_id'] == 2
cluster_2 = data[in_cluster_2]
cluster_2_df = (
    cluster_2
    .groupby('Gender')
    .agg({'Spending Score (1-100)': 'mean', 'Annual Income (k$)': 'mean'})
    .reset_index()
)

In [None]:
# Cluster 2: spending-score distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_scores = cluster_2.loc[cluster_2['Gender'] == gender, 'Spending Score (1-100)']
    sns.distplot(gender_scores, color=colour, label=gender)
plt.legend()

In [None]:
# Cluster 2: annual-income distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_income = cluster_2.loc[cluster_2['Gender'] == gender, 'Annual Income (k$)']
    sns.distplot(gender_income, color=colour, label=gender)
plt.legend()

## Observations:
1. The average age for this group is 33.
2. Annual income is high for this group, and so is the spending score.
3. The spending and income of males are slightly higher than those of females.
4. Average income is slightly higher than average spending.

# 4th Cluster

In [None]:
# Customers assigned to cluster 3, plus the per-gender mean spending score
# and mean annual income within that cluster.
in_cluster_3 = data['cluster_id'] == 3
cluster_3 = data[in_cluster_3]
cluster_3_df = (
    cluster_3
    .groupby('Gender')
    .agg({'Spending Score (1-100)': 'mean', 'Annual Income (k$)': 'mean'})
    .reset_index()
)

In [None]:
# Cluster 3: annual-income distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_income = cluster_3.loc[cluster_3['Gender'] == gender, 'Annual Income (k$)']
    sns.distplot(gender_income, color=colour, label=gender)
plt.legend()

In [None]:
# Cluster 3: spending-score distribution for males (blue) vs females (red).
for gender, colour in (('Male', 'b'), ('Female', 'r')):
    gender_scores = cluster_3.loc[cluster_3['Gender'] == gender, 'Spending Score (1-100)']
    sns.distplot(gender_scores, color=colour, label=gender)
plt.legend()

## Observations:
1. The average age of customers is 53.
2. The number of customers in this cluster is 65.
3. The average income and spending are medium.
4. Males' annual income is slightly higher, and their spending a bit lower, compared to females.