In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the mall-customer CSV and swap the two headers that contain spaces and
# parentheses for the space-free names the later cells refer to.
column_renames = {
    'Annual Income (k$)': 'Annual_Income_in_1000$',
    'Spending Score (1-100)': 'Spending_Score',
}
df = pd.read_csv(
    '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
).rename(columns=column_renames)
# Preview the first ten rows.
df.head(10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column names, non-null counts, dtypes and memory footprint of the frame.
df.info()

In [None]:
# Scatter plot of Annual Income against Spending Score, coloured by Gender
fig, ax = plt.subplots(figsize=(15, 5))
sns.scatterplot(x='Annual_Income_in_1000$', y='Spending_Score', data=df, hue='Gender', ax=ax)
ax.set_title('Correlation between Annual Income and Spending Score for both Gender')
# Finish with plt.show() so the cell renders only the figure instead of also
# echoing the Text(...) object returned by set_title.
plt.show()

In [None]:
# Average Spending Score for Age Range
# Each edge is the lower bound of a range. right=False makes the bins
# left-closed / right-open ([18, 26), [26, 31), ...), so every age falls in the
# range its label advertises. With pd.cut's default right-closed bins an age of
# 26 was labelled '18-25 Years', 31 was labelled '26-30 Years', and so on.
bins = [18, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71]
labels = ['18-25 Years','26-30 Years','31-35 Years','36-40 Years','41-45 Years','46-50 Years','51-55 Years','56-60 Years','61-65 Years','66-70 Years']
df['Age_Range'] = pd.cut(df.Age, bins, labels=labels, right=False)
# Age_Range is categorical; observed=False states the current default explicitly
# (empty ranges stay in the result as NaN) and avoids the pandas FutureWarning.
agerange_spending_score_mean = (
    df.groupby('Age_Range', observed=False)
    .Spending_Score.mean()
    .sort_values(ascending=False)
)
agerange_spending_score_mean

In [None]:
# Plot Average Spending Score for Age range
fig, ax = plt.subplots(figsize=(15, 5))
# x and y must be passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises a TypeError for positional x/y.
sns.barplot(
    x=agerange_spending_score_mean.index,
    y=agerange_spending_score_mean.values,
    ax=ax,
)
ax.set_title('Average Spending Score for Each Age Range')
ax.set_xlabel('Age Range (Years)')
ax.set_ylabel('Spending Score')
plt.show()

In [None]:
# Number of customers in each age range
# observed=False spells out the current pandas default for categorical keys, so
# every age range is returned (empty ones with a count of 0). It also silences
# the FutureWarning about that default changing.
age_classified = df.groupby('Age_Range', observed=False)['CustomerID'].count()
age_classified

In [None]:
# Customers per age range, split by gender.
# Bars follow the value_counts() order, i.e. most populated range first.
age_order = df['Age_Range'].value_counts().index
plt.figure(figsize=(15, 7))
ax = sns.countplot(
    data=df,
    y='Age_Range',
    hue='Gender',
    order=age_order,
    palette='gnuplot',
)
plt.show()

In [None]:
# Age range distribution of Mall Customer
# plt.subplots returns (Figure, Axes); the first value is a Figure, not a pie.
fig, ax = plt.subplots(figsize=[10,6])
labels = age_classified.keys()
plt.pie(
    x=age_classified,
    autopct='%.1f%%',
    # One offset per wedge: derive the length from the data so the call does
    # not break if the number of age ranges changes.
    explode=[0.05] * len(age_classified),
    labels=labels,
    pctdistance=0.5,
)
plt.title('Mall Customer Age Distribution (years)')
plt.show()

In [None]:
# Number of customers per gender (non-null CustomerID values in each group).
gender_classified = df['CustomerID'].groupby(df['Gender']).count()
gender_classified

In [None]:
# Gender distribution of Mall Customer
# plt.subplots returns (Figure, Axes); the first value is a Figure, not a pie.
fig, ax = plt.subplots(figsize=[10,6])
labels = gender_classified.keys()
plt.pie(
    x=gender_classified,
    autopct='%.1f%%',
    # One offset per wedge, sized from the data rather than hard-coded to 2.
    explode=[0.02] * len(gender_classified),
    labels=labels,
    pctdistance=0.5,
)
# The earlier title carried a "(years)" suffix copied from the age chart.
plt.title('Mall Customer Gender Distribution')
plt.show()

In [None]:
# Annual Income classified Customer
# Each edge is the lower bound of a bracket. right=False makes the bins
# left-closed / right-open ([10, 31), [31, 51), ...), so every income falls in
# the bracket its label advertises. With pd.cut's default right-closed bins an
# income of 31 was labelled '10-30 (in 1000$)', 51 was labelled '31-50', etc.
# The last edge is infinity so that '> 130' really is open-ended instead of
# silently turning incomes above 150 into NaN.
bins = [10, 31, 51, 71, 91, 111, 131, np.inf]
labels = ['10-30 (in 1000$)','31-50 (in 1000$)','51-70 (in 1000$)','71-90 (in 1000$)','91-110 (in 1000$)','111-130 (in 1000$)','> 130 (in 1000$)']
df['Income Classified'] = pd.cut(df['Annual_Income_in_1000$'], bins, labels=labels, right=False)
df.head()

In [None]:
# Distribution of Mall Customer based on Income Classified
# observed=False spells out the current pandas default for categorical keys, so
# every income bracket is returned (empty ones with a count of 0). It also
# silences the FutureWarning about that default changing.
income_classified = df.groupby('Income Classified', observed=False)['CustomerID'].count()
income_classified

In [None]:
# Customer Annual Income distribution
# plt.subplots returns (Figure, Axes); the first value is a Figure, not a pie.
fig, ax = plt.subplots(figsize=[7,7])
labels = income_classified.keys()
plt.pie(
    x=income_classified,
    autopct='%.1f%%',
    # One offset per wedge, sized from the data rather than hard-coded to 7.
    explode=[0.02] * len(income_classified),
    labels=labels,
    pctdistance=0.7,
)
plt.title('Customer Annual Income Distribution (in 1000$)')
plt.show()

In [None]:
# Boxplotting data
# Distribution of customer age.
fig = px.box(df, y = 'Age')
# The plotted column is Age; the earlier title said "Gender".
fig.update_layout(title='Age of Mall Customer')
fig.show()

In [None]:
# Box plot of the annual-income column.
# update_layout returns the figure it was called on, so the calls can be chained.
fig = px.box(df, y='Annual_Income_in_1000$').update_layout(
    title='Annual Income of Mall Customer'
)
fig.show()

In [None]:
# Box plot of the spending-score column.
# update_layout returns the figure it was called on, so the calls can be chained.
fig = px.box(df, y='Spending_Score').update_layout(
    title='Spending Score of Mall Customer'
)
fig.show()

In [None]:
# Re-inspect the frame now that the Age_Range and Income Classified columns have been added.
df.head()

In [None]:
# Total Spending Score per Age range grouped by Annual Income
# `path` sets the hierarchy: income bracket at the outer level, age range nested
# inside it. Tiles are sized by the Spending_Score column.
fig = px.treemap(
    df,
    path=['Income Classified', 'Age_Range'],
    names='Age_Range',
    values='Spending_Score',
    color_discrete_sequence=px.colors.qualitative.Set1,
    title='Total Spending Score per Age range grouped by Annual Income',
)
fig.show()

In [None]:
# Sum of Spending_Score for each gender.
gender_spender = df['Spending_Score'].groupby(df['Gender']).sum()
gender_spender

In [None]:
# Total Spending Score per gender
fig, ax = plt.subplots(figsize=(7, 5))
# x and y must be passed by keyword: seaborn >= 0.12 only accepts `data`
# positionally and raises a TypeError for positional x/y.
sns.barplot(x=gender_spender.index, y=gender_spender.values, ax=ax)
ax.set_title('Which spend the most in Mall ? Male or Female')
ax.set_ylabel('Total Spending Score')
plt.show()