In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only '../input/' directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using 'Save & Run All' 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing necessary Libraries

In [None]:
# Import Necessary Libraries
import statistics
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from pandas import plotting

In [None]:
# Generic function to plot labels along with percentage in pie-charts
def plot_pie(pct, allvals):
    """Build a pie-slice label showing the percentage and the absolute count.

    Parameters
    ----------
    pct : float
        Share of the slice, in percent (0-100).
    allvals : sequence of numbers
        All values drawn in the pie; their sum is the total that ``pct``
        refers to.

    Returns
    -------
    str
        Two-line label of the form ``'<pct>%\\n(<count>)'``.
    """
    # Round instead of truncating: the float product is often a hair below the
    # true integer (29 / 100 * 100 evaluates to 28.999999999999996), and int()
    # alone would then report a count that is one too low.
    absolute = int(round(pct / 100. * np.sum(allvals)))
    return '{:.0f}%\n({:d})'.format(pct, absolute)

# Hex colour palette shared by the pie charts (passed as `colors=` to plt.pie)
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99', '#3eb489']

def autopct_format(values):
    """Return an ``autopct`` callback that labels each pie slice with its count.

    ``values`` are the numbers drawn in the pie. The returned function takes a
    slice percentage and converts it back to the nearest whole count, rendered
    as a decimal-integer string.
    """
    def my_format(pct):
        # Percentage -> absolute count, rounded to the nearest integer.
        count = int(round(pct * sum(values) / 100.0))
        return format(count, 'd')
    return my_format

### Reading the dataset

In [None]:
# Load the mall customer records from the Kaggle input directory and preview
# the first rows.
df = pd.read_csv(
    '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
)
df.head()

### Identify rows and columns in our dataset

In [None]:
# df.shape is (rows, columns); unpack it straight into the message.
print('Our dataset has {0} rows and {1} columns'.format(*df.shape))

# **Data Cleaning**

### Identify if there are any NA values present

In [None]:
# Number of missing values in each column
df.isna().sum()

### Drop unique identifier columns

In [None]:
# Drop the CustomerID column: it is a unique row identifier and carries no
# information to analyse.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run; the in-place drop raised KeyError the second time
# because the column was already gone.
df = df.drop(columns=['CustomerID'], errors='ignore')
df.head()

### Check if we have skewness in our data. If yes, then whether the skewness is based on data distribution or outliers

In [None]:
# Skewness is only defined for numeric columns. 'Gender' still holds text at
# this point, so restrict the computation explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises instead.
df.skew(numeric_only=True)

# **Descriptive Statistics**

### Understanding our customer's age

In [None]:
# Import the stats subpackage explicitly instead of relying on a plain
# `import scipy` (or on another library having loaded scipy.stats already).
import scipy.stats

# Summary statistics for customer age (ddof=1 -> sample variance,
# bias=False -> bias-corrected skewness and kurtosis).
scipy.stats.describe(df['Age'], ddof=1, bias=False)

As per the Age summary, the youngest customer entering the mall is 18 and the oldest is 70. This shows that the mall's visitors are not limited to a particular age range. In fact, customers of all ages, from youngsters to elderly people, come in on a daily basis.

### Summarizing our customer's data

In [None]:
# Summary statistics of the dataset's columns
df.describe()

- **Age**: The summary shows that the average age of a customer is 39. Most visitors are aged between 28 and 49. Fewer teenagers and elderly people visit the mall.
- **Income**: The average income of a customer is 60k dollars. More customers have annual incomes ranging between 15-41k dollars, whereas very few customers have annual incomes above 80k dollars.
- **Spending Score**: Customers show widely varied spending scores. This might be based on their shopping habits along with their monthly incomes. A roughly similar number of customers have spending scores below 50 and above 50.

### Gender-wise customers
This has to be an obvious statement, that women shop more than men. We will further learn more about their shopping patterns in detail.

In [None]:
# Count customers per gender once and reuse the result for both the wedge sizes
# and the labels. The labels are derived from the counted categories, so each
# label always sits on the wedge it describes regardless of which gender is
# more frequent (hard-coded labels silently swap if the order changes).
gender_counts = df['Gender'].value_counts()
gender_display_names = {'Female': 'Women', 'Male': 'Men'}

plt.figure(figsize=(7, 7))
plt.pie(
    x=gender_counts.tolist(),
    # Fall back to the raw category value for anything not in the mapping.
    labels=[gender_display_names.get(gender, gender) for gender in gender_counts.index],
    autopct=autopct_format(gender_counts.tolist()),
    colors=colors
)
plt.title('Gender wise count of customers')
plt.show()

### Identify whether Spending score depends on Annual Income of our Customer

In [None]:
# Does the spending score move together with annual income?
import scipy.stats

# Pull out the two series once; they feed the plot and both statistics below.
income = df['Annual Income (k$)']
spending = df['Spending Score (1-100)']

plt.figure(figsize=(20, 5))
sns.scatterplot(x=income, y=spending)
plt.title('Understanding corelation between Annual Income and Spending Score')
plt.show()

# Correlation coefficient (pandas default method) followed by a linear regression fit
print('Correlation between Annual Income & Spending Score: ' + str(income.corr(spending)))
print('\n')
print(scipy.stats.linregress(income, spending))

print('\nThere is almost negligible correlation between Annual Income and Spending Scores')

# **Inferential Statistics**

### Hypothesis Testing on spending habits of customers between different age groups
To do this Hypothesis Testing, we will be using **One Way F-test(Anova)**

**Aim:** Identify if spending habits of people between different age groups are similar
<br>
**H0:** Spending Habits are similar
<br>
**H1:** Spending Habits are not similar

In [None]:
# One Way F-test (ANOVA)
# Aim : Identify if spending habits of people between different age groups are similar
# H0: Spending Habits are similar
# H1: Spending Habits are not similar

from scipy import stats

# Create 3 non-overlapping age groups: up to 30, 31-40, and over 40.
# The first group must stop at 30; with `<= 40` it also contained every member
# of the 30-40 group, so the same customers were counted in two samples.
age_18_30 = df[df['Age'] <= 30]
age_30_40 = df[(df['Age'] > 30) & (df['Age'] <= 40)]
age_40plus = df[df['Age'] > 40]

# Keep only the columns relevant to the test
age_18_30 = age_18_30[['Age', 'Spending Score (1-100)']]
age_30_40 = age_30_40[['Age', 'Spending Score (1-100)']]
age_40plus = age_40plus[['Age', 'Spending Score (1-100)']]

# Compare the spending scores of the three groups
F, p = stats.f_oneway(age_18_30['Spending Score (1-100)'], age_30_40['Spending Score (1-100)'], age_40plus['Spending Score (1-100)'])

print('F-Score: ' + str(F))
print('P-value: ' + str(p))


# Decide at the 5% significance level
if p < 0.05:
    print('Reject Null hypothesis. Spending habits are not similar')
else:
    print('Accept Null hypothesis. Spending habits are similar')

As per the hypothesis testing, we can infer that customers in the different age groups [18-30, 30-40, 40+] do not have similar spending habits.
Every age group shops based on its own requirements and needs.

### Hypothesis Testing on spending habits of customers to identify whether men have similar spending habits as women do.
To do this Hypothesis Testing, we will be using **One Way F-test(Anova)**

**Aim:** Identify if spending habits of Men are similar to women
<br>
**H0:** Spending Habits of Men & Women are similar
<br>
**H1:** Spending Habits of Men & Women are not similar

In [None]:
# One Way F-test (ANOVA) on spending score, split by gender
# Aim : Identify if spending habits of Men are similar to women
# H0: Spending Habits of Men & Women are similar
# H1: Spending Habits of Men & Women are not similar

from scipy import stats

# Select the spending scores of each gender with a single .loc lookup
men_df = df.loc[df['Gender'] == 'Male', 'Spending Score (1-100)']
women_df = df.loc[df['Gender'] == 'Female', 'Spending Score (1-100)']

F, p = stats.f_oneway(men_df, women_df)

print('F-Score: ' + str(F))
print('P-value: ' + str(p))

# Decide at the 5% significance level
verdict = (
    'Reject Null hypothesis. Spending habits are not similar'
    if p < 0.05
    else 'Accept Null hypothesis. Spending habits are similar'
)
print(verdict)

Surprisingly, men and women seem to have similar spending habits. One thing to note here is that this hypothesis test says nothing about what they shop for or how they shop; it only indicates that both groups have similar spending scores, i.e. that they spend a similar amount while shopping.

# **Exploratory Data Analysis**

In [None]:
# Bar chart of annual income against customer age
plt.figure(figsize=(18, 5))
sns.barplot(data=df, x='Age', y='Annual Income (k$)', palette='rainbow')
plt.title('Age-wise Annual Income')
plt.show()

**Exploring age-wise annual customer incomes**-
At a high level, people between ages 28-44 have higher salaries. Surprisingly, it is observed that 18-year-olds have exactly the same income as 70-year-olds. It can also be seen that the people with the highest salaries are 32 and 42 years of age.

In [None]:
# Bar chart of spending score against annual income
plt.figure(figsize=(30, 8))
sns.barplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', palette='viridis')
plt.title('Income-wise spending scores')
plt.show()

**Exploring income-wise spending scores** - People with an annual income of 69k dollars have the highest spending scores, whereas people with 26k and 30k dollars have the lowest spending scores. It is also observed that, although many people have higher annual incomes, very few of them show higher spending scores.

In [None]:
# Boxen plot of the spending-score distribution for each gender
plt.figure(figsize=(10, 6))
sns.boxenplot(data=df, x='Gender', y='Spending Score (1-100)', palette='Set2')
plt.title('Gender-wise spending score')
plt.grid(False)
plt.show()

**Exploring Gender-wise spending scores** - It is obvious from this plot that men have lower spending scores as compared to women. One very important thing to note here is that their median spending scores are similar, and this is why our hypothesis testing indicated that the majority of customers, when segregated by gender, have similar spending habits. We have purposefully used boxen plots instead of box plots, just to understand their spending habits in more detail.

In [None]:
# How many customers share each spending score
plt.figure(figsize=(20, 5))
sns.countplot(data=df, x='Spending Score (1-100)')
plt.title('Count of people with spending score')
plt.show()

**Exploring how many people have each spending score** - The most common spending scores are 42, 46, 55 and 73. One important thing to note here is that a larger number of customers have spending scores in the 35-59 range.

### Comparing top customer visits (age-wise) vs. least customer visits (age-wise) in the Mall

In [None]:
def _age_visit_table(age_counts):
    """Turn a slice of age frequencies into a frame with columns Age / count_of_ppl."""
    table = pd.DataFrame(age_counts).rename_axis('age').reset_index()
    table.columns = ['Age', 'count_of_ppl']
    return table


# Frequency of every age; the first rows are the most frequent ages and the
# last rows the least frequent ones.
age_frequency = df['Age'].value_counts()
top_ten_most_age_visits = _age_visit_table(age_frequency.head(10))
least_ten_most_age_visits = _age_visit_table(age_frequency.tail(10))

# Two side-by-side bar charts with the same y-range so they can be compared directly
fig, axs = plt.subplots(ncols=2, figsize=(15, 5))

ax1 = sns.barplot(data=top_ten_most_age_visits, x='Age', y='count_of_ppl', palette='rainbow', ax=axs[0])
ax1.set(title='Highest customer visits (age-wise)', ylim=(0, 12))

ax2 = sns.barplot(data=least_ten_most_age_visits, x='Age', y='count_of_ppl', palette='rainbow', ax=axs[1])
ax2.set(title='Lowest customer visits (age-wise)', ylim=(0, 12))
plt.show()

It is seen that elderly people visit the mall the least, whereas middle-aged people visit most often.

# **Data Preprocessing before modelling**

In [None]:
# Label-encode the 'Gender' column with LabelEncoder (integer labels, not one-hot columns)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# pop() removes the text column from df and hands it back in one step
gender_text = df.pop('Gender')
df['Gender'] = label_encoder.fit_transform(gender_text)
# This encodes Men as '1' and Women as '0' labels

# Put the columns back into a fixed order
df = df.loc[:, ['Age', 'Gender', 'Annual Income (k$)', 'Spending Score (1-100)']]
df.head()

# **K-means clustering Model Development**

### Create an elbow plot to identify the fittest number of clusters

In [None]:
from sklearn.cluster import KMeans

num_of_clusters = []
score_per_cluster = []

# For each value of K, record the magnitude of the K-means score on the data
for i in range(1, 15):  # range() excludes its end value, so K runs from 1 to 14
    # Fit exactly once per K. The previous version called fit() twice per
    # iteration, doubling the runtime; with a fixed random_state the second
    # fit only reproduced the first.
    kmeans_model = KMeans(n_clusters = i, init = 'k-means++', random_state = 10).fit(df)
    temp_score = kmeans_model.score(df)
    print('KMeans Score for ' + str(i) + ' clusters is: ' + str(np.absolute(temp_score)))

    num_of_clusters.append(i)
    score_per_cluster.append(np.absolute(temp_score))


# Plot our estimations using MatplotLib
plt.figure(figsize=(10,10))  # Create the figure before drawing so the size applies to this plot

plt.plot(num_of_clusters, score_per_cluster, color='coral', marker='*', linewidth=2.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Square of Distance Per Cluster')
plt.title('Elbow plot on SNS data')


plt.grid(True)  # Add a Grid in Graph
plt.show()

**'5'** seems to be the ideal number of clusters, since beyond that point the within-cluster sum of squared distances decreases only gradually

### Predict clusters

In [None]:
# Fit the final 5-cluster model and keep the cluster assigned to every row
km = KMeans(n_clusters=5, init='k-means++', random_state=0)
labels = km.fit(df).labels_

labels  # Index of the cluster each sample belongs to

In [None]:
# Verify which distinct cluster ids the model returned, and how many there are
cluster_ids = np.unique(labels)
print(cluster_ids)
print('{0} number of clusters'.format(len(cluster_ids)))

# **Plot clusters to identify target customers**

In [None]:
# Plot clusters:
unique_labels = np.unique(labels)
print(unique_labels)

# NOTE(review): the names below are matched to cluster ids purely by position
# (id 0 -> first name, id 1 -> second name, ...). K-means cluster ids are
# arbitrary, so re-check this mapping against the plot whenever the data,
# random_state or library version changes.
cluster_names = ['Bargain Hunters', 'Target Customers', 'Average Customers', 'Frugal Customers', 'Big Spenders']

fig = plt.figure(figsize=(20, 10))
plt.title('Plotting our Clusters')
fig.patch.set_alpha(0.0)

for i in unique_labels:
    # Select the rows of this cluster once and reuse them for both axes
    members = df.loc[labels == i]
    plt.scatter(
        x=members['Annual Income (k$)'].values.tolist(),
        y=members['Spending Score (1-100)'].values.tolist(),
        # One uniform marker area. `sizes=(20, 200)` is a seaborn-style range;
        # on plt.scatter it made marker areas alternate between 20 and 200 by
        # row order, which encodes nothing.
        s=100
    )


plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.grid(True)
plt.legend(cluster_names)
plt.show()

- **'Bargain Hunters'** are those customers, who have minimal annual income, and hence they always try to capture the best bargains possible and hence, their spending scores are very low
<br>
- **'Target Customers'** are those customers, who we need to target, since they have better annual incomes, also they show higher spending scores. These are the customers who are willing to take a leap and shop at their will, considering the mall varieties fulfill their expectations. 
<br>
- **'Average Customers'** are the ones, who have average annual incomes. Also they spend their incomes on not more than what they wish for. They only seem to shop for the needful basics and nothing more.
<br>
- **'Frugal Customers'** are the ones who have the highest incomes, yet spend very little on shopping. Either they tend to live a very simple life and only shop for the bare essentials, or they are very economical with the money they spend on buying new things.
- **'Big Spenders'**! This is another segment of customers which can be targeted apart from our ideal target customers. These are the people who have lower annual incomes, yet tend to spend most of it on shopping for new things.