In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Location of the Mall Customers CSV, relative to the notebook's working directory.
mall_customers_csv = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
dataset = pd.read_csv(mall_customers_csv)

In [None]:
# Preview the first five rows to check the columns and their values.
dataset.head(5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
dataset.describe(include='all')

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# Count of customers per gender, drawn as a single-facet categorical plot.
gender_grid = sns.catplot(data=dataset, x='Gender', kind='count', height=6, aspect=1)
gender_ax = gender_grid.axes.flat[0]
gender_ax.set_title('Gender Distribution', size=20)
# Both axis labels are blanked; the title and the tick labels carry the meaning.
gender_ax.set_xlabel("")
gender_ax.set_ylabel("")
plt.show()

In [None]:
# Histogram of customer ages.
age_ax = dataset['Age'].plot(kind='hist', figsize=(10, 6))
age_ax.set_title('Age Distribution', size=20)
age_ax.set_xlabel('Age', size=15)

In [None]:
# Histogram of annual income (in thousands of dollars).
income_ax = dataset['Annual Income (k$)'].plot(kind='hist', figsize=(10, 6))
income_ax.set_title('Income Distribution', size=20)
income_ax.set_xlabel('Annual Income (k$)', size=15)

In [None]:
# Histogram of the spending score.
spending_hist_ax = dataset['Spending Score (1-100)'].plot(kind='hist', figsize=(10, 6))
spending_hist_ax.set_title('Spending Score Distribution', size=20)
spending_hist_ax.set_xlabel('Spending Score (1-100)', size=15)

In [None]:
# Bar plot of the spending score per gender (bar kind, so each bar is an aggregate).
spending_grid = sns.catplot(
    data=dataset,
    x='Gender',
    y='Spending Score (1-100)',
    kind='bar',
    height=6,
    aspect=1,
)
spending_ax = spending_grid.axes.flat[0]
spending_ax.set_title('Gender vs. Spending Score', size=20)
spending_ax.set_xlabel('')
spending_ax.set_ylabel('Avg. Spending Score', size=15)

In [None]:
# Set the default figure size through seaborn's rc parameters, then plot.
sns.set(rc={'figure.figsize': (10, 6)})
age_spend_ax = sns.scatterplot(data=dataset, x='Age', y='Spending Score (1-100)')
age_spend_ax.set_title('Age vs. Spending Score', size=20)
age_spend_ax.set_xlabel('Age', size=15)
age_spend_ax.set_ylabel('Spending Score (1-100)', size=15)

In [None]:
# Set the default figure size through seaborn's rc parameters, then plot.
sns.set(rc={'figure.figsize': (10, 6)})
income_spend_ax = sns.scatterplot(
    data=dataset,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
)
income_spend_ax.set_title('Annual Income (k$) vs. Spending Score', size=20)
income_spend_ax.set_xlabel('Annual Income (k$)', size=15)
income_spend_ax.set_ylabel('Spending Score (1-100)', size=15)

###### Looking at the last two graphs, we can see that spending scores differ with age and annual income.

## Clustering 

In [None]:
# Build the clustering feature matrix from every column except the ID.
# Assumes the remaining columns are ordered Gender, Age, Annual Income,
# Spending Score — confirm against dataset.head().
features = dataset.iloc[:, 1:]

# Perform feature scaling and recode the Gender column in one transformer.
# Writing the scaled floats back into the integer columns of a slice of
# `dataset` with `.iloc` raises SettingWithCopyWarning and is an incompatible-
# dtype assignment that newer pandas versions warn about or reject, so the
# frame is no longer mutated. Output column order is unchanged: the one-hot
# Gender columns first, then the scaled numeric columns.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
        ('scaler', StandardScaler(), slice(1, None)),
    ],
    sparse_threshold=0,  # always return a dense array
)
X = np.asarray(ct.fit_transform(features))

# Keep the fitted scaler reachable under its original name.
sc = ct.named_transformers_['scaler']

In [None]:
# Peek at the first five rows of the prepared feature matrix.
first_rows = X[:5]
print(first_rows)

### K-Means Clustering

#### In order to find the optimal number of clusters I will use the elbow method

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (WCSS) of each fit.
# random_state and n_init are pinned so the curve is identical on every run and
# does not depend on the scikit-learn version's n_init default.
cluster_counts = list(range(1, 11))
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Markers and integer ticks make the bend in the curve easier to locate.
plt.plot(cluster_counts, wcss, marker='o')
plt.xticks(cluster_counts)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

###### It seems that the optimal number of clusters is 6

In [None]:
# Final model with the number of clusters read off the elbow plot.
# Seeded (and n_init pinned) so the cluster labels are the same on every run.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Attach each customer's cluster label to a copy, leaving `dataset` untouched.
kmeans_dataset = dataset.assign(Clusters=y_kmeans)

In [None]:
# Preview the first five customers together with their assigned cluster.
kmeans_dataset.head(5)

### PCA

In [None]:
# Predictors are every column except the ID (first) and the last one; the last
# column is kept aside as y.
# Assumes the column order is ID, Gender, Age, Annual Income, Spending Score —
# confirm against dataset.head().
predictors = dataset.iloc[:, 1:-1]
y = dataset.iloc[:, -1]

# Perform feature scaling and recode the Gender column in one transformer.
# Writing the scaled floats back into the integer columns of a slice of
# `dataset` with `.iloc` raises SettingWithCopyWarning and is an incompatible-
# dtype assignment that newer pandas versions warn about or reject, so the
# frame is no longer mutated. Output column order is unchanged: the one-hot
# Gender columns first, then the scaled numeric columns.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
        ('scaler', StandardScaler(), slice(1, None)),
    ],
    sparse_threshold=0,  # always return a dense array
)
X = np.asarray(ct.fit_transform(predictors))

# Keep the fitted scaler reachable under its original name.
sc = ct.named_transformers_['scaler']

In [None]:
# Project the prepared predictors onto a single principal component.
# Note: X is overwritten, so re-running this cell alone would apply PCA to the
# already-reduced data; re-run the preprocessing cell first.
pca = PCA(n_components=1)
X = pca.fit_transform(X)
print(X[:5])

# Share of the predictors' variance that the single component retains.
print('Explained variance ratio:', pca.explained_variance_ratio_)

In [None]:
# Pair the held-out spending score with the single PCA component, row by row.
pca_columns = {
    'Spending_Score': y,
    'PCA_Var': X[:, 0],
}
pca_dataframe = pd.DataFrame(data=pca_columns)

In [None]:
# Preview the first five rows of the two-column PCA frame.
pca_dataframe.head(5)

In [None]:
# Select the two clustering inputs by name. Taking every column would also pull
# in 'Cluster' if this cell were re-run after the labels are added to the frame.
# NOTE(review): Spending_Score is on its raw scale while PCA_Var is derived from
# standardized features, so K-Means distances are likely dominated by the
# spending score — consider scaling both columns before clustering.
X = pca_dataframe[['Spending_Score', 'PCA_Var']].values

In [None]:
# Elbow method on the PCA-based inputs: fit K-Means for k = 1..10 and record
# the within-cluster sum of squares (WCSS) of each fit.
# random_state and n_init are pinned so the curve is identical on every run and
# does not depend on the scikit-learn version's n_init default.
cluster_counts = list(range(1, 11))
wcss = []
for i in cluster_counts:
    kmeans_pca = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans_pca.fit(X)
    wcss.append(kmeans_pca.inertia_)

# Markers and integer ticks make the bend in the curve easier to locate.
plt.plot(cluster_counts, wcss, marker='o')
plt.xticks(cluster_counts)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Final model on the PCA-based inputs with the chosen number of clusters.
# Seeded (and n_init pinned) so the cluster labels are the same on every run.
kmeans_pca = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_kmeans_pca = kmeans_pca.fit_predict(X)

In [None]:
# Record the cluster label of every row alongside the two clustering inputs.
pca_dataframe = pca_dataframe.assign(Cluster=y_kmeans_pca)

In [None]:
# Scatter of the PCA component against the spending score, coloured by cluster.
sns.set(rc={'figure.figsize': (10, 6)})
cluster_ax = sns.scatterplot(
    data=pca_dataframe,
    x='PCA_Var',
    y='Spending_Score',
    hue='Cluster',
    palette="deep",
)
# Title and sized axis labels, matching the other figures in the notebook.
cluster_ax.set_title('PCA Component vs. Spending Score by Cluster', size=20)
cluster_ax.set_xlabel('PCA_Var', size=15)
cluster_ax.set_ylabel('Spending_Score', size=15)
plt.show()

###### We can see that after applying PCA the number of clusters was reduced to only 3, and there appears to be a clear difference between the three clusters in their spending scores. However, the model probably won't be able to discriminate well between customers based on the dimension resulting from applying PCA.