# Data Analysis and Operations Performed
* Exploratory Data Analysis
*  K-Means Clustering
*  Elbow Method Plot
*  PCA Dimensionality Reduction
*  TSNE Dimensionality Reduction
*  K-Nearest Neighbour (KNN)
*  Confusion Matrix
*  Classification Report

In [None]:
# Import Python Libraries

import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Use a larger default font size for every matplotlib figure in this notebook.
plt.rcParams['font.size']=14
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
# Load the mall-customers CSV into a DataFrame
DATASET_PATH = "../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
df = pd.read_csv(DATASET_PATH)

In [None]:
# Display the loaded DataFrame for a first look at the raw data.
df

# Exploring, Transforming and Visualizing Dataset

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Summary statistics for the three numeric customer attributes
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
df[numeric_columns].describe()

In [None]:
%matplotlib inline
import seaborn as sns

sns.set(font_scale=1.1)
sns.set_style('whitegrid')

grid = sns.pairplot(data=df, vars=df.columns[2:5])

After exploring the pair plot, some clusters can be seen forming when 'Annual Income (k$)' is plotted against 'Spending Score (1-100)'.

It is visible that approximately 5 Clusters are present.

In [None]:
# The notebook treats the dataset as free of null values, so no cleaning is done here.
# Encode the Gender column as a single 0/1 indicator so it can be used numerically.
#
# pd.get_dummies returns one indicator column per category; assigning that whole
# frame to the single 'Gender' column fails ("Columns must be same length as key"),
# so only the first indicator column is kept (1 = first category in sorted order,
# 0 = any other category).
# The dtype guard makes the cell safe to re-run: an already-encoded numeric column
# is left untouched instead of being re-encoded (which would flip the 0/1 values).
if not pd.api.types.is_numeric_dtype(df['Gender']):
    gender_indicators = pd.get_dummies(df['Gender'], dtype=int)
    df['Gender'] = gender_indicators.iloc[:, 0]
df.head()

# Scaling Dataset

In [None]:
# Standardise the customer attributes with StandardScaler for PCA / t-SNE.
# Only genuine features are scaled: the customer identifier and the `cluster`
# label (added by a later cell, and therefore present if this cell is re-run)
# are excluded so they cannot distort the projections.
# NOTE(review): assumes the identifier column is named 'CustomerID' — confirm
# against the CSV header; columns that do not exist are simply ignored.
non_feature_columns = ['CustomerID', 'cluster']
scaler = StandardScaler()
dataset_Scaled = scaler.fit_transform(
    df.drop(columns=non_feature_columns, errors='ignore')
)

# Fit the data in to KMeans Model

In [None]:
# Cluster customers on annual income and spending score only
cluster_features = df[['Annual Income (k$)','Spending Score (1-100)']]

# Five clusters, fixed seed
km = KMeans(n_clusters=5, random_state=10)
y_predicted = km.fit_predict(cluster_features)
y_predicted

In [None]:
# Attach each customer's K-Means cluster label as a new 'cluster' column
# (this mutates df in place), then display the labelled frame.
df['cluster']= y_predicted
df

Now we have labelled our dataset on the basis of cluster.

In [None]:
# Centroid Coordinate For Clusters
# One row per centroid of whichever model `km` currently refers to; the values
# follow the column order of the data that model was fitted on.

km.cluster_centers_

In [None]:
# One sub-frame per cluster label (0..4)
df1, df2, df3, df4, df5 = (df[df.cluster == label] for label in range(5))

# Draw each cluster in its own colour: spending score on x, annual income on y
cluster_colours = ('green', 'red', 'black', 'yellow', 'orange')
for cluster_frame, colour in zip((df1, df2, df3, df4, df5), cluster_colours):
    plt.scatter(cluster_frame['Spending Score (1-100)'],
                cluster_frame['Annual Income (k$)'],
                color=colour)

plt.xlabel('Spending Score (1-100)')
plt.ylabel('Annual Income (k$)')


### Calculating Elbow Plot -  Further Experimenting with the data

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record each model's inertia_.
sse = []
k_rng = range(1,20)
for k in k_rng:
    # Use a dedicated name so the fitted 5-cluster model stored in `km` is not
    # overwritten by the last model of this sweep, and fix the seed so the
    # curve is reproducible between runs.
    elbow_model = KMeans(n_clusters=k, random_state=10)
    elbow_model.fit(df[['Spending Score (1-100)','Annual Income (k$)']])
    sse.append(elbow_model.inertia_)

In [None]:
# Elbow curve: recorded inertia against the number of clusters
plt.ylabel('Sum of squared error')
plt.xlabel('K')
plt.plot(k_rng, sse)

According to the elbow rule for K-Means clustering, the optimal number of clusters is 5, which is the value we used above.

## Dimensionality reduction with Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

# Project the scaled data onto two principal components

pca_model = PCA(n_components=2, random_state=10)
reduced_data_pca = pca_model.fit_transform(dataset_Scaled)

In [None]:
# (rows, components) of the PCA projection
reduced_data_pca.shape

In [None]:
# Peek at projected rows at positions 1-4 (the slice starts at 1, so row 0 is not shown)
reduced_data_pca[1:5]

In [None]:
# Visualising Reduced Data

component_names = ['Component 1', 'Component 2']
# Two PCA components per row, plus the K-Means label of the same row for colouring
reduced_df = pd.DataFrame(reduced_data_pca, columns=component_names).assign(cluster=df.cluster)
plt.figure(figsize=(8,6))
axes = sns.scatterplot(x='Component 1', y='Component 2', hue='cluster',
                       data=reduced_df, legend='brief')

The clusters are not well defined and merge with one another.

## Dimensionality reduction with TSNE

In [None]:
from sklearn.manifold import TSNE

# Embed the scaled data in two dimensions with t-SNE (fixed seed)

tsne_model = TSNE(n_components=2, random_state=10)
reduced_data_tsne = tsne_model.fit_transform(dataset_Scaled)

In [None]:
# (rows, components) of the t-SNE embedding
reduced_data_tsne.shape

In [None]:
# Peek at embedded rows at positions 1-4 (the slice starts at 1, so row 0 is not shown)
reduced_data_tsne[1:5]

In [None]:
# Visualising Reduced Data

component_names = ['Component 1', 'Component 2']
# Two t-SNE components per row, plus the K-Means label of the same row for colouring
reduced_df = pd.DataFrame(reduced_data_tsne, columns=component_names).assign(cluster=df.cluster)
plt.figure(figsize=(8,6))
axes = sns.scatterplot(x='Component 1', y='Component 2', hue='cluster',
                       data=reduced_df, legend='brief')

# Supervised Learning for Labelled Dataset

In [None]:
# Display the DataFrame that the supervised-learning cells below will use.
df

In [None]:
# (number of rows, number of columns) of the DataFrame at this point
df.shape

In [None]:
# Target Column is 'cluster'
# Here, x is independent variable and y is dependent variable

# The customer identifier is excluded from the features: it is a row label, not
# a customer attribute, and KNN is distance-based, so leaving it in would let it
# influence which neighbours are selected.
# NOTE(review): assumes the identifier column is named 'CustomerID' — confirm
# against the CSV header; a column that does not exist is simply ignored.
x = df.drop(columns=['cluster', 'CustomerID'], errors='ignore').values
y = df['cluster'].values

In [None]:
# Shape of Sample and Target Set

for set_name, array in (('Sample', x), ('Target', y)):
    print(f'Shape of {set_name} Set = {array.shape}')

###  Split the data for training and testing

In [None]:
# Hold out 30% of the rows for testing; fixed seed for a repeatable split

from sklearn.model_selection import train_test_split

split_arrays = train_test_split(x, y, test_size=0.3, random_state=10)
X_train, X_test, y_train, y_test = split_arrays


### Explore the data and target values

In [None]:
# Shapes of the training and testing data

print(f'X_train.shape = {X_train.shape}')
# end='\n\n' leaves one blank line between the feature and target shapes
print(f'X_test.shape  = {X_test.shape}', end='\n\n')
print(f'y_train.shape = {y_train.shape}')
print(f'y_test.shape  = {y_test.shape}')

### Create and train the k-nearest neighbors model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier using 5 neighbours, fitted on the training split
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [None]:
# Predict the cluster of every held-out sample and keep the true labels alongside

expected = y_test
predicted = knn.predict(X_test)

print('First twenty predictions:')
for label, values in (('predicted', predicted), (' expected', expected)):
    print(f'{label}[:20] = {values[:20]}')

In [None]:
# Collect every (predicted, expected) pair where the two labels differ
wrong = []
for pred, exp in zip(predicted, expected):
    if pred != exp:
        wrong.append((pred, exp))

print('Wrong predictions:')
print(wrong)

In [None]:
# Model prediction accuracy on the held-out test split

test_accuracy = knn.score(X_test, y_test)
print(f'Prediction accuracy score = {test_accuracy:.2%}')

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the true test labels against the predicted labels
confusion = confusion_matrix(expected, predicted)

print('Confusion matrix:', confusion, sep='\n')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Label the rows/columns with the classes actually present in the test labels
# and predictions (the sorted label set confusion_matrix uses when no `labels`
# argument is given) instead of assuming all five clusters appear: a fixed
# range(5) raises a shape error whenever a cluster is absent from both arrays.
class_labels = np.unique(np.concatenate([expected, predicted]))
confusion_df = pd.DataFrame(confusion, index=class_labels, columns=class_labels)
axes = sns.heatmap(confusion_df, annot=True, cmap='nipy_spectral_r')

### Classification Report

In [None]:
from sklearn.metrics import classification_report

# Cluster labels to include in the report
names = [0,1,2,3,4]
# `labels` must be passed by keyword: current scikit-learn accepts only y_true
# and y_pred positionally and rejects a third positional argument.
print(classification_report(expected, predicted, labels=names))