<div class="alert alert-block alert-success">
    <h1 align="center">Machine Learning in Python</h1>
    <h3 align="center">Mini Project2 - Clustering</h3>
</div>

<img src = "https://miro.medium.com/max/5760/1*_SRbz8535-CxPzo6IL9pBA.png" width=80%>

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.utils

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN

## Load and Prepare Data

In [None]:
# Load the mall-customers CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset mount -
# confirm it resolves in the environment where this notebook is run.
market_data = pd.read_csv('../input/123qweasd/Mall_Customers.csv')

## EDA

In [None]:
# Column names, non-null counts and dtypes at a glance.
market_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
market_data.describe()

In [None]:
# Peek at the last five rows.
market_data.tail()

In [None]:
# Percentage share of each gender among the customers.
market_data['Gender'].value_counts(normalize=True).mul(100)

In [None]:
# Number of missing values per column.
market_data.isnull().sum()

## Data Preprocessing

In [None]:
# Encode Gender as a binary column: Male -> 1, Female -> 0.
# The guard makes the cell safe to re-run: calling .map() a second time on the
# already-encoded column would find no 'Male'/'Female' keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(market_data['Gender']):
    market_data['Gender'] = market_data['Gender'].map({'Male': 1, 'Female': 0})

In [None]:
# CustomerID is not used by any later cell, so drop it from the working frame.
# errors='ignore' keeps the cell re-runnable: without it a second run raises KeyError
# because the column is already gone.
market_data = market_data.drop(columns=['CustomerID'], errors='ignore')

In [None]:
# Check the frame after the Gender encoding and the CustomerID drop.
market_data.head()

In [None]:
# Feature matrix for clustering: annual income and spending score only.
# The columns are selected by name instead of by dropping 'Gender' and 'Age', so X cannot
# silently pick up an extra column (e.g. CustomerID if its drop cell was skipped).
# Column order matters: the cluster plots read column 0 as income and column 1 as spending score.
X = market_data[['Annual Income (k$)', 'Spending Score (1-100)']].to_numpy()
X

In [None]:
# Standardise the features: fit the scaler on X, then replace X with its scaled version.
sc = StandardScaler().fit(X)
X = sc.transform(X)
X

## Storytelling - Visualization

In [None]:
# Number of customers per gender.
# Legend (matches the encoding applied in the preprocessing section):
# 1 = Male
# 0 = Female
sns.catplot(data=market_data, x="Gender", kind="count")

In [None]:
# Kernel density estimate of customer age, one curve per gender.
# Legend (matches the encoding applied in the preprocessing section):
# 1 = Male
# 0 = Female
sns.displot(data=market_data, x="Age", hue="Gender", kind="kde")

In [None]:
# Point plot of annual income per age, split by gender; the wide aspect leaves room
# for one x-axis tick per distinct age.
sns.catplot(
    data=market_data,
    x="Age",
    y="Annual Income (k$)",
    hue="Gender",
    kind="point",
    height=5,
    aspect=4,
)

In [None]:
# Spending score against age, coloured by annual income.
sns.relplot(
    data=market_data,
    x="Age",
    y="Spending Score (1-100)",
    hue="Annual Income (k$)",
)

In [None]:
# Spending score against annual income; colour encodes age, marker style encodes gender.
sns.relplot(
    data=market_data,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue="Age",
    style="Gender",
)

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
plt.figure(figsize=(6, 6))
t_corr = market_data.corr()
sns.heatmap(t_corr, cmap='Greens', annot=True, annot_kws={'size': 12})

## Train the Model (Clustering)

# K-means

In [None]:
# Elbow method: fit K-means for k = 1..20 and record the Within-Cluster Sum of Squares (WCSS),
# i.e. the sum of squared distances between each point and the centroid of its cluster
# (read here from the fitted model's `inertia_`).
# n_init is pinned explicitly: its default changed from 10 to 'auto' in scikit-learn 1.4, and
# the FutureWarning announcing that is hidden by the notebook-wide warnings filter, so an
# unpinned call can give a different curve depending on the installed version.
wcss = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Plot WCSS against the number of clusters and look for the "elbow" in the curve.
fig, ax = plt.subplots()
ax.plot(range(1, 21), wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()
# 5 seems to be a good choice!

In [None]:
# Raw WCSS values behind the elbow curve (index 0 corresponds to k = 1).
wcss

In [None]:
# Final K-means model with k = 5; y_kmeans holds the cluster index assigned to each row of X.
# n_init is pinned explicitly: its default changed from 10 to 'auto' in scikit-learn 1.4, and
# the FutureWarning announcing that is hidden by the notebook-wide warnings filter, so an
# unpinned call can give different clusters depending on the installed version.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Sanity check: one centroid per cluster, one coordinate per feature of X.
kmeans.cluster_centers_.shape

In [None]:
# Cluster index assigned to each customer by K-means.
y_kmeans

In [None]:
# Scatter the customers coloured by their K-means cluster, then overlay the centroids.
# A single loop replaces five copy-pasted scatter calls; colours and legend labels are unchanged.
cluster_colors = ['red', 'blue', 'green', 'purple', 'pink']
for cluster_id, color in enumerate(cluster_colors):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='cyan', label='Centroids')
plt.title('Clusters of customers')
# X was standardised before clustering, so both axes are in z-score units,
# not in k$ or on the 1-100 scale.
plt.xlabel('Annual Income (standardized)')
plt.ylabel('Spending Score (standardized)')
plt.legend()
plt.show()

# DBSCAN

In [None]:
# Density-based clustering on the same feature matrix X, with neighbourhood radius
# eps=0.4 and min_samples=8.
db = DBSCAN(eps=0.4, min_samples=8)
db.fit(X)

In [None]:
# Cluster label per sample; the cells below treat the label -1 as noise.
labels = db.labels_

In [None]:
# Inspect the raw DBSCAN labels.
labels

In [None]:
# Count clusters and noise points; the label -1 marks noise and is not counted as a cluster.
noise_mask = labels == -1
n_clusters_ = len(np.unique(labels[~noise_mask]))
n_noise_ = int(noise_mask.sum())

In [None]:
# Boolean mask over all samples: True at the indices DBSCAN reports as core samples.
core_samples_mask = np.zeros(db.labels_.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

In [None]:
# Report how many clusters DBSCAN found (noise excluded).
print('Estimated number of clusters: %d' % n_clusters_)

In [None]:
# Report how many samples DBSCAN labelled as noise (-1).
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# Plot the DBSCAN result: each label gets a colour from the Spectral colormap, except
# noise (label -1), which is drawn in black. Core samples use large markers and the
# remaining (non-core) samples of the same label use small ones.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    face = (0, 0, 0, 1) if k == -1 else tuple(col)
    in_label = labels == k

    # Same drawing call twice: core samples first (size 14), then non-core samples (size 6).
    for core_filter, size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        pts = X[in_label & core_filter]
        plt.plot(pts[:, 0], pts[:, 1], 'o', markerfacecolor=face,
                 markeredgecolor='k', markersize=size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
# M.Hossein Hashemi