In [None]:
import pandas as pd
import numpy as np
# Load the semicolon-delimited bank dataset into a DataFrame and display it.
ds = pd.read_csv("bank.csv", delimiter=";")
ds

In [None]:
# Element-wise mask: True wherever a cell of `ds` holds a missing value.
ds.isna()

In [None]:
# Number of missing values in each column.
ds.isna().sum()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
ds.isna().any()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are integer-encoded in place; the target 'y' is encoded last.
categorical_columns = [
    'job', 'marital', 'education', 'contact', 'month',
    'poutcome', 'default', 'housing', 'loan', 'y',
]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard the class mapping of every column except the last one, making the
# integer codes impossible to decode with inverse_transform later.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    ds[column] = label_encoders[column].fit_transform(ds[column])

# `le` keeps its previous meaning: the encoder fitted on the target column 'y'.
le = label_encoders['y']
ds

In [None]:
# Feature matrix: every column of `ds` except the last one, as a NumPy array.
x = ds.iloc[:, :-1].to_numpy()

In [None]:
# Target vector: the last column of `ds`, as a NumPy array.
y = ds.iloc[:, -1].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize features using statistics learned from the training split only.
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
# Reuse the training mean/variance for the test split. Refitting on the test
# data would leak test statistics and put the two splits on different scales.
X_test = sc.transform(X_test)

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%config InlineBackend.figure_format='retina'

# Standardize the full feature matrix for PCA / clustering with a dedicated
# scaler, so the `sc` instance fitted on the training split is not overwritten.
x = StandardScaler().fit_transform(x)

In [None]:
import seaborn as sns 
%matplotlib inline

In [None]:
# Project the feature matrix onto its first two principal components and wrap
# the result in a DataFrame with descriptive column names.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Attach the target column 'y' next to the two principal components (aligned on index).
finalDf = pd.concat(objs=[principalDf, ds[['y']]], axis='columns')

In [None]:
# Scatter plot of the two principal components, one colour per value of 'y'.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = [0, 1]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    # Boolean mask selecting the rows whose target equals the current value.
    indicesToKeep = finalDf['y'] == target
    first_component = finalDf.loc[indicesToKeep, 'principal component 1']
    second_component = finalDf.loc[indicesToKeep, 'principal component 2']
    ax.scatter(first_component, second_component, c=color, s=50)

ax.legend(targets)
ax.grid()

In [None]:
# Same principal-component array wrapped in a DataFrame with default integer column labels.
PCA_components = pd.DataFrame(data=principalComponents)


In [None]:
# Elbow method: fit K-means for k = 1..16 and record the inertia of each fit.
ks = range(1, 17)
inertias = []
for k in ks:
    # Fixed seed and explicit n_init make the inertia curve reproducible and
    # consistent with the settings used for the final model.
    model = KMeans(n_clusters=k, n_init=10, random_state=0)

    # Fit on all principal components (the frame only has two columns).
    model.fit(PCA_components)

    # Inertia = within-cluster sum of squared distances for this k.
    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o', color='black')
plt.title('Elbow Method')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Final K-means model: two clusters, k-means++ seeding, seeded for repeatability.
Km = KMeans(
    n_clusters=2,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)

In [None]:
# Fit the model on the principal components and show the cluster label of each sample.
Km.fit(PCA_components).labels_

In [None]:
# Display the centroid coordinates of the fitted K-means model.
Km.cluster_centers_

In [None]:
# Plot every sample in PCA space and overlay the fitted K-means centroids.
# Centroids are read from the model instead of being hard-coded, so the plot
# stays correct whenever the data or the model changes.
centroids = Km.cluster_centers_

plt.figure(figsize=(10, 7))
plt.scatter(PCA_components[0], PCA_components[1], s=50, c='b')
# Cluster 0 is drawn in green, cluster 1 in red.
for (center_x, center_y), centroid_color in zip(centroids, ['g', 'r']):
    plt.scatter(center_x, center_y, s=100, c=centroid_color, marker='s')
plt.xlabel('INDEX 0')
plt.ylabel('INDEX 1')
plt.show()

In [None]:
# Display the cluster label assigned to each sample by the fitted K-means model.
Km.labels_

In [None]:
from kneed import DataGenerator, KneeLocator

# Locate the elbow of the inertia curve. Reuse `ks` so the x-values always
# match the cluster counts that `inertias` was computed for.
kl = KneeLocator(ks, inertias, curve="convex", direction="decreasing")

kl.elbow

In [None]:
import plotly.figure_factory as ff
import numpy as np

np.random.seed(1)

# Interactive dendrogram of the principal components, sized to 800x500.
fig = ff.create_dendrogram(PCA_components).update_layout(width=800, height=500)
fig.show()

In [None]:
# Draw a Ward-linkage dendrogram to help choose the number of clusters.
import scipy.cluster.hierarchy as sch

ward_linkage = sch.linkage(PCA_components, method='ward')
dend = sch.dendrogram(ward_linkage)
plt.title("Dendrogram")
plt.xlabel('Customer')
plt.ylabel('euclidean')
plt.show()

In [None]:
# Fit hierarchical (agglomerative) clustering with 4 clusters on the principal components.
from sklearn.cluster import AgglomerativeClustering

# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4.
# Euclidean distance is the default, and the only metric Ward linkage accepts,
# so omitting it works on both old and new versions.
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
hc.fit_predict(PCA_components)

In [None]:
# Plot the samples in PCA space, coloured by their hierarchical-cluster label
# so the figure actually shows the clusters named in its title.
plt.figure(figsize=(20, 17))
plt.scatter(PCA_components[0], PCA_components[1], s=200, c=hc.labels_, cmap='viridis')

plt.title('Clusters')
plt.xlabel('INDEX 0')
plt.ylabel('INDEX 1')
plt.show()

In [None]:
# Predict the K-means cluster of a single hand-made point in PCA space.
sample_test = np.array([-3.0, -3.0])
# The model expects a 2-D input of shape (n_samples, n_features), here (1, 2).
second_test = sample_test[np.newaxis, :]
Km.predict(second_test)

In [None]:
# ***** RESULT EXPLANATION *****

# We kept two principal components in the PCA step; the encoded values 0 and 1 of 'y' are our target.
# After applying K-means, the test data point is assigned to cluster index [0], which is the green centroid. The data points of cluster [0] lie close to those of cluster [1]. We used two clusters in K-means, matching the two components kept in the PCA step.
# After applying hierarchical clustering, we found 4 clusters by using the dendrogram.

# PCA represents all n data vectors as linear combinations of a small number of eigenvectors in order to minimize the mean-squared reconstruction error, whereas K-means represents all n data vectors by a small number of cluster centroids, i.e. as linear combinations of a small number of centroid vectors in which every weight is zero except for a single one.

# The K-means algorithm was the better option for finding clusters here, since it scales better to large data and has lower time complexity than hierarchical clustering.