In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the 'data' sheet of the EastWestAirlines workbook directly from GitHub.
DATA_URL = 'https://github.com/rushikeshw791/Clustering-1/files/9793741/EastWestAirlines.xlsx'
data = pd.read_excel(DATA_URL, sheet_name='data')
data.head()

In [None]:
# Drop the '#' and '?' characters from two column names.
COLUMN_RENAMES = {'ID#': 'ID', 'Award?': 'Award'}
data = data.rename(columns=COLUMN_RENAMES)
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of rows flagged as duplicates of another row.
data.duplicated().sum()

In [None]:
# Feature frame used for clustering: every column except 'ID'.
data1 = data.drop(columns='ID')
data1.head()

In [None]:
# Correlation heatmap of the clustering features.
# data1 (the frame without 'ID') is used so that the row identifier does not
# show up as a spurious variable in the correlation matrix.
f, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(data1.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

**Data preprocessing**

1. Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
# Standardize the features: fit the scaler on data1, then transform the same rows.
standard_scaler = StandardScaler().fit(data1)
std_df = standard_scaler.transform(data1)
std_df.shape

In [None]:
# Dendrogram of the standardized features, single linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendo = sch.dendrogram(sch.linkage(std_df,method='single'))
plt.title('Dendrogram - standardized data, single linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the standardized features, complete linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(std_df,method='complete'))
plt.title('Dendrogram - standardized data, complete linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the standardized features, average linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(std_df,method='average'))
plt.title('Dendrogram - standardized data, average linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the standardized features, Ward linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(std_df,method='ward'))
plt.title('Dendrogram - standardized data, Ward linkage')
plt.ylabel('Merge distance')
plt.show()

2. Normalizing the data

In [None]:
# Rescale the same features with MinMaxScaler so the clustering results can be
# compared against the standardized version above.
minmax = MinMaxScaler().fit(data1)
minmax_df = minmax.transform(data1)
minmax_df.shape

(3999, 11)

In [None]:
# Dendrogram of the min-max normalized features, single linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(minmax_df,method='single'))
plt.title('Dendrogram - min-max normalized data, single linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the min-max normalized features, complete linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(minmax_df,method='complete'))
plt.title('Dendrogram - min-max normalized data, complete linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the min-max normalized features, average linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(minmax_df,method='average'))
plt.title('Dendrogram - min-max normalized data, average linkage')
plt.ylabel('Merge distance')
plt.show()

In [None]:
# Dendrogram of the min-max normalized features, Ward linkage.
# Title and axis label are set so this figure can be told apart from the
# other linkage/scaling variants plotted in this notebook.
plt.figure(figsize=(20,12))
dendogram = sch.dendrogram(sch.linkage(minmax_df,method='ward'))
plt.title('Dendrogram - min-max normalized data, Ward linkage')
plt.ylabel('Merge distance')
plt.show()

**Create clusters (y)**

In [None]:
# Agglomerative (hierarchical) clustering into 5 groups with Ward linkage.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`), so passing it fails on current releases. Ward linkage
# only supports Euclidean distance, which is already the default, so the
# argument is omitted; this works on both old and new scikit-learn versions.
hclusters = AgglomerativeClustering(n_clusters=5, linkage='ward')
hclusters

In [None]:
# Fit the model on the min-max scaled features and keep the per-row labels.
hclusters.fit(minmax_df)
y_hc = hclusters.labels_

In [None]:
# Show the cluster label assigned to each row.
y_hc

array([1, 1, 1, ..., 0, 4, 4])

In [None]:
# Attach each row's hierarchical-cluster label to the full dataset.
data = data.assign(h_clusterid=hclusters.labels_)

In [None]:
# Display the dataset, now including the 'h_clusterid' column.
data

In [None]:
# Number of rows in each hierarchical cluster.
# The labels of the already-fitted model are reused rather than calling
# fit_predict() again on the same data, which would redo the whole clustering.
y = pd.DataFrame(hclusters.labels_, columns=['clusterid'])
y['clusterid'].value_counts()

In [None]:
# Add the hierarchical-cluster label to the feature frame.
# NOTE(review): from here on data1 carries a label column, so re-running the
# scaler cells that read data1 would treat 'clusterid' as a feature.
data1 = data1.assign(clusterid=hclusters.labels_)
data1

In [None]:
# Mean of every feature within each cluster; reset_index() turns the
# 'clusterid' group key back into a regular column.
data1.groupby('clusterid').agg(['mean']).reset_index()

In [None]:
# Scatter of 'Balance' against cluster id, with points colored by cluster label.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 7))
cluster_ax.scatter(data1['clusterid'], data1['Balance'], c=hclusters.labels_)

**K-means**

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [None]:
# Fresh copy of the workbook for the K-means section (same URL and sheet as the
# first load; the frames built from that first load have been modified since).
airline1= pd.read_excel('https://github.com/rushikeshw791/Clustering-1/files/9793741/EastWestAirlines.xlsx',sheet_name='data')
airline1.head()

In [None]:
# Min-max normalization function
def norm_func(i):
    """Rescale `i` as (i - min) / (max - min), using `i`'s own min() and max()."""
    lowest = i.min()
    span = i.max() - lowest
    return (i - lowest) / span

In [None]:
# Normalize every column after the first one (the first column is skipped by
# position). 'Clust' is dropped first when present: a later cell adds it to
# airline1, and without this guard re-running this cell would feed the cluster
# labels back in as a feature.
df_norm = norm_func(airline1.drop(columns='Clust', errors='ignore').iloc[:, 1:])

In [None]:
# Elbow curve: within-cluster sum of squares (inertia) for k = 1..10.
# - The scan previously stopped at k=4, so a bend at k=4 could not be seen on
#   the plot; the range is widened so the curve extends past the candidate.
# - random_state makes the curve reproducible between runs.
# - n_init is set explicitly because its default differs across scikit-learn
#   versions.
wcss = []
k_values = range(1, 11)
for i in k_values:
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(df_norm)
    wcss.append(kmeans.inertia_)
plt.plot(k_values, wcss, marker='o')
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# Final K-means model with 4 clusters (the value chosen from the elbow plot).
# random_state and n_init are fixed so the cluster labels are reproducible
# from one run to the next.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
model.fit(df_norm)
model.labels_

array([0, 0, 0, ..., 3, 0, 0], dtype=int32)

In [None]:
# Store each row's K-means label in a new 'Clust' column of airline1.
x = pd.Series(model.labels_)
airline1 = airline1.assign(Clust=x)

In [None]:
# Display the dataset, now including the 'Clust' column.
airline1

**DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fresh copy of the workbook ('data' sheet) for the DBSCAN section.
workbook_url = 'https://github.com/rushikeshw791/Clustering-1/files/9793741/EastWestAirlines.xlsx'
airline2 = pd.read_excel(workbook_url, sheet_name='data')
airline2.head()

In [None]:
# Print a summary of the frame's columns.
airline2.info()

In [None]:
# Features for DBSCAN: columns at positions 1 through 4 only.
# NOTE(review): the other sections cluster on every column after the first;
# confirm that restricting DBSCAN to four columns is intentional.
df=airline2.iloc[:,1:5]

In [None]:
# Show the selected features as a raw array.
df.values

In [None]:
# Standardize the selected features; stscaler keeps the fitted scaler.
stscaler = StandardScaler()
x = stscaler.fit_transform(df.values)

In [None]:
# Show the standardized feature array.
x

In [None]:
# Fit DBSCAN on the standardized features. eps=2 is therefore expressed in
# standardized units, not in the original column units.
dbscan=DBSCAN(eps=2,min_samples=5)
dbscan.fit(x)

In [None]:
# Cluster label assigned to each row by DBSCAN.
dbscan.labels_

In [None]:
# One-column frame holding the DBSCAN label of each row.
c1 = pd.DataFrame({'cluster': dbscan.labels_})

In [None]:
# Display the DBSCAN label frame.
c1

In [None]:
# Place the DBSCAN labels next to the original columns (aligned on the row index).
labelled_frames = [airline2, c1]
pd.concat(labelled_frames, axis='columns')