In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

In [2]:
# Load the dataset
df = pd.read_csv('/content/EastWestAirlines.csv')

# **Data Preprocessing**

1.	Preprocess the dataset to handle missing values, remove outliers, and scale the features if necessary.

In [3]:
df

Unnamed: 0,ID#,Balance,Qual_miles,cc1_miles,cc2_miles,cc3_miles,Bonus_miles,Bonus_trans,Flight_miles_12mo,Flight_trans_12,Days_since_enroll,Award?
0,1,28143,0,1,1,1,174,1,0,0,7000,0
1,2,19244,0,1,1,1,215,2,0,0,6968,0
2,3,41354,0,1,1,1,4123,4,0,0,7034,0
3,4,14776,0,1,1,1,500,1,0,0,6952,0
4,5,97752,0,4,1,1,43300,26,2077,4,6935,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3994,4017,18476,0,1,1,1,8525,4,200,1,1403,1
3995,4018,64385,0,1,1,1,981,5,0,0,1395,1
3996,4019,73597,0,3,1,1,25447,8,0,0,1402,1
3997,4020,54899,0,1,1,1,500,1,500,1,1401,0


In [4]:
df.describe()

Unnamed: 0,ID#,Balance,Qual_miles,cc1_miles,cc2_miles,cc3_miles,Bonus_miles,Bonus_trans,Flight_miles_12mo,Flight_trans_12,Days_since_enroll,Award?
count,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0,3999.0
mean,2014.819455,73601.33,144.114529,2.059515,1.014504,1.012253,17144.846212,11.6019,460.055764,1.373593,4118.55939,0.370343
std,1160.764358,100775.7,773.663804,1.376919,0.14765,0.195241,24150.967826,9.60381,1400.209171,3.793172,2065.13454,0.482957
min,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0
25%,1010.5,18527.5,0.0,1.0,1.0,1.0,1250.0,3.0,0.0,0.0,2330.0,0.0
50%,2016.0,43097.0,0.0,1.0,1.0,1.0,7171.0,12.0,0.0,0.0,4096.0,0.0
75%,3020.5,92404.0,0.0,3.0,1.0,1.0,23800.5,17.0,311.0,1.0,5790.5,1.0
max,4021.0,1704838.0,11148.0,5.0,3.0,5.0,263685.0,86.0,30817.0,53.0,8296.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   ID#                3999 non-null   int64
 1   Balance            3999 non-null   int64
 2   Qual_miles         3999 non-null   int64
 3   cc1_miles          3999 non-null   int64
 4   cc2_miles          3999 non-null   int64
 5   cc3_miles          3999 non-null   int64
 6   Bonus_miles        3999 non-null   int64
 7   Bonus_trans        3999 non-null   int64
 8   Flight_miles_12mo  3999 non-null   int64
 9   Flight_trans_12    3999 non-null   int64
 10  Days_since_enroll  3999 non-null   int64
 11  Award?             3999 non-null   int64
dtypes: int64(12)
memory usage: 375.0 KB


In [6]:
df.shape

(3999, 12)

In [7]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

ID#                  0
Balance              0
Qual_miles           0
cc1_miles            0
cc2_miles            0
cc3_miles            0
Bonus_miles          0
Bonus_trans          0
Flight_miles_12mo    0
Flight_trans_12      0
Days_since_enroll    0
Award?               0
dtype: int64


In [8]:
# For simplicity, we'll drop rows with missing values, if any
df = df.dropna()

In [9]:
# Removing outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [10]:
# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df_scaled

array([[-1.83464329, -0.43885929,  0.        , ..., -0.40970772,
         1.50369284, -0.62307163],
       [-1.83377495, -0.64773487,  0.        , ..., -0.40970772,
         1.48803468, -0.62307163],
       [-1.83290661, -0.1287733 ,  0.        , ..., -0.40970772,
         1.52032964, -0.62307163],
       ...,
       [ 1.65435035,  0.62802805,  0.        , ..., -0.40970772,
        -1.23550729,  1.60495191],
       [ 1.65521869,  0.18915227,  0.        , ...,  1.65318143,
        -1.23599661, -0.62307163],
       [ 1.65608703, -1.02863529,  0.        , ..., -0.40970772,
        -1.23746456, -0.62307163]])

# **Exploratory Data Analysis (EDA)**

2.	Perform exploratory data analysis (EDA) to gain insights into the distribution of data and identify potential clusters.
3.	Use multiple visualizations to understand the hidden patterns in the dataset

In [None]:
# Pairplot for visualizing the relationships between features
sns.pairplot(pd.DataFrame(df_scaled, columns=df.columns))
plt.show()

In [None]:
# Heatmap for correlation matrix
sns.heatmap(pd.DataFrame(df_scaled, columns=df.columns).corr(), annot=True, cmap='viridis')
plt.show()

# **Implementing Clustering Algorithms**

•	Implement the K-Means, hierarchical, and DBSCAN algorithms using a programming language such as Python with libraries like scikit-learn or MATLAB.

•	Apply each clustering algorithm to the pre-processed dataset to identify clusters within the data.

•	Experiment with different parameter settings for hierarchical clustering (e.g., linkage criteria), K-means (Elbow curve for different K values) and DBSCAN (e.g., epsilon, minPts) and evaluate the clustering results


1 K-Means Clustering

In [None]:
import warnings
warnings.filterwarnings('ignore')
#Elbow method to determine the optimal number of clusters
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Applying K-Means with the optimal number of clusters
optimal_clusters = 4  # assuming 4 from the elbow curve
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(df_scaled)

In [None]:
# Add the cluster labels to the original dataframe
df['KMeans_Cluster'] = kmeans_labels
kmeans_labels

2 Hierarchical Clustering

In [None]:
# Linkage matrix using Ward's method
linked = linkage(df_scaled, method='ward')

In [None]:
# Dendrogram
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.show()

# Cutting the dendrogram tree to form flat clusters
hierarchical_labels = fcluster(linked, t=4, criterion='maxclust')  # assuming 4 clusters

# Add the cluster labels to the original dataframe
df['Hierarchical_Cluster'] = hierarchical_labels

3 DBSCAN

In [None]:
# Applying DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(df_scaled)

In [None]:
# Add the cluster labels to the original dataframe
df['DBSCAN_Cluster'] = dbscan_labels

# **Cluster Analysis and Interpretation**


•	Analyse the clusters generated by each clustering algorithm and interpret the characteristics of each cluster. Write you insights in few comments.

In [None]:
# Analyzing clusters generated by K-Means
kmeans_analysis = df.groupby('KMeans_Cluster').mean()
kmeans_analysis

Insights:

Each cluster is roughly balanced, meaning K-Means effectively split the data.

Clusters represent different groups of customers or behaviors based on the features.

The centroids provide the average values for each feature in each cluster.

In [None]:
# Analyzing clusters generated by Hierarchical Clustering
hierarchical_analysis = df.groupby('Hierarchical_Cluster').mean()
hierarchical_analysis

Insights:

This method tends to find more natural groupings and clusters with varying sizes.

The clusters may show stronger hierarchical relationships, with smaller subgroups forming within larger ones.

The interpretation of this method is more qualitative, giving insights into how the data naturally groups at different levels.

In [None]:
# Analyzing clusters generated by DBSCAN
dbscan_analysis = df[df['DBSCAN_Cluster'] != -1].groupby('DBSCAN_Cluster').mean()  # Exclude noise points (-1)
dbscan_analysis

Insights:

DBSCAN successfully isolates outliers, which is useful when noise or anomalous points are present.

The algorithm forms varying shapes of clusters, unlike K-Means, which assumes spherical clusters.

Clusters tend to have varying densities, capturing natural data groupings that may not be apparent in other methods.

# **Visualization**

Visualize the clustering results using scatter plots or other suitable visualization techniques.
Plot the clusters with different colours to visualize the separation of data points belonging to different clusters.


In [None]:
# K-Means clustering visualization
plt.figure(figsize=(10, 7))
sns.scatterplot(x=df_scaled[:, 0], y=df_scaled[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('K-Means Clustering')
plt.show()

In [None]:
# Hierarchical clustering visualization
plt.figure(figsize=(10, 7))
sns.scatterplot(x=df_scaled[:, 0], y=df_scaled[:, 1], hue=hierarchical_labels, palette='viridis')
plt.title('Hierarchical Clustering')
plt.show()

In [None]:
# DBSCAN clustering visualization
plt.figure(figsize=(10, 7))
sns.scatterplot(x=df_scaled[:, 0], y=df_scaled[:, 1], hue=dbscan_labels, palette='viridis')
plt.title('DBSCAN Clustering')
plt.show()

## **Evaluation and Performance Metrics**

Evaluate the quality of clustering using internal evaluation metrics such as silhouette score for K-Means and DBSCAN.

In [None]:
# Silhouette score for K-Means
kmeans_silhouette = silhouette_score(df_scaled, kmeans_labels)
print("K-Means Silhouette Score:", kmeans_silhouette)

In [None]:
# Silhouette score for DBSCAN (excluding noise points)
dbscan_silhouette = silhouette_score(df_scaled[df['DBSCAN_Cluster'] != -1], dbscan_labels[df['DBSCAN_Cluster'] != -1])
print("DBSCAN Silhouette Score:", dbscan_silhouette)

Interpreting Silhouette Scores -

The Silhouette Score ranges between -1 and 1:

+1 : Indicates that data points are well-clustered and appropriately assigned to their clusters.

0 : Indicates that data points are on or very close to the decision boundary between two neighboring clusters.

-1 : Indicates that data points may have been assigned to the wrong clusters