# Uber Eats USA Restaurants and Menus
## 1.0 Import Library

In [None]:
# !pip install squarify
# !pip install folium
# !pip install wordcloud
# !pip install scikit-fuzzy
# !pip install umap-learn
# !pip install missingno

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import warnings
import squarify
import folium
import plotly.graph_objects as go

from mpl_toolkits.mplot3d import Axes3D
from folium import plugins
from folium.plugins import HeatMap
from matplotlib import gridspec
from wordcloud import WordCloud
warnings.filterwarnings("ignore")

## 1.1 Load the CSV file

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# path1 = "/content/drive/MyDrive/ML_Assignment/restaurants.csv"
# path2 = "/content/drive/MyDrive/ML_Assignment/restaurant-menus.csv"

# restaurant = pd.read_csv(path1)
# menu = pd.read_csv(path2)

In [None]:
restaurant = pd.read_csv("restaurants.csv")
menu = pd.read_csv("restaurant-menus.csv")

# 2.0 Sanity Check of Data

## 2.1 Display the dimensions of the dataframe (number of rows, number of columns)

In [None]:
restaurant.shape

In [None]:
menu.shape

## 2.2 Check the first 5 rows of the dataframe
#### To understand the data structure

In [None]:
restaurant.head()

In [None]:
menu.head()

## 2.3 Check the data type of each column

In [None]:
restaurant.dtypes

In [None]:
menu.dtypes

## 2.4 Check the information of dataframe
#### check the index, number of columns, column labels, data types of each column, memory usage and number of non-null value

In [None]:
restaurant.info()

In [None]:
menu.info()

## 2.5 Check the column names

In [None]:
restaurant.columns

In [None]:
menu.columns

## 2.6 Generate descriptive statistics
#### To summarize the central tendency, dispersion, and shape of the distribution of the numerical columns

In [None]:
restaurant.describe()

# 3.0 Preprocessing

## 3.1 Modify the dataset

### 3.1.1 Copy the dataframe
#### To avoid modifying the original data, preserving its integrity for reference or future analysis

In [None]:
new_restaurant = restaurant.copy()
new_restaurant.head()

In [None]:
new_menu = menu.copy()
new_menu.head()

### 3.1.2 Change the first character of column to capital letter
#### To improve readability and consistency in the dataset

In [None]:
def capitalize_first_letter(s):
    """Return ``s`` with every label passed through ``str.capitalize``.

    ``s`` must expose the pandas ``.str`` accessor (the notebook passes
    ``DataFrame.columns``). ``capitalize`` upper-cases the first character
    and lower-cases all remaining characters of each label.
    """
    capitalized = s.str.capitalize()
    return capitalized

In [None]:
new_restaurant.columns = capitalize_first_letter(new_restaurant.columns)
new_restaurant.head()

In [None]:
new_menu.columns = capitalize_first_letter(new_menu.columns)
new_menu.head()

### 3.1.3 Rename the column
#### To make column easier to understand and identified

In [None]:
new_restaurant.rename(columns = {"Full_address" : "Address",
                                 "Zip_code" : "Postal_Code",
                                 "Lat" : "Latitude",
                                 "Lng" : "Longitude"}, inplace = True)
new_restaurant.head()

In [None]:
new_menu.rename(columns = {"Restaurant_id" : "Restaurant ID"}, inplace = True)
new_menu.head()

## 3.2 Duplicate Data
### 3.2.1 Drop the duplicated rows
#### To maintain data integrity and prevent data inconsistency

In [None]:
new_restaurant.duplicated().sum()

In [None]:
new_menu.duplicated().sum()

In [None]:
new_menu.drop_duplicates(inplace=True)
new_menu.shape

## 3.3 Missing Value
### 3.3.1 Calculate the number of missing value of each column in the dataframe

In [None]:
new_restaurant.isna().sum()

In [None]:
new_menu.isna().sum()

### 3.3.2 Matrix Plot
#### Showing the presence of missing values in dataframe

In [None]:
msno.matrix(new_restaurant)

In [None]:
msno.matrix(new_menu)

### 3.3.3 Calculate the missing value
#### To find out the total of missing value of each columns

In [None]:
new_restaurant.isna().sum()

### 3.3.4 Replace the missing value
#### Use the mean of each numerical column (Ratings, Score) to replace the missing values; Price Range is not imputed here

In [None]:
# Impute the two numeric columns with their own column mean.
column_means = {column: new_restaurant[column].mean() for column in ("Ratings", "Score")}
new_restaurant = new_restaurant.fillna(column_means)
new_restaurant.isna().sum()

### 3.3.5 Drop the row
#### Drop the rows that still contain missing values (the columns with only a few missing entries) to ensure the consistency of the dataset

In [None]:
new_restaurant.dropna(inplace=True)
new_restaurant.isna().sum()

## 3.4 Outlier

### 3.4.1 Detect the outliers by using scatter plot

In [None]:
new_restaurant.shape

In [None]:
plt.figure(figsize=(12, 8))

# One panel per numeric column: value on the x axis, row position on the y axis.
row_index = range(len(new_restaurant))
for panel, column in enumerate(['Position', 'Latitude', 'Longitude', 'Ratings', 'Score'], start=1):
    plt.subplot(2, 3, panel)
    plt.scatter(new_restaurant[column], row_index)
    plt.title(column)
    plt.xlabel(column)
    plt.ylabel('Index')

plt.tight_layout()
plt.show()

## 3.4.2 Keep the outliers
#### The dataset contains a large number of outliers, so changing or deleting them could compromise data integrity; they are therefore kept

## 3.5 Invalid Data
#### 3.5.1 Remove the USD sign in the row

In [None]:
def convert_price(price_str):
    """Parse a raw menu price such as ``"5.99 USD"`` into a float.

    Parameters
    ----------
    price_str : str, number or None
        Raw value taken from the ``Price`` column.

    Returns
    -------
    float or None
        The numeric price, or ``None`` when the value is missing or cannot be
        parsed. A float NaN input is returned as NaN.
    """
    # Missing cells reach ``apply`` as NaN/None, which have no ``.replace``;
    # only strings need the currency marker stripped.
    if isinstance(price_str, str):
        price_str = price_str.replace('USD', '').strip()
    try:
        return float(price_str)
    except (TypeError, ValueError):
        return None

## 3.6 Data Type Issue

#### 3.6.1 Convert the 'Price' columns from object to float

In [None]:
new_menu['Price'] = new_menu['Price'].apply(convert_price)

In [None]:
new_menu.info()

## 3.7 Inconsistent Data

#### 3.7.1 Detect the inconsistent data in Score column

In [None]:
# A score is inconsistent when it lies outside the 0-5 range. The two bounds
# must be combined with OR: no value can be below 0 and above 5 at once, so
# the previous AND mask could never select a row.
score_out_of_range = (new_restaurant['Score'] < 0) | (new_restaurant['Score'] > 5)
inconsistent_score = new_restaurant[score_out_of_range]

if inconsistent_score.empty:
    print("There is not any inconsistent score in this dataset.")
else:
    print("Inconsistent Scores:")
    print(inconsistent_score)

#### 3.7.2 Detect the inconsistent data in Ratings column

In [None]:
# Flag ratings outside the 0-100 range. The bounds are combined with OR; the
# previous AND mask could never be true for any value.
# NOTE(review): the upper bound of 100 is kept from the original check -
# confirm it matches what the 'Ratings' column actually holds.
ratings_out_of_range = (new_restaurant['Ratings'] < 0) | (new_restaurant['Ratings'] > 100)
inconsistent_ratings = new_restaurant[ratings_out_of_range]

if inconsistent_ratings.empty:
    print("There is not any inconsistent ratings in this dataset.")
else:
    print("Inconsistent Ratings:")
    print(inconsistent_ratings)

#### 3.7.3 Detect the inconsistent data in Price column

In [None]:
inconsistent_prices = new_menu[new_menu['Price'] <= 0]

if inconsistent_prices.empty:
    print("There is not any inconsistent price in this dataset.")
else:
    print("Number of Inconsistent Prices:",len(inconsistent_prices))

#### 3.7.4 Replace these inconsistent data with mean

In [None]:
# Replace non-positive prices with the mean of the valid (positive) prices
# only, so the invalid entries being replaced do not bias the replacement.
invalid_price = new_menu['Price'] <= 0
new_menu.loc[invalid_price, 'Price'] = new_menu.loc[new_menu['Price'] > 0, 'Price'].mean()

In [None]:
inconsistent_prices = new_menu[new_menu['Price'] <= 0]

if inconsistent_prices.empty:
    print("There is not any inconsistent price in this dataset.")
else:
    print("Inconsistent Prices:")
    print(inconsistent_prices)

## 3.8 Contaminated Data
### 3.8.1 Check for the presence of contaminated data

In [None]:
new_restaurant

In [None]:
new_menu

### 3.8.2 Drop the corrupted column
#### To reduce redundancy, improve computational efficiency, and focus on relevant data for analysis

In [None]:
new_restaurant.drop("Price_range", axis = 1, inplace = True)
new_restaurant.head()

In [None]:
new_menu.drop("Description", axis = 1, inplace = True)
new_menu.head()

## 3.9 Structural Errors

#### 3.9.1 Perform a structural check of the dataset

##### 3.9.1.1 Check for missing value

In [None]:
new_restaurant.isna().sum()

In [None]:
new_menu.isna().sum()

##### 3.9.1.2 Check for Data Type

In [None]:
new_restaurant.dtypes

In [None]:
new_menu.dtypes

##### 3.9.1.3 Check for duplicate row

In [None]:
new_restaurant.duplicated().sum()

In [None]:
new_menu.duplicated().sum()

In [None]:
new_menu.drop_duplicates(inplace=True)
new_menu.shape

##### 3.9.1.4 Check for inconsistent data

In [None]:
# Out-of-range means below 0 OR above 5; the previous AND mask could never be
# true, so this check always reported a clean dataset.
score_out_of_range = (new_restaurant['Score'] < 0) | (new_restaurant['Score'] > 5)
inconsistent_score = new_restaurant[score_out_of_range]

if inconsistent_score.empty:
    print("There is not any inconsistent score in this dataset.")
else:
    print("Inconsistent Scores:", len(inconsistent_score))

In [None]:
# Out-of-range means below 0 OR above 100; the previous AND mask could never
# be true.
# NOTE(review): the upper bound of 100 is kept from the original check -
# confirm it matches what the 'Ratings' column actually holds.
ratings_out_of_range = (new_restaurant['Ratings'] < 0) | (new_restaurant['Ratings'] > 100)
inconsistent_ratings = new_restaurant[ratings_out_of_range]

if inconsistent_ratings.empty:
    print("There is not any inconsistent ratings in this dataset.")
else:
    print("Inconsistent Ratings:", len(inconsistent_ratings))

In [None]:
inconsistent_prices = new_menu[new_menu['Price'] <= 0]

if inconsistent_prices.empty:
    print("There is not any inconsistent price in this dataset.")
else:
    print("Number of Inconsistent Prices:", len(inconsistent_prices))

## 3.10 Split the Columns
### 3.10.1 Split the Address Column
#### Split the address column to get the City in each row and create a new column to store it

In [None]:
def extract_city(address):
    """Return the city field of a comma-separated address.

    The city is taken as the third field counted from the end, e.g.
    ``"1 Main St, Austin, TX, 78701"`` gives ``"Austin"``.

    Returns ``None`` when ``address`` is not a string or has fewer than three
    comma-separated fields (previously the latter raised ``IndexError``).
    """
    if not isinstance(address, str):
        return None
    parts = address.split(',')
    if len(parts) < 3:
        return None
    return parts[-3].strip()

new_restaurant['City'] = new_restaurant['Address'].apply(extract_city)

### 3.10.2 Split the Category column and store the result in a new dataframe for future use

In [None]:
# Break each restaurant's ', '-separated category string into individual
# categories: expand=True gives one column per position, stack() turns that
# wide frame into one long Series of single categories.
categories_series = new_restaurant['Category'].str.split(', ', expand=True).stack()

# Occurrences of each individual category across all restaurants.
category_counts = categories_series.value_counts().reset_index()

# Fixed column names so later cells can rely on 'Category' and 'Count'.
category_counts.columns = ['Category', 'Count']

print(category_counts.head(10))

## 3.11 Checking for the dataframe after data cleaning

In [None]:
new_restaurant.shape

In [None]:
new_restaurant.head()

In [None]:
new_menu.head()

## 3.12 Merge the datasets by Id for further use

In [None]:
restaurant_merge = pd.merge(new_restaurant, new_menu, left_on='Id', right_on='Restaurant ID')
restaurant_merge.head()

In [None]:
restaurant_merge.info()

In [None]:
restaurant_merge.dropna(inplace=True)
restaurant_merge.shape

In [None]:
new_restaurant.to_csv('new_restaurant.csv', index=False)
new_menu.to_csv('new_menu.csv', index=False)
restaurant_merge.to_csv('restaurant_merge.csv', index=False)

# 4.0 Exploratory Data Analysis (EDA)

## 4.1 Find out the number of restaurants around the USA

In [None]:
# Count distinct restaurant ids. Counting 'Postal_Code' would give the number
# of ZIP codes instead, and several restaurants can share one ZIP code.
# restaurant_merge only contains restaurants that matched at least one menu row.
num_restaurants_usa = restaurant_merge['Id'].nunique()

print("Estimated number of restaurants around the USA:", num_restaurants_usa)

## 4.2 Wordcloud for Category column

In [None]:
# Size every category by how often it occurs. Joining the unique category
# names into one text (as before) ignored the 'Count' column entirely, so
# each category contributed once regardless of its frequency.
category_frequencies = dict(zip(category_counts["Category"], category_counts["Count"]))

wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(category_frequencies)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## 4.3 Histogram
#### Represent the distribution of Position (frequency, mean and median)

In [None]:
sns.histplot(data=new_restaurant, x="Position", bins=15, color='red', alpha=0.5, kde=True)
plt.xlabel('Position')
plt.ylabel('Frequency')
plt.title('Distribution of Position')

# NOTE: despite their names, these hold the mean and median of 'Position'.
mean_score = new_restaurant['Position'].mean()
median_score = new_restaurant['Position'].median()

plt.axvline(x=mean_score, color='yellow', linestyle='--', label=f'Mean: {mean_score:.2f}')
plt.axvline(x=median_score, color='cyan', linestyle='--', label=f'Median: {median_score:.2f}')

# Without a legend the label= texts of the two reference lines are never shown.
plt.legend()

plt.show()

## 4.4 Scatter Plot
### 4.4.1 Position vs. Score
#### Represent the relationship between the position and score

In [None]:
fig = px.scatter(new_restaurant, x='Position', y='Score', title='Scatter Plot of Position vs. Score',
                 opacity=0.5, color='Score', color_continuous_scale='viridis',
                 hover_data={'Position': True, 'Score': True})

fig.update_traces(marker=dict(size=12, line=dict(width=1, color='Black')))

fig.update_layout(xaxis_title='Position', yaxis_title='Score')

fig.show()

### 4.4.2 Score vs. Ratings
#### Represent the relationship between scores and ratings for popular restaurants

In [None]:
# sort_values is ascending by default, so rows are ordered from the lowest
# Score upwards (ties broken by Ratings): the first rows of this frame are the
# lowest-scored restaurants, despite the variable name.
popular_restaurant = new_restaurant.sort_values(['Score','Ratings'])
popular_restaurant.head()

In [None]:
fig = px.scatter(popular_restaurant, x="Score", y="Ratings",
     size_max=45, log_x=True, template='plotly_dark',title="Score vs. Ratings")

fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

## 4.5 Treemap

#### 4.5.1 Represent the distribution of the most popular menu, where the size of the rectangle corresponds to the frequency of that category in the dataset.

In [None]:
top_menu = new_menu['Name'].value_counts().nlargest(20).index
filtered_menu = new_menu[new_menu['Name'].isin(top_menu)]

fig = px.treemap(filtered_menu, path=['Name'], title=f'Most Popular Menu')
fig.update_traces(textfont_size=14,root_color="white")
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  margin = dict(t=50, b=50, r=50, l=50))

fig.show()

#### 4.5.2 Represent the distribution of menu categories, where the size of the rectangle corresponds to the frequency of that category in the dataset.

In [None]:
top_categories = new_menu['Category'].value_counts().nlargest(20).index
filtered_menu = new_menu[new_menu['Category'].isin(top_categories)]

fig = px.treemap(filtered_menu, path=['Category'], title=f'Menu Categories')
fig.update_traces(textfont_size=14,root_color="white")
fig.update_layout(title_font=dict(size=20, family='Arial', color='black'),
                  margin = dict(t=50, b=50, r=50, l=50))

fig.show()

## 4.6 Funnel Plot
#### Represents the top 10 cities with the most amount of restaurants

In [None]:
city_counts = new_restaurant['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Total Count']

top_10_cities = city_counts.nlargest(10, 'Total Count').sort_values(by='Total Count', ascending=True)

fig = go.Figure(go.Funnel(
    y=top_10_cities['City'],
    x=top_10_cities['Total Count'],
    textinfo='value',
    marker=dict(color="skyblue"),
    connector=dict(line=dict(color="royalblue", width=4))
))

fig.update_layout(
    title="Top 10 Cities with Most Amount of Restaurants",
    xaxis_title="Total Count of Restaurants",
    yaxis_title="City"
)

fig.show()

## 4.7 Radar Chart
#### Represent the top 10 unpopular restaurants based on the score

In [None]:
restaurants = popular_restaurant['Name'][:10]
scores = popular_restaurant['Score'][:10]

colors = ['rgba(255, 0, 0, 0.7)', 'rgba(0, 255, 0, 0.7)', 'rgba(0, 0, 255, 0.7)',
          'rgba(255, 255, 0, 0.7)', 'rgba(255, 0, 255, 0.7)', 'rgba(0, 255, 255, 0.7)',
          'rgba(128, 0, 0, 0.7)', 'rgba(0, 128, 0, 0.7)', 'rgba(0, 0, 128, 0.7)',
          'rgba(128, 128, 0, 0.7)']

fig = go.Figure()

fig.add_trace(go.Scatterpolar(
    r=scores,
    theta=restaurants,
    fill='toself',
    name='Score',
    marker=dict(
        color=colors,
        line=dict(width=1, color='rgba(255, 255, 255, 0.7)')
    )
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 3]
        )
    ),
    showlegend=True,
    title='Radar Chart of Top 10 Unpopular Restaurants'
)

fig.show()

## 4.8 Bubble Plot
#### Represent the ratings of restaurants in the top 50 cities in the USA

In [None]:
mean_ratings_by_city = new_restaurant.groupby('City')['Ratings'].mean().reset_index()

top_50_cities = mean_ratings_by_city.nlargest(50, 'Ratings')['City']

filtered_data = new_restaurant[new_restaurant['City'].isin(top_50_cities)]

sorted_data = filtered_data.sort_values(by='Ratings', ascending=False)

fig = px.scatter(sorted_data, x='City', y='Ratings', size='Ratings', color='City',
                 title='Bubble Plot of Ratings for Top 50 Cities in USA (Sorted by Ratings)', hover_name='City',
                 hover_data={'Ratings': True, 'City': False})

fig.update_layout(xaxis_title='City', yaxis_title='Ratings')

fig.show()

## 4.9 Heatmap
#### This map combines a scatter plot of popular restaurant locations with a heatmap overlay representing the ratings normalized by score

In [None]:
trace_map = go.Scattermapbox(
    lat=popular_restaurant['Latitude'],
    lon=popular_restaurant['Longitude'],
    mode='markers',
    marker=dict(size=10),
    text=popular_restaurant['Name']
)

trace_heatmap = go.Densitymapbox(
    lat=popular_restaurant['Latitude'],
    lon=popular_restaurant['Longitude'],
    z=(popular_restaurant['Ratings'] / popular_restaurant['Score']),
    radius=20
)

layout = go.Layout(
    title='Heatmap of Popular Restaurants',
    mapbox=dict(
        style="open-street-map",
        center=dict(
            lat=37.09024,
            lon=-95.712891
        ),
        zoom=4.3
    )
)

fig = go.Figure(data=[trace_map, trace_heatmap], layout=layout)

fig.show()

## 4.10 Folium Map
#### Display the markers for locations of five-star restaurants, with each marker indicating the restaurant's address.

In [None]:
fivestar_restaurant=popular_restaurant.loc[(popular_restaurant['Score'] == 5.0)]['Address'].value_counts()
fivestar_restaurant

In [None]:
# NOTE: `map` shadows the built-in of the same name; the name is kept so any
# other cell that refers to this map object keeps working.
map = folium.Map(location=[37.09024,-95.712891], tiles="OpenStreetMap", zoom_start=4.2)

# Take the five-star rows themselves, one per distinct address. The previous
# loop indexed popular_restaurant by position, which selected its first rows -
# the lowest-scored restaurants, because that frame is sorted ascending by Score.
five_star_rows = popular_restaurant.loc[popular_restaurant['Score'] == 5.0].drop_duplicates(subset='Address')

for _, row in five_star_rows.iterrows():
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=row['Address'],
        icon=folium.Icon(color="red", icon="glyphicon-cutlery"),
    ).add_to(map)

map

## 4.11 3D Scatter Plot
#### Represent the relationship between the position, ratings, and score of restaurants

In [None]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

x = new_restaurant['Position']
y = new_restaurant['Ratings']
z = new_restaurant['Score']

ax.scatter(x, y, z, c='b', marker='o')

ax.set_xlabel('Position')
ax.set_ylabel('Ratings')
ax.set_zlabel('Score')
ax.set_title('3D Scatter Plot of Position, Ratings, and Score')

plt.show()

## 4.12 Scatter Polar Plot
#### Represent the relationship between the price and categories of menu items

In [None]:
top_categories = new_menu['Category'].value_counts().nlargest(10).index
filtered_menu = new_menu[(new_menu['Category'].isin(top_categories)) & (new_menu['Price'] <= 500)]

fig = px.scatter_polar(filtered_menu, r='Price', theta='Category', color='Price',
                       template='plotly_dark', title='Scatter Polar Plot of Price vs Top 10 Category')

fig.show()

# 5.0 Data Modelling

## 5.1 Affinity Propagation

#### Affinity Propagation(AP) is a clustering algorithm that automatically ascertains the number of clusters in a dataset in contrast to conventional clustering methods, which necessitate predefining the number of clusters. It starts by choosing a subset of representative examples, or exemplars, and iteratively improves them by exchanging real-valued messages between data points according to similarity metrics.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

features_clustering = new_restaurant[['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']]
position_clustering = new_restaurant[['Position', 'Score']]
quality_clustering = new_restaurant[['Score', 'Ratings']]

scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_clustering)
position_standardized = scaler.fit_transform(position_clustering)
quality_standardized = scaler.fit_transform(quality_clustering)

In [None]:
correlation_matrix = features_clustering.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.cluster import AffinityPropagation

# Affinity Propagation is run on consecutive chunks of `split_size` rows
# instead of on the whole standardized matrix at once.
split_size = 3000
num_records = features_standardized.shape[0]
num_splits = int(np.ceil(num_records / split_size))

best_silhouette_scores = []   # (split index, silhouette) pairs
best_cluster_labels = []      # labels per split; list position == split index

for i in range(num_splits):
    start_idx = i * split_size
    end_idx = min((i + 1) * split_size, num_records)
    split_data = features_standardized[start_idx:end_idx]

    affinity_propagation = AffinityPropagation()
    cluster_labels = affinity_propagation.fit_predict(split_data)
    best_cluster_labels.append(cluster_labels)

    # The silhouette is only defined for 2 .. n_samples - 1 distinct labels;
    # skip the score for a degenerate clustering instead of raising.
    n_labels = len(np.unique(cluster_labels))
    if 1 < n_labels < len(split_data):
        silhouette_avg = silhouette_score(split_data, cluster_labels)
        best_silhouette_scores.append((i, silhouette_avg))

best_silhouette_scores.sort(key=lambda x: x[1], reverse=True)

print("Top 10 highest silhouette scores:")
# The loop variable must not be named `silhouette_score`: that rebinds the
# imported sklearn function to a float and breaks every later call to it.
for split_idx, split_score in best_silhouette_scores[:10]:
    print(f"Split Index: {split_idx}, Silhouette Score: {split_score}")

# Leave the labels of the best-scoring split in `cluster_labels`.
if best_silhouette_scores:
    cluster_labels = best_cluster_labels[best_silhouette_scores[0][0]]

In [None]:
def extract_cluster_data(cluster_labels, original_data):
    """Group the rows of ``original_data`` by cluster label.

    ``cluster_labels[i]`` is used as the label of the row at position ``i`` of
    ``original_data``. Returns a dict mapping each distinct label to the
    positional slice of ``original_data`` holding that label's rows.
    """
    return {
        label: original_data.iloc[np.nonzero(cluster_labels == label)[0]]
        for label in np.unique(cluster_labels)
    }

best_split_idx, _ = best_silhouette_scores[0]
cluster_labels = best_cluster_labels[best_split_idx]

# The labels describe only the rows of the best split, so pair them with that
# same positional slice. Passing the whole of new_restaurant matched them
# against its first rows, whichever split had actually won.
best_split_start = best_split_idx * split_size
best_split_rows = new_restaurant.iloc[best_split_start:best_split_start + len(cluster_labels)]
cluster_data = extract_cluster_data(cluster_labels, best_split_rows)

print(f"Number of Clusters: {len(cluster_data)}")
for label, data in cluster_data.items():
    print(f"Cluster {label}:")
    print(data[['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']])
    print()

In [None]:
def extract_cluster_means(cluster_labels, original_data, features):
    """Compute the per-cluster mean of the selected feature columns.

    ``cluster_labels[i]`` is used as the label of the row at position ``i`` of
    ``original_data``. Returns a dict mapping each distinct label to the mean
    of ``features`` over that label's rows.
    """
    return {
        label: original_data.iloc[np.nonzero(cluster_labels == label)[0]][features].mean()
        for label in np.unique(cluster_labels)
    }

features_of_interest = ['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']

# cluster_labels covers only the rows of the best split, so average over that
# same positional slice rather than over the first rows of new_restaurant.
best_split_start = best_split_idx * split_size
best_split_rows = new_restaurant.iloc[best_split_start:best_split_start + len(cluster_labels)]
cluster_means = extract_cluster_means(cluster_labels, best_split_rows, features_of_interest)

print(f"Number of Clusters: {len(cluster_means)}")
for label, means in cluster_means.items():
    print(f"Cluster {label} Mean:")
    print(means)
    print()

In [None]:
from scipy.spatial.distance import euclidean

def find_most_matching_restaurant(cluster_means, restaurants_data, features=None):
    """Find, for every cluster, the restaurant closest to the cluster mean.

    Parameters
    ----------
    cluster_means : dict
        Maps a cluster label to its mean feature vector, ordered like
        ``features``.
    restaurants_data : pandas.DataFrame
        Candidate restaurants; must contain ``'Name'`` and the feature columns.
    features : list of str, optional
        Feature columns used for the distance. Defaults to the module-level
        ``features_of_interest``, which the original version always used.

    Returns
    -------
    dict
        Maps each cluster label to the ``'Name'`` of the row with the smallest
        Euclidean distance to the cluster mean (first row wins ties), or
        ``None`` when no row has a comparable distance.
    """
    if features is None:
        features = features_of_interest

    # Pull the data out once; distances are then computed for all rows at
    # once instead of row by row through iterrows().
    feature_matrix = restaurants_data[features].to_numpy(dtype=float)
    names = restaurants_data['Name'].to_numpy()

    most_matching_restaurants = {}
    for cluster_label, cluster_mean in cluster_means.items():
        if len(names) == 0:
            most_matching_restaurants[cluster_label] = None
            continue
        center = np.asarray(cluster_mean, dtype=float)
        distances = np.linalg.norm(feature_matrix - center, axis=1)
        # NaN distances never beat a real one, matching the old `<` comparison.
        distances = np.where(np.isnan(distances), np.inf, distances)
        best_row = int(np.argmin(distances))
        most_matching_restaurants[cluster_label] = (
            names[best_row] if np.isfinite(distances[best_row]) else None
        )
    return most_matching_restaurants

most_matching_restaurants = find_most_matching_restaurant(cluster_means, new_restaurant)
for cluster_label, restaurant_name in most_matching_restaurants.items():
    print(f"Cluster {cluster_label} Most Matching Restaurant: {restaurant_name}")

## Position and Quality Clustering

In [None]:
scores = position_clustering['Score'].values
sorted_indices = np.argsort(-scores)
position_standardized_sorted = position_standardized[sorted_indices]

In [None]:
correlation_matrix = position_clustering.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True, cbar=True, linewidths=0.5)
plt.title('Correlation Matrix')

plt.show()

In [None]:
from sklearn import metrics
from sklearn.cluster import AffinityPropagation

# Search the score-sorted Position/Score data in chunks of `split_size` rows
# for the chunk whose Affinity Propagation clustering has the best silhouette.
split_size = 1000
num_records = position_standardized_sorted.shape[0]
num_splits = int(np.ceil(num_records / split_size))

best_silhouette_score = -1
best_split_index = -1

for i in range(num_splits):
    start_idx = i * split_size
    end_idx = min((i + 1) * split_size, num_records)
    split_data = position_standardized_sorted[start_idx:end_idx]
    affinity_propagation = AffinityPropagation()
    cluster_labels = affinity_propagation.fit_predict(split_data)

    # The silhouette is only defined for 2 .. n_samples - 1 distinct labels;
    # skip degenerate clusterings instead of raising.
    n_labels = len(np.unique(cluster_labels))
    if not 1 < n_labels < len(split_data):
        continue

    # Called through `metrics` so this cell keeps working even if the bare
    # name `silhouette_score` was rebound earlier in the session.
    silhouette_avg = metrics.silhouette_score(split_data, cluster_labels)
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_split_index = i

print(f"Best Split Index: {best_split_index}")
print(f"Best Silhouette Score: {best_silhouette_score}")

start_idx = best_split_index * split_size
end_idx = min((best_split_index + 1) * split_size, num_records)
print(f"Best Split Range: {start_idx} - {end_idx}")

In [None]:
# Re-select the rows of the best split found by the search above.
start_idx = best_split_index * split_size
end_idx = min((best_split_index + 1) * split_size, position_standardized_sorted.shape[0])
best_split_data = position_standardized_sorted[start_idx:end_idx]

# Cluster that split again. This rebinds best_cluster_labels, which the first
# Affinity Propagation section used as a list of per-split label arrays.
affinity_propagation = AffinityPropagation()
best_cluster_labels = affinity_propagation.fit_predict(best_split_data)

# best_split_data has two columns (standardized Position and Score), so this
# PCA keeps both dimensions.
pca = PCA(n_components=2)
best_split_pca = pca.fit_transform(best_split_data)

In [None]:
# Sweep the Affinity Propagation `preference` parameter on the best split and
# record the silhouette score obtained for each value.
Preference = [-50, -40, -30,-20, -10]
silhouette_scores = []
for preference in Preference:
    model = AffinityPropagation(preference=preference, random_state=42)
    model.fit(best_split_data)

    # The silhouette is only defined for 2 .. n_samples - 1 distinct labels;
    # record NaN for a degenerate clustering so the sweep can continue.
    n_labels = len(np.unique(model.labels_))
    if 1 < n_labels < len(best_split_data):
        silhouette_scores.append(metrics.silhouette_score(best_split_data , model.labels_))
    else:
        silhouette_scores.append(float('nan'))

data = {
    'Preference': Preference,
    'Silhouette Score': silhouette_scores
}
df = pd.DataFrame(data)

# Written next to the notebook like the other exports: the Google Drive mount
# is commented out at the top, so the former '/content/drive/...' target
# does not exist unless that mount is re-enabled.
df.to_csv('AP_Silhouette_Scores_New.csv', index=False)

In [None]:
plt.figure()
plt.plot(Preference, silhouette_scores, marker='o')
plt.grid(True)
plt.xlabel('Preference Value')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Different Preference Values (Size=1000)')
plt.show()

## 5.2 Fuzzy C-Means

#### Fuzzy C-Means(FCM) is an unsupervised machine-learning algorithm that assigns membership degrees between 0 and 1. This allows data points to have varying degrees of association across multiple clusters, unlike traditional methods such as k-means or hierarchical clustering. Fuzzy clustering theory is better suited to the inherent nature of phenomena and presents a more unbiased portrayal of reality.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

features_clustering = new_restaurant[['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']]
position_clustering = new_restaurant[['Position', 'Score']]
quality_clustering = new_restaurant[['Score', 'Ratings']]

scaler = StandardScaler()
features_standardized = scaler.fit_transform(features_clustering)
position_standardized = scaler.fit_transform(position_clustering)
quality_standardized = scaler.fit_transform(quality_clustering)

In [None]:
import skfuzzy as fuzz

data = features_standardized
data_transposed = data.T

cluster_range = range(2, 11)
fpc_values, silhouette_scores, cluster_memberships = [], [], []

# Fit one fuzzy c-means model per candidate cluster count and keep its FPC,
# its hardened labels and the silhouette score of those labels.
for x in cluster_range:
    cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
        data_transposed, x, m=2, error=0.005, maxiter=1000, init=None
    )
    fpc_values.append(fpc)

    # Harden the fuzzy memberships: each sample takes its strongest cluster.
    cluster_membership = np.argmax(u, axis=0)
    cluster_memberships.append(cluster_membership)

    silhouette_avg = silhouette_score(data, cluster_membership)
    silhouette_scores.append(silhouette_avg)

print("Number of Clusters | Silhouette Score | FPC")
for num_clusters, score, coefficient in zip(cluster_range, silhouette_scores, fpc_values):
    print(f"{num_clusters:^19} | {score:^16.4f} | {coefficient:^3.4f}")

In [None]:
def extract_cluster_means(cluster_labels, original_data, features):
    """Compute the per-cluster mean of the selected feature columns.

    ``cluster_labels[i]`` is used as the label of the row at position ``i`` of
    ``original_data``. Returns a dict mapping each distinct label to the
    column-wise mean of ``features`` over that label's rows.
    """
    return {
        label: original_data.iloc[np.nonzero(cluster_labels == label)[0]][features].mean(axis=0)
        for label in np.unique(cluster_labels)
    }

features_of_interest = ['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']

best_split_idx = np.argmax(silhouette_scores)
cluster_labels = cluster_memberships[best_split_idx]
cluster_means = extract_cluster_means(cluster_labels, new_restaurant, features_of_interest)

print(f"Number of Clusters: {len(cluster_means)}")

for label, means in cluster_means.items():
    print(f"Cluster {label} Mean:")
    print(means)
    print()

In [None]:
from scipy.spatial.distance import euclidean

def find_most_matching_restaurant(cluster_means, restaurants_data, features_of_interest):
    """Find, for every cluster, the restaurant closest to the cluster mean.

    Parameters
    ----------
    cluster_means : dict
        Maps a cluster label to its mean feature vector, ordered like
        ``features_of_interest``.
    restaurants_data : pandas.DataFrame
        Candidate restaurants; must contain ``'Name'`` and the feature columns.
    features_of_interest : list of str
        Feature columns used for the distance.

    Returns
    -------
    dict
        Keyed by the 0-based position of each cluster in ``cluster_means``
        (not by its label); the value is the ``'Name'`` of the row with the
        smallest Euclidean distance to that cluster mean (first row wins
        ties), or ``None`` when no row has a comparable distance.
    """
    # Pull the data out once; distances are then computed for all rows at
    # once instead of row by row through iterrows().
    feature_matrix = restaurants_data[features_of_interest].to_numpy(dtype=float)
    names = restaurants_data['Name'].to_numpy()

    most_matching_restaurants = {}
    for idx, cluster_mean in enumerate(cluster_means.values()):
        if len(names) == 0:
            most_matching_restaurants[idx] = None
            continue
        center = np.asarray(cluster_mean, dtype=float)
        distances = np.linalg.norm(feature_matrix - center, axis=1)
        # NaN distances never beat a real one, matching the old `<` comparison.
        distances = np.where(np.isnan(distances), np.inf, distances)
        best_row = int(np.argmin(distances))
        most_matching_restaurants[idx] = (
            names[best_row] if np.isfinite(distances[best_row]) else None
        )
    return most_matching_restaurants

cluster_means = extract_cluster_means(cluster_labels, new_restaurant, features_of_interest)

most_matching_restaurants = find_most_matching_restaurant(cluster_means, new_restaurant, features_of_interest)

for cluster_label, restaurant_name in most_matching_restaurants.items():
    print(f"Cluster {cluster_label + 1} Most Matching Restaurant: {restaurant_name}")

In [None]:
plt.figure()
plt.plot(cluster_range, fpc_values, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Fuzzy Partition Coefficient (FPC)')
plt.title('FPC vs Number of Clusters')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o', linestyle='-')
plt.title('Silhouette Score vs. Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

In [None]:
# Refit fuzzy c-means with the cluster count that scored the best silhouette.
optimal_num_clusters = cluster_range[np.argmax(silhouette_scores)]

cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    data_transposed, optimal_num_clusters, m=2, error=0.005, maxiter=1000
)

# Refresh the hard labels from this fit. Otherwise cluster_membership still
# holds the labels of the last model tried in the search loop, and the plot
# and CSV export below would pair those labels with these cluster centers.
cluster_membership = np.argmax(u, axis=0)

In [None]:
# One row per candidate cluster count with its silhouette score and FPC.
results_df = pd.DataFrame({
    'Number of Clusters': cluster_range,
    'Silhouette Score': silhouette_scores,
    'FPC': fpc_values
})

# The file name previously carried a doubled extension ('FPC_values.csv.csv').
results_df.to_csv('FPC_values.csv', index=False)

In [None]:
# Standardized samples, one row each. 'Feature 1'..'Feature 5' follow the
# column order of features_clustering: Position, Score, Ratings, Latitude,
# Longitude.
data_df = pd.DataFrame(data_transposed.T, columns=['Feature 1', 'Feature 2','Feature 3','Feature 4', 'Feature 5'])
# NOTE(review): cluster_membership is whatever hard labelling was assigned
# last in the session - confirm it belongs to the same model as cntr.
membership_df = pd.DataFrame(cluster_membership, columns=['Cluster Membership'])

data_df['Cluster Membership'] = membership_df['Cluster Membership']
data_df.to_csv('FCP_Plot.csv', index=False)

# Cluster centers of the most recent cmeans fit, in the same feature order.
cntr_df = pd.DataFrame(cntr, columns=['Feature 1', 'Feature 2','Feature 3','Feature 4', 'Feature 5'])
cntr_df.to_csv('Cluster_Centers.csv', index=False)

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(data_transposed[0], data_transposed[1], c=cluster_membership, cmap='viridis', alpha=0.7, edgecolors='k')
plt.scatter(cntr[:, 0], cntr[:, 1], c='red', marker='x', s=100, label='Cluster Centers')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'Fuzzy C-Means Clustering with {optimal_num_clusters} Clusters')
plt.colorbar(label='Cluster Membership')
plt.legend()
plt.show()

## Position and Quality Clustering

In [None]:
# Fuzzy c-means on the two standardized columns Position and Score.
data = position_standardized

# Transposed before being passed to cmeans.
data_transposed = data.T

cluster_range = range(2, 8)
fpc_values = []

# Record the fuzzy partition coefficient for every candidate cluster count.
for x in cluster_range:
    cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
        data_transposed, x, m=2, error=0.005, maxiter=1000, init=None
    )

    fpc_values.append(fpc)

# NOTE: u is the membership matrix of the final loop iteration (7 clusters),
# so these hard labels describe the 7-cluster model.
cluster_membership = np.argmax(u, axis=0)

In [None]:
plt.figure()
plt.plot(cluster_range, fpc_values, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Fuzzy Partition Coefficient (FPC)')
plt.title('FPC vs Number of Clusters')
plt.show()

In [None]:
# Refit fuzzy c-means with the cluster count that has the highest FPC.
optimal_num_clusters = cluster_range[np.argmax(fpc_values)]

cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    data_transposed, optimal_num_clusters, m=2, error=0.005, maxiter=1000
)

# Refresh the hard labels from this fit. Otherwise cluster_membership still
# holds the labels of the last model of the search loop, and the scatter plot
# below would colour points by a different model than the centers it draws.
cluster_membership = np.argmax(u, axis=0)

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(data_transposed[0], data_transposed[1], c=cluster_membership, marker='o', cmap='viridis', alpha=0.7, edgecolors='k')
plt.scatter(cntr[:, 0], cntr[:, 1], c='red', marker='x', s=100, label='Cluster Centers')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'Fuzzy C-Means Clustering with {optimal_num_clusters} Clusters')
plt.colorbar(label='Cluster Membership')
plt.legend()
plt.show()

## Quality and Probability Clustering

In [None]:
# Fuzzy c-means on the two standardized columns Score and Ratings.
data = quality_standardized
# Transposed before being passed to cmeans.
data_transposed = data.T

cluster_range = range(2, 8)
fpc_values = []

# Record the fuzzy partition coefficient for every candidate cluster count.
for x in cluster_range:
    cntr, u, _, _, _, _, fpc = fuzz.cluster.cmeans(
        data_transposed, x, m=2, error=0.005, maxiter=1000, init=None
    )

    fpc_values.append(fpc)

# NOTE: u is the membership matrix of the final loop iteration (7 clusters),
# so these hard labels describe the 7-cluster model.
cluster_membership = np.argmax(u, axis=0)

In [None]:
plt.figure()
plt.plot(cluster_range, fpc_values, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Fuzzy Partition Coefficient (FPC)')
plt.title('FPC vs Number of Clusters')
plt.show()

In [None]:
# Refit fuzzy c-means with the cluster count that has the highest FPC.
optimal_num_clusters = cluster_range[np.argmax(fpc_values)]

cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
    data_transposed, optimal_num_clusters, m=2, error=0.005, maxiter=1000
)

# Refresh the hard labels from this fit. Otherwise cluster_membership still
# holds the labels of the last model of the search loop, and the scatter plot
# below would colour points by a different model than the centers it draws.
cluster_membership = np.argmax(u, axis=0)

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(data_transposed[0], data_transposed[1], c=cluster_membership, cmap='viridis', alpha=0.7, edgecolors='k')
plt.scatter(cntr[:, 0], cntr[:, 1], c='red', marker='x', s=100, label='Cluster Centers')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'Fuzzy C-Means Clustering with {optimal_num_clusters} Clusters')
plt.colorbar(label='Cluster Membership')
plt.legend()
plt.show()

## 5.3 Spectral Clustering

#### Spectral clustering is a widespread class of clustering algorithms, designed specifically for graph data models. It uses the eigenvectors of an affinity matrix to identify distinct clusters within the data. This algorithm effectively partitions N data points in an I-dimensional space into several clusters based on their similarity. Points within the same cluster exhibit a high degree of similarity, while points in different clusters are dissimilar.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Build three feature views of the restaurant table and standardize each one.
scaler = StandardScaler()

features_clustering = new_restaurant[['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']]
position_clustering = new_restaurant[['Position', 'Score']]
quality_clustering = new_restaurant[['Score', 'Ratings']]

# The same scaler object is refit on each view in turn, so after this cell it
# holds the statistics of the last view (quality) only.
features_standardized, position_standardized, quality_standardized = (
    scaler.fit_transform(view)
    for view in (features_clustering, position_clustering, quality_clustering)
)

In [None]:
from sklearn.cluster import SpectralClustering

# Project the standardized features onto two principal components, then sweep
# the number of spectral clusters. For every candidate we record the silhouette
# score and a within-cluster sum of squares ("inertia") computed by hand.
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)

n_clusters_range = range(2, 11)

inertia_values = []
silhouette_scores = []
best_silhouette_score = -1
best_n_clusters = None

for x in n_clusters_range:
    # Fixed seed: spectral clustering has randomly initialised steps, so without
    # random_state the scores and cluster sizes below change from run to run.
    spectral_clustering = SpectralClustering(n_clusters=x, affinity='nearest_neighbors', random_state=42)
    cluster_labels = spectral_clustering.fit_predict(features_pca)

    silhouette_avg = silhouette_score(features_pca, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Track the best-scoring cluster count seen so far.
    if silhouette_avg > best_silhouette_score:
        best_silhouette_score = silhouette_avg
        best_n_clusters = x

    # Sum of squared distances of each point to its own cluster's mean.
    inertia = 0
    for i in range(x):
        cluster_points = features_pca[cluster_labels == i]
        if cluster_points.size > 0:
            cluster_center = np.mean(cluster_points, axis=0)
            inertia += np.sum((cluster_points - cluster_center) ** 2)

    inertia_values.append(inertia)

    # Report the cluster sizes for this candidate.
    print(f'Number of clusters: {x}')
    unique, counts = np.unique(cluster_labels, return_counts=True)
    print(dict(zip(unique, counts)))

In [None]:
# Save the inertia obtained for each cluster count to CSV.
data = {}
data['Number of Clusters'] = [*n_clusters_range]
data['Inertia'] = inertia_values

df = pd.DataFrame(data)
df.to_csv('Spectral_Clustering.csv', index=False)

In [None]:
# Elbow-style chart: within-cluster sum of squares per cluster count.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.plot(n_clusters_range, inertia_values, marker='o')
ax.set_title('Inertia for Different Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Within-cluster Sum of Squares')
plt.show()

In [None]:
# Save the silhouette score obtained for each cluster count to CSV.
data = dict(zip(
    ('Number of Clusters', 'Silhouette Score'),
    (list(n_clusters_range), silhouette_scores),
))
df = pd.DataFrame(data)

df.to_csv('SC_Silhouette_Scores.csv', index=False)

In [None]:
# Silhouette score per candidate cluster count.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.plot(n_clusters_range, silhouette_scores, marker='o')
ax.set_title('Silhouette Scores for Different Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
plt.show()

In [None]:
# Report the winning silhouette score and the cluster count that produced it.
print(
    f"Best Silhouette Score: {best_silhouette_score}",
    f"Number of Clusters: {best_n_clusters}",
    sep="\n",
)

In [None]:
def extract_cluster_means(cluster_labels, original_data, features):
    """Return the per-cluster mean of the selected feature columns.

    Rows of ``original_data`` are matched to ``cluster_labels`` by position.
    The result maps every distinct label to the column-wise mean (over
    ``features``) of the rows carrying that label.
    """
    def mean_for(label):
        # Positional indices of the rows assigned to this label.
        row_positions = np.asarray(cluster_labels == label).nonzero()[0]
        members = original_data.iloc[row_positions]
        return members[features].mean(axis=0)

    return {label: mean_for(label) for label in np.unique(cluster_labels)}

features_of_interest = ['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']

# Refit spectral clustering at the cluster count with the highest silhouette
# score. The seed is fixed because the algorithm has randomly initialised
# steps; without it, re-running this cell can assign different labels and
# therefore print different cluster means.
best_split_idx = np.argmax(silhouette_scores)
best_n_clusters = n_clusters_range[best_split_idx]
spectral_clustering_best = SpectralClustering(n_clusters=best_n_clusters, affinity='nearest_neighbors', random_state=42)
best_cluster_labels = spectral_clustering_best.fit_predict(features_pca)
cluster_means = extract_cluster_means(best_cluster_labels, new_restaurant, features_of_interest)

print(f"Number of Clusters: {len(cluster_means)}")

# One block per cluster: the label followed by its feature means.
for label, means in cluster_means.items():
    print(f"Cluster {label} Mean:")
    print(means)
    print()

In [None]:
from scipy.spatial.distance import euclidean

def find_most_matching_restaurant(cluster_means, restaurants_data, features_of_interest):
    """Find, for every cluster mean, the restaurant closest to it.

    Distances are Euclidean and computed on the feature values exactly as
    given (no rescaling is applied here).

    Args:
        cluster_means: Mapping of cluster label to a vector of feature means,
            ordered like ``features_of_interest``.
        restaurants_data: DataFrame holding a ``'Name'`` column plus the
            feature columns.
        features_of_interest: Feature column names used for the distance.

    Returns:
        Dict keyed by the enumeration index of each cluster (0, 1, ...), whose
        value is the ``'Name'`` of the nearest row, or ``None`` when no row has
        a finite distance (for example an empty table). Ties go to the earliest
        row.
    """
    # Pull the features out once; one vectorized distance per cluster replaces
    # a Python-level pass over every row for every cluster.
    feature_matrix = restaurants_data[features_of_interest].to_numpy(dtype=float)
    names = restaurants_data['Name'].to_numpy()

    most_matching_restaurants = {}
    for idx, cluster_mean in enumerate(cluster_means.values()):
        centre = np.asarray(cluster_mean, dtype=float)
        distances = np.linalg.norm(feature_matrix - centre, axis=1)
        # Rows with a missing feature value must never be selected.
        distances = np.where(np.isnan(distances), np.inf, distances)

        most_matching_restaurant = None
        if distances.size:
            nearest = int(np.argmin(distances))  # first minimum wins on ties
            if np.isfinite(distances[nearest]):
                most_matching_restaurant = names[nearest]
        most_matching_restaurants[idx] = most_matching_restaurant
    return most_matching_restaurants

# Recompute the cluster means, then look up the restaurant nearest to each one.
cluster_means = extract_cluster_means(
    cluster_labels=best_cluster_labels,
    original_data=new_restaurant,
    features=features_of_interest,
)
most_matching_restaurants = find_most_matching_restaurant(
    cluster_means=cluster_means,
    restaurants_data=new_restaurant,
    features_of_interest=features_of_interest,
)

# Clusters are reported with 1-based numbering.
for cluster_label in most_matching_restaurants:
    restaurant_name = most_matching_restaurants[cluster_label]
    print(f"Cluster {cluster_label + 1} Most Matching Restaurant: {restaurant_name}")

In [None]:
# Write the two PCA coordinates together with the assigned cluster label.
data_to_save = np.column_stack((features_pca, best_cluster_labels))
df = pd.DataFrame(data_to_save)
df.columns = ['PCA1', 'PCA2', 'ClusterLabel']
df.to_csv('Best_Silhouette_Score.csv', index=False)

In [None]:
# PCA scatter of the restaurants coloured by their spectral cluster label.
plt.figure(figsize=(8, 6))
ax = plt.gca()
points = ax.scatter(features_pca[:, 0], features_pca[:, 1], c=best_cluster_labels, cmap='viridis', marker='o', edgecolor='k')
plt.colorbar(points, label='Cluster Label')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title(f'Spectral Clustering with {best_n_clusters} Clusters')
ax.grid(True)
plt.show()

## 5.4 Gaussian Mixture Model (GMM)

#### A Gaussian Mixture Model (GMM) is a parametric probability density function (PDF) that represents the overall distribution of the data as a weighted sum of multiple Gaussian component densities. Each component is defined by its own mean vector and covariance matrix, and an estimated weight determines its relative importance in modelling the overall distribution. GMMs are used in unsupervised machine learning under the assumption that the data points are generated from a mixture of a finite number of Gaussian distributions, each with its own set of unknown parameters.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Build three feature views of the restaurant table and standardize each one.
scaler = StandardScaler()

features_clustering = new_restaurant[['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']]
position_clustering = new_restaurant[['Position', 'Score']]
quality_clustering = new_restaurant[['Score', 'Ratings']]

# The same scaler object is refit on each view in turn, so after this cell it
# holds the statistics of the last view (quality) only.
features_standardized, position_standardized, quality_standardized = (
    scaler.fit_transform(view)
    for view in (features_clustering, position_clustering, quality_clustering)
)

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture for 2..10 components and record, for each fit, the
# BIC and the silhouette score of the resulting hard assignments.
num_clusters_range = range(2, 11)

bic_scores = []
silhouette_scores = []

for n_clusters in num_clusters_range:
    gmm = GaussianMixture(n_components=n_clusters, random_state=10)
    gmm.fit(features_standardized)
    bic_scores.append(gmm.bic(features_standardized))
    cluster_labels = gmm.predict(features_standardized)
    silhouette_scores.append(silhouette_score(features_standardized, cluster_labels))

# The middle column holds gmm.bic() values, so the header must say BIC
# (it previously read "AIC", which was never computed).
print("Number of Clusters\tBIC\tSilhouette Score")
for n_clusters, bic, silhouette in zip(num_clusters_range, bic_scores, silhouette_scores):
    print(f"{n_clusters}\t\t\t{bic:.2f}\t{silhouette:.3f}")

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(num_clusters_range, bic_scores, marker='s', color='red', label='BIC')
# Only the BIC curve is drawn here, so the title must not promise an AIC curve.
plt.title('BIC Values for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.xticks(num_clusters_range)
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
# Save the GMM silhouette score obtained for each cluster count to CSV.
data = {}
data['Number of Clusters'] = [*num_clusters_range]
data['Silhouette Score'] = silhouette_scores

df = pd.DataFrame(data)
df.to_csv('GMM_Score.csv', index=False)

In [None]:
# Silhouette score of the GMM assignments per candidate cluster count.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.plot(num_clusters_range, silhouette_scores, marker='o')
ax.set_title('Silhouette Score by Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
plt.show()

In [None]:
def extract_cluster_means(cluster_labels, original_data, features):
    """Compute the mean of ``features`` for the rows of each cluster.

    ``cluster_labels`` and the rows of ``original_data`` are aligned by
    position. Returns a dict mapping each distinct label to the column-wise
    mean of that cluster's rows.
    """
    means_by_label = {}
    for label in np.unique(cluster_labels):
        # Positional indices of the rows assigned to this label.
        member_positions = np.asarray(cluster_labels == label).nonzero()[0]
        subset = original_data.iloc[member_positions][features]
        means_by_label[label] = subset.mean(axis=0)
    return means_by_label

features_of_interest = ['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']

# Refit the mixture at the cluster count with the highest silhouette score.
top_score = max(silhouette_scores)
best_num_clusters = num_clusters_range[silhouette_scores.index(top_score)]
final_gmm = GaussianMixture(n_components=best_num_clusters, random_state=10)
best_cluster_labels = final_gmm.fit_predict(features_standardized)

cluster_means = extract_cluster_means(best_cluster_labels, new_restaurant, features_of_interest)

print(f"Number of Clusters: {len(cluster_means)}")

# One block per cluster: header line, the feature means, then a blank line.
for label in cluster_means:
    means = cluster_means[label]
    print(f"Cluster {label} Mean:", means, "", sep="\n")

In [None]:
from scipy.spatial.distance import euclidean

def find_most_matching_restaurant(cluster_means, restaurants_data, features_of_interest):
    """Find, for every cluster mean, the restaurant closest to it.

    Distances are Euclidean and computed on the feature values exactly as
    given (no rescaling is applied here).

    Args:
        cluster_means: Mapping of cluster label to a vector of feature means,
            ordered like ``features_of_interest``.
        restaurants_data: DataFrame holding a ``'Name'`` column plus the
            feature columns.
        features_of_interest: Feature column names used for the distance.

    Returns:
        Dict keyed by the enumeration index of each cluster (0, 1, ...), whose
        value is the ``'Name'`` of the nearest row, or ``None`` when no row has
        a finite distance (for example an empty table). Ties go to the earliest
        row.
    """
    # Pull the features out once; one vectorized distance per cluster replaces
    # a Python-level pass over every row for every cluster.
    feature_matrix = restaurants_data[features_of_interest].to_numpy(dtype=float)
    names = restaurants_data['Name'].to_numpy()

    most_matching_restaurants = {}
    for idx, cluster_mean in enumerate(cluster_means.values()):
        centre = np.asarray(cluster_mean, dtype=float)
        distances = np.linalg.norm(feature_matrix - centre, axis=1)
        # Rows with a missing feature value must never be selected.
        distances = np.where(np.isnan(distances), np.inf, distances)

        most_matching_restaurant = None
        if distances.size:
            nearest = int(np.argmin(distances))  # first minimum wins on ties
            if np.isfinite(distances[nearest]):
                most_matching_restaurant = names[nearest]
        most_matching_restaurants[idx] = most_matching_restaurant
    return most_matching_restaurants

# Recompute the cluster means, then look up the restaurant nearest to each one.
cluster_means = extract_cluster_means(
    cluster_labels=best_cluster_labels,
    original_data=new_restaurant,
    features=features_of_interest,
)
most_matching_restaurants = find_most_matching_restaurant(
    cluster_means=cluster_means,
    restaurants_data=new_restaurant,
    features_of_interest=features_of_interest,
)

# Clusters are reported with 1-based numbering.
for cluster_label in most_matching_restaurants:
    restaurant_name = most_matching_restaurants[cluster_label]
    print(f"Cluster {cluster_label + 1} Most Matching Restaurant: {restaurant_name}")

In [None]:
best_num_clusters = num_clusters_range[np.argmax(silhouette_scores)]

# Use the same seed (10) as the GMM fits that produced the silhouette scores
# and the cluster means. A different seed here (it was 42) can converge to a
# different partition, so the exported and plotted labels would not be the
# ones that were scored and summarised.
gmm = GaussianMixture(n_components=best_num_clusters, random_state=10)
gmm.fit(features_standardized)
cluster_labels = gmm.predict(features_standardized)

# Two-component PCA is used only to get plottable coordinates.
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)

data = pd.DataFrame({
    'PC1': features_pca[:, 0],
    'PC2': features_pca[:, 1],
    'Cluster Label': cluster_labels
})

data.to_csv('GMM_Clustering.csv', index=False)

In [None]:
# PCA scatter of the restaurants coloured by their GMM cluster label.
plt.figure(figsize=(8, 6))
ax = plt.gca()
points = ax.scatter(features_pca[:, 0], features_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.8, edgecolors='k')
plt.colorbar(points, label='Cluster')
ax.set_title(f'PCA of Clusters (Best Number of Clusters: {best_num_clusters})')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
plt.show()

## 5.5 Hierarchical Clustering

#### Hierarchical clustering is a technique that groups data objects into a tree-like structure (a dendrogram). It works by successively merging or splitting existing groups, and the tree records the order in which clusters are merged or divided. In the divisive (top-down) form, large clusters are split into smaller ones; in the agglomerative (bottom-up) form used below, the most similar small clusters are merged step by step.

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Build three feature views of the restaurant table and standardize each one.
scaler = StandardScaler()

features_clustering = new_restaurant[['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']]
position_clustering = new_restaurant[['Position', 'Score']]
quality_clustering = new_restaurant[['Score', 'Ratings']]

# The same scaler object is refit on each view in turn, so after this cell it
# holds the statistics of the last view (quality) only.
features_standardized, position_standardized, quality_standardized = (
    scaler.fit_transform(view)
    for view in (features_clustering, position_clustering, quality_clustering)
)

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Restrict hierarchical clustering to the 20,000 highest-Score restaurants.
sorted_new_restaurant = new_restaurant.sort_values(by='Score', ascending=False)
top_records = sorted_new_restaurant.head(20000)

columns_for_clustering = ['Position', 'Score', 'Ratings', 'Latitude', 'Longitude']
data_to_cluster = top_records[columns_for_clustering]

# Standardize this subset with a fresh scaler.
scaler = StandardScaler()
data_standardized = scaler.fit_transform(data_to_cluster)

# Ward linkage over the standardized subset; the merge table is saved to CSV.
linkage_matrix = linkage(data_standardized, method='ward')

linkage_columns = ['idx1', 'idx2', 'distance', 'sample_count']
linkage_df = pd.DataFrame(linkage_matrix, columns=linkage_columns)
linkage_df.to_csv('Linkage_Matrix.csv', index=False)

In [None]:
# Truncated dendrogram (four levels below the root) of the Ward linkage.
plt.figure(figsize=(12, 8))

dendrogram_options = dict(
    truncate_mode='level',
    p=4,
    leaf_rotation=45,
    leaf_font_size=6,
)
dendrogram(linkage_matrix, **dendrogram_options)

ax = plt.gca()
ax.set_title('Hierarchical Clustering Dendrogram (Truncated)')
ax.set_xlabel('Sample Index (or Cluster Size)')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Score Ward-linkage agglomerative clusterings for 2..10 clusters by silhouette.
silhouette_scores = []
cluster_range = range(2, 11)

for n_clusters in cluster_range:
    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    agg_clustering.fit(data_standardized)
    cluster_labels = agg_clustering.labels_

    silhouette_avg = silhouette_score(data_standardized, cluster_labels)
    silhouette_scores += [silhouette_avg]

    print(f'Number of clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.3f}')

# The cluster count with the highest silhouette score wins.
best_n_clusters = cluster_range[np.argmax(silhouette_scores)]
print(f'Best Number of Clusters: {best_n_clusters}')

# Persist the score table.
data = dict(zip(
    ('Number of Clusters', 'Silhouette Score'),
    (list(cluster_range), silhouette_scores),
))
df = pd.DataFrame(data)

df.to_csv('Hiearchical_Score.csv', index=False)

In [None]:
# Silhouette score of the agglomerative clustering per candidate cluster count.
plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.plot(cluster_range, silhouette_scores, marker='o')
ax.set_title('Silhouette Score vs. Number of Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
plt.show()

In [None]:
def extract_cluster_means(cluster_labels, original_data, features):
    """Average the ``features`` columns within each cluster.

    The i-th label is taken to describe the i-th row (by position) of
    ``original_data``. Returns a dict from label to the column-wise mean of
    the rows that carry that label.
    """
    distinct_labels = np.unique(cluster_labels)
    # Positional row indices for every label, in the same order as the labels.
    positions = (np.asarray(cluster_labels == label).nonzero()[0] for label in distinct_labels)
    return {
        label: original_data.iloc[rows][features].mean(axis=0)
        for label, rows in zip(distinct_labels, positions)
    }

features_of_interest = ['Position', 'Score', 'Ratings', 'Longitude', 'Latitude']

best_cluster_labels = AgglomerativeClustering(n_clusters=best_n_clusters, linkage='ward').fit_predict(data_standardized)

# The labels describe the rows of data_standardized, which was built from
# top_records (the 20,000 highest-Score rows, in sorted order). The means are
# looked up by position, so they must be taken from top_records; using
# new_restaurant here would average unrelated rows.
cluster_means = extract_cluster_means(best_cluster_labels, top_records, features_of_interest)

print(f"Number of Clusters: {len(cluster_means)}")

# One block per cluster: the label followed by its feature means.
for label, means in cluster_means.items():
    print(f"Cluster {label} Mean:")
    print(means)
    print()

In [None]:
def find_most_matching_restaurant(cluster_means, restaurants_data, features_of_interest):
    """Find, for every cluster mean, the restaurant closest to it.

    Distances are Euclidean and computed on the feature values exactly as
    given (no rescaling is applied here). The computation uses NumPy only, so
    this cell does not depend on ``euclidean`` having been imported by an
    earlier cell.

    Args:
        cluster_means: Mapping of cluster label to a vector of feature means,
            ordered like ``features_of_interest``.
        restaurants_data: DataFrame holding a ``'Name'`` column plus the
            feature columns.
        features_of_interest: Feature column names used for the distance.

    Returns:
        Dict keyed by the enumeration index of each cluster (0, 1, ...), whose
        value is the ``'Name'`` of the nearest row, or ``None`` when no row has
        a finite distance (for example an empty table). Ties go to the earliest
        row.
    """
    # Pull the features out once; one vectorized distance per cluster replaces
    # a Python-level pass over every row for every cluster.
    feature_matrix = restaurants_data[features_of_interest].to_numpy(dtype=float)
    names = restaurants_data['Name'].to_numpy()

    most_matching_restaurants = {}
    for idx, cluster_mean in enumerate(cluster_means.values()):
        centre = np.asarray(cluster_mean, dtype=float)
        distances = np.linalg.norm(feature_matrix - centre, axis=1)
        # Rows with a missing feature value must never be selected.
        distances = np.where(np.isnan(distances), np.inf, distances)

        most_matching_restaurant = None
        if distances.size:
            nearest = int(np.argmin(distances))  # first minimum wins on ties
            if np.isfinite(distances[nearest]):
                most_matching_restaurant = names[nearest]
        most_matching_restaurants[idx] = most_matching_restaurant
    return most_matching_restaurants

# Look up the restaurant nearest to each hierarchical cluster mean.
most_matching_restaurants = find_most_matching_restaurant(
    cluster_means=cluster_means,
    restaurants_data=new_restaurant,
    features_of_interest=features_of_interest,
)

# Clusters are reported with 1-based numbering.
for cluster_label in most_matching_restaurants:
    restaurant_name = most_matching_restaurants[cluster_label]
    print(f"Cluster {cluster_label + 1} Most Matching Restaurant: {restaurant_name}")

In [None]:
# Final agglomerative fit at the chosen cluster count.
agg_clustering = AgglomerativeClustering(n_clusters=best_n_clusters, linkage='ward')
agg_clustering.fit(data_standardized)
cluster_labels = agg_clustering.labels_

# Two-component PCA is used only to get plottable coordinates.
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data_standardized)

# Coordinates plus the cluster label, written to CSV.
df = pd.DataFrame(data_reduced[:, :2], columns=['PC1', 'PC2'])
df['Cluster'] = cluster_labels

df.to_csv('Hiearchical_Clustering.csv', index=False)

In [None]:
# One scatter series per cluster so each gets its own colour and legend entry.
plt.figure(figsize=(8, 6))
ax = plt.gca()

for cluster_id in np.unique(cluster_labels):
    in_cluster = cluster_labels == cluster_id
    cluster_points = data_reduced[in_cluster]
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster_id}', edgecolors='k')

ax.set_title(f'Agglomerative Clustering (Best number of clusters: {best_n_clusters})')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
plt.show()