Add your import statements and the database connection statements in the code block below.

Database file path: /course/data/CSE-578/dinofunworld.db

In [2]:
# import statements here
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist, squareform

# Connect to the database
# Location of the SQLite database file given in the assignment instructions.
db_path = '/course/data/CSE-578/dinofunworld.db'
# Module-level connection and cursor: the Question 1 cell runs its queries
# through `cursor` and closes `conn` once it has read the sequences.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

**Question 1:**

Create and display a dendrogram of the trajectories of the 5 visitors specified above. The clustering algorithm used to create the dendrogram should use the average distance between points in a cluster.

**NOTE:**

* You can reuse the distance matrix from Assignment 3.
* Make sure to use the correct clustering algorithm.
* The dendrogram  should have the following parameters:
    * Title should be '**Dendrogram of Trajectories of 5 Visitors to the Park**'
    * xlabel should be '**Visitor ID**' and ylabel should be '**Distance**'.
* The plot must not set any of the following:
    * do not set figure size
    * do not set font size or font-weight for titles and labels




In [0]:
### TEST FUNCTION: test_question1
# DO NOT MODIFY OR REMOVE THE ABOVE LINE
# your code here
# The five visitors whose trajectories are compared
visitor_ids = [165316, 1835254, 296394, 404385, 448990]

# Load each visitor's trajectory: the stored value is a dash-separated string
# of integers, parsed here into a list of ints. A visitor with no row in the
# table gets an empty trajectory.
sequences = {}
for visitor_id in visitor_ids:
    row = cursor.execute(
        "SELECT sequence FROM sequences WHERE visitorID = ?", (visitor_id,)
    ).fetchone()
    if not row:
        sequences[visitor_id] = []
        continue
    tokens = row[0].replace('-', ' ').split()
    sequences[visitor_id] = [int(token) for token in tokens]

# All data has been read; `conn` and `cursor` are unusable after this point.
conn.close()

# Build a rectangular matrix (one row per visitor, in `visitor_ids` order) by
# right-padding shorter trajectories with 0 up to the longest length.
max_length = max(map(len, sequences.values()))
padded_rows = []
for trajectory in sequences.values():
    missing = max_length - len(trajectory)
    padded_rows.append(trajectory + [0] * missing)
sequence_matrix = np.array(padded_rows)

# Compute pairwise distances using edit distance (Levenshtein-like approach)
def edit_distance(seq1, seq2):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``seq1`` into ``seq2``.

    Args:
        seq1: First indexable sequence (list, string, 1-D array, ...).
        seq2: Second indexable sequence, compared element-wise with ``==``.

    Returns:
        The edit distance as a NumPy float (the DP table is a float array).
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((rows, cols))

    # Borders: converting to/from an empty prefix costs one edit per element.
    table[0, :] = np.arange(cols)
    table[:, 0] = np.arange(rows)

    for r in range(1, rows):
        current = seq1[r - 1]
        for c in range(1, cols):
            if current == seq2[c - 1]:
                # Matching elements extend the previous alignment for free.
                table[r, c] = table[r - 1, c - 1]
            else:
                # Cheapest of deletion, insertion or substitution, plus one.
                table[r, c] = 1 + min(
                    table[r - 1, c],
                    table[r, c - 1],
                    table[r - 1, c - 1],
                )

    return table[rows - 1, cols - 1]

# Compute the symmetric pairwise edit-distance matrix.
# Distances are taken on the raw trajectories rather than the zero-padded
# matrix: edit distance already handles unequal lengths, and trailing padding
# changes the result (e.g. [1, 2] vs [3, 1, 2] is 1 edit apart, but 2 once
# both are padded to a common length of 5).
trajectories = [sequences[visitor_id] for visitor_id in visitor_ids]
n_visitors = len(trajectories)
distance_matrix = np.zeros((n_visitors, n_visitors))
for i in range(n_visitors):
    for j in range(i + 1, n_visitors):
        dist = edit_distance(trajectories[i], trajectories[j])
        distance_matrix[i][j] = dist
        distance_matrix[j][i] = dist

# Convert the square matrix to the condensed (upper-triangle) vector that
# scipy's linkage expects for precomputed distances.
distance_condensed = squareform(distance_matrix)

# Average linkage: the distance between two clusters is the average distance
# between their points, as the question requires.
linkage_matrix = sch.linkage(distance_condensed, method='average')

# Plot the dendrogram. The figure is created with default settings because
# the assignment forbids setting a figure size or font size/weight.
plt.figure()
sch.dendrogram(linkage_matrix, labels=visitor_ids, leaf_rotation=45)
plt.title("Dendrogram of Trajectories of 5 Visitors to the Park")
plt.xlabel("Visitor ID")
plt.ylabel("Distance")
plt.show()