In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Location of the CSV to analyse; a relative path, so it is resolved against
# the working directory the notebook runs in.
file_path = "TASK2_dataset.csv"
# Read the whole file into the DataFrame that the following cells work on.
df = pd.read_csv(file_path)

In [5]:
# Columns that hold the keywords describing each row of df.
keyword_columns = ['keyword_1', 'keyword_2', 'keyword_3']

# Vocabulary of distinct keywords. pd.unique keeps first-appearance order, so
# the column order of every vector built from this list is identical on each
# run; list(set(...)) ordering of strings changes between interpreter sessions.
dataset_keywords = pd.unique(df[keyword_columns].to_numpy().ravel()).tolist()

# One [keyword_1, keyword_2, keyword_3] list per row, read straight from the
# underlying array instead of looping over df.iterrows().
keyword_sets = df[keyword_columns].to_numpy().tolist()

def create_keyword_vector(keywords, reference_keywords):
    """Build a bag-of-keywords count vector.

    Args:
        keywords: Iterable of keywords to encode.
        reference_keywords: List defining the vocabulary; position ``i`` of
            the result counts occurrences of ``reference_keywords[i]``.

    Returns:
        1-D float array of length ``len(reference_keywords)``. Keywords that
        are not in the vocabulary contribute nothing.
    """
    counts = np.zeros(len(reference_keywords))
    for word in keywords:
        try:
            slot = reference_keywords.index(word)
        except ValueError:
            # Not part of the vocabulary: skip it.
            continue
        counts[slot] += 1
    return counts

# Encode every row's keywords against the shared vocabulary and stack the
# resulting count vectors into one array (one row per entry of keyword_sets).
keyword_vectors = np.array([
    create_keyword_vector(row_keywords, dataset_keywords)
    for row_keywords in keyword_sets
])

In [None]:
def apply_pca(data, num_components=2):
    """Project ``data`` onto its leading principal components.

    Args:
        data: 2-D array with one sample per row and one feature per column.
        num_components: Number of principal directions to keep.

    Returns:
        Array with one row per sample and ``num_components`` columns holding
        the centred samples expressed in the principal-component basis.
    """
    centered = data - np.mean(data, axis=0)
    feature_covariance = np.cov(centered, rowvar=False)

    # eigh is used because the covariance matrix is symmetric; it reports
    # eigenvalues in ascending order, hence the reversal below.
    variances, directions = np.linalg.eigh(feature_covariance)
    by_variance_desc = np.argsort(variances)[::-1]
    kept = by_variance_desc[:num_components]
    basis = directions[:, kept]

    return np.dot(centered, basis)

# Fix the global NumPy seed so the random draws made by the clustering code in
# later cells are repeatable (apply_pca itself draws no random numbers).
np.random.seed(42)

# Reduce the keyword count vectors to two dimensions, then shrink the
# projected coordinates by a constant factor of 3.
dimensionality_reduced_vectors = apply_pca(keyword_vectors, 2)
scaled_vectors = np.divide(dimensionality_reduced_vectors, 3)

In [None]:
def initialize_random_centroids(data, k):
    """Pick ``k`` distinct rows of ``data`` as starting centroids.

    Rows are drawn without replacement using NumPy's global random state.
    """
    chosen_rows = np.random.choice(len(data), size=k, replace=False)
    return data[chosen_rows]

def assign_to_clusters(data, centroids):
    """Return, for every row of ``data``, the index of its nearest centroid.

    Nearness is the Euclidean distance; ties go to the lowest centroid index.
    """
    # Broadcasting produces one offset vector per (point, centroid) pair.
    offsets = data[:, np.newaxis] - centroids
    point_to_centroid = np.linalg.norm(offsets, axis=2)
    return point_to_centroid.argmin(axis=1)

def compute_new_centroids(data, labels, k):
    """Recompute each of the ``k`` centroids as the mean of its members.

    A cluster with no members is re-seeded with a randomly chosen row of
    ``data`` (drawn from NumPy's global random state).
    """
    updated = []
    for cluster_id in range(k):
        members = data[labels == cluster_id]
        if len(members) > 0:
            updated.append(members.mean(axis=0))
        else:
            updated.append(data[np.random.choice(len(data))])
    return np.array(updated)

def k_means_clustering(data, k, max_iterations=100, tolerance=1e-4):
    """Cluster ``data`` into ``k`` groups by alternating assign/update steps.

    Args:
        data: 2-D array with one sample per row.
        k: Number of clusters.
        max_iterations: Upper bound on the number of assign/update rounds.
        tolerance: Stop once the norm of the centroid change in a round falls
            below this value.

    Returns:
        Tuple ``(cluster_labels, centroids)``: the label of every sample from
        the last assignment step, and the centroids from the last update step.
    """
    centroids = initialize_random_centroids(data, k)
    cluster_labels = None
    for _ in range(max_iterations):
        old_centroids = centroids.copy()
        cluster_labels = assign_to_clusters(data, centroids)
        centroids = compute_new_centroids(data, cluster_labels, k)
        if np.linalg.norm(centroids - old_centroids) < tolerance:
            break
    if cluster_labels is None:
        # max_iterations <= 0: the loop never ran, so label the samples
        # against the initial centroids rather than failing on an unbound name.
        cluster_labels = assign_to_clusters(data, centroids)
    return cluster_labels, centroids

In [None]:
# Candidate cluster counts for the elbow analysis, and the within-cluster sum
# of squared errors obtained for each of them.
k_values = range(2, 10)
sse = []
for k in k_values:
    labels, centroids = k_means_clustering(scaled_vectors, k)
    # Offset of every sample from the centroid of the cluster it was assigned to.
    residuals = scaled_vectors - centroids[labels]
    sse.append(np.square(residuals).sum())


In [None]:
# Elbow plot: SSE against the number of clusters tried.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, sse, marker='o', linestyle='--')
ax.set(
    xlabel='Number of Clusters (k)',
    ylabel='Sum of Squared Errors (SSE)',
    title='Elbow Method for Optimal k',
)
plt.show()

In [None]:
# Final clustering with the chosen number of clusters. The outcome depends on
# the state of NumPy's global random generator when this cell runs, because
# the initial centroids are drawn at random.
num_clusters = 5
cluster_labels, final_centroids = k_means_clustering(data=scaled_vectors, k=num_clusters)

# Attach each row's cluster index to the DataFrame for the later per-cluster
# genre statistics.
df['Cluster'] = cluster_labels

In [None]:
# Scatter plot of the 2-D projected samples, coloured by cluster, with the
# centroids overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
colors = ['red', 'blue', 'green', 'purple', 'orange']
for i in range(num_clusters):
    members = scaled_vectors[cluster_labels == i]
    # Cycle through the palette so that choosing more clusters than there are
    # colours reuses colours instead of raising IndexError.
    ax.scatter(members[:, 0], members[:, 1],
               color=colors[i % len(colors)], label=f'Cluster {i}')
ax.scatter(final_centroids[:, 0], final_centroids[:, 1],
           color='black', marker='x', s=200, label='Centroids')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('Cluster Visualization')
ax.legend()
plt.show()

In [None]:
def compute_silhouette_score(data, labels):
    """Mean silhouette coefficient of a clustering (Euclidean distance).

    For each sample, ``a`` is the mean distance to the *other* members of its
    own cluster and ``b`` is the smallest mean distance to the members of any
    other cluster; the sample's score is ``(b - a) / max(a, b)``.

    Args:
        data: 2-D array-like with one sample per row.
        labels: 1-D array-like with the cluster label of every sample.

    Returns:
        The average of the per-sample scores. Conventions used:
        a sample that is alone in its cluster scores 0; a sample for which
        both ``a`` and ``b`` are 0 (coincident points) scores 0; when there is
        no other cluster at all, ``b`` is taken as 0.
    """
    data = np.asarray(data, dtype=float)
    labels = np.asarray(labels)

    # Build every cluster's membership mask once instead of once per sample.
    cluster_masks = [(label, labels == label) for label in np.unique(labels)]

    scores = []
    for i in range(len(data)):
        distances = np.linalg.norm(data - data[i], axis=1)
        own_label = labels[i]
        own_mask = labels == own_label
        own_size = int(own_mask.sum())

        if own_size <= 1:
            # Singleton cluster: there is no other member to measure against.
            scores.append(0.0)
            continue

        # The sample's distance to itself is 0, so summing over the whole
        # cluster and dividing by (size - 1) averages over the other members.
        intra_distance = distances[own_mask].sum() / (own_size - 1)

        inter_distances = [
            distances[mask].mean()
            for label, mask in cluster_masks
            if label != own_label
        ]
        nearest_cluster_distance = min(inter_distances) if inter_distances else 0

        denominator = max(intra_distance, nearest_cluster_distance)
        if denominator > 0:
            scores.append((nearest_cluster_distance - intra_distance) / denominator)
        else:
            # a == b == 0: avoid 0/0 and treat the sample as neutral.
            scores.append(0.0)
    return np.mean(scores)

# Quality of the final clustering, measured on the scaled 2-D projection.
silhouette = compute_silhouette_score(data=scaled_vectors, labels=cluster_labels)
print("Silhouette Score:", silhouette)

In [None]:
# Share of each genre inside every cluster, expressed as a percentage.
genre_share = df.groupby('Cluster')['genre'].value_counts(normalize=True)
cluster_genre_distribution = genre_share.mul(100)
print("\nCluster Genre Distribution:\n", cluster_genre_distribution)

def apply_pca_to_single_vector(data, reference_data, num_components=2):
    """Project ``data`` into the principal-component space of ``reference_data``.

    The mean, covariance matrix and eigenvectors are all derived from
    ``reference_data``; ``data`` is only centred with the reference mean and
    projected.

    Args:
        data: Vector(s) to project, with the same feature layout as
            ``reference_data``.
        reference_data: 2-D array (one sample per row) that defines the
            principal components.
        num_components: Number of principal directions to keep.

    Returns:
        ``data`` expressed in the leading ``num_components`` principal
        directions of ``reference_data``.
    """
    reference_mean = np.mean(reference_data, axis=0)
    mean_adjusted = data - reference_mean

    # Feed np.cov the centred reference with rowvar=False, exactly as
    # apply_pca does. Covariance of the uncentred data is mathematically the
    # same but can differ in the last bits, and with (near-)equal eigenvalues
    # that is enough for eigh to return a different basis than the one the
    # reference projection was computed in.
    centered_reference = reference_data - reference_mean
    covariance_matrix = np.cov(centered_reference, rowvar=False)

    # eigh reports eigenvalues in ascending order; reverse to get the
    # directions of largest variance first.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    principal_components = eigenvectors[:, sorted_indices[:num_components]]
    return np.dot(mean_adjusted, principal_components)

In [None]:
def predict_genre_for_new_song(keywords, dataset_keywords, centroids, df):
    """Predict a genre for a song described by ``keywords``.

    The keywords are encoded as a count vector, projected and scaled the same
    way as the clustered data, matched to the nearest centroid, and the most
    frequent genre of that cluster is returned.

    Args:
        keywords: Keywords describing the new song.
        dataset_keywords: Vocabulary list used to encode the keywords.
        centroids: Cluster centroids in the scaled 2-D projection space.
        df: DataFrame with a ``'Cluster'`` and a ``'genre'`` column.

    Returns:
        The most frequent genre in the nearest cluster, or ``"Unknown"`` when
        that cluster has no rows or no non-missing genre.

    Note:
        Reads the module-level ``keyword_vectors`` array as the PCA reference.
    """
    keyword_vector = create_keyword_vector(keywords, dataset_keywords).reshape(1, -1)
    # The division by 3 mirrors the scaling applied to the projected data
    # before it was clustered.
    reduced_vector = apply_pca_to_single_vector(keyword_vector, keyword_vectors, num_components=2) / 3
    distances = np.linalg.norm(centroids - reduced_vector, axis=1)
    closest_cluster = np.argmin(distances)
    cluster_genres = df.loc[df['Cluster'] == closest_cluster, 'genre']
    # mode() drops missing values, so it is empty both for an empty cluster
    # and for a cluster whose genres are all missing; checking the result
    # covers both cases instead of failing on the [0] lookup.
    genre_modes = cluster_genres.mode()
    return genre_modes.iloc[0] if not genre_modes.empty else "Unknown"

# Keyword triples describing the songs to classify.
new_songs = [
    ['piano', 'calm', 'slow'],
    ['guitar', 'emotional', 'distorted'],
    ['synth', 'mellow', 'distorted'],
]

# Map each song (as a hashable tuple of its keywords) to its predicted genre.
predicted_genres = {}
for song in new_songs:
    predicted_genres[tuple(song)] = predict_genre_for_new_song(
        song, dataset_keywords, final_centroids, df
    )

print("\nNew Song Genre Predictions:")
for song, genre in predicted_genres.items():
    print(f"Keywords: {song} -> Assigned Genre: {genre}")
