## Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import kagglehub
from datasets import load_dataset
import os
from typing import Dict, List, Set, Tuple
from tqdm.notebook import tqdm
import csv

In [1]:
import pandas as pd

## Basic Setup

In [2]:
# Load the pre-split train / validation / test tables from CSV files in the
# local data/ directory. Later cells read these three frames by name.
train_df = pd.read_csv('data/train.csv')
val_df = pd.read_csv('data/val.csv')
test_df = pd.read_csv('data/test.csv')

In [None]:
def filter_playlists_by_Y_count(df, top_k):
    """
    Keep only the playlists that own at least `top_k` rows labelled 'Y'.

    Parameters:
    - df: Pandas DataFrame with (at least) 'playlist_id' and 'XY' columns.
    - top_k: Smallest number of 'Y' rows a playlist needs in order to survive.

    Returns:
    - A new DataFrame holding every row (both 'X' and 'Y') of the surviving
      playlists, with a fresh 0..n-1 index. Playlists without any 'Y' row
      are dropped as well, since they never show up in the per-playlist tally.
    """
    # Tally the 'Y' rows of each playlist
    y_rows = df.loc[df['XY'] == 'Y']
    y_per_playlist = y_rows.groupby('playlist_id').size()

    # IDs of the playlists that reach the threshold
    keep_ids = y_per_playlist.index[y_per_playlist >= top_k]

    # Select all rows belonging to those playlists and renumber them
    keep_mask = df['playlist_id'].isin(keep_ids)
    return df.loc[keep_mask].reset_index(drop=True)


In [None]:
def verify_Y_counts(df, top_k, set_name):
    """
    Print whether every playlist in `df` has at least `top_k` 'Y' rows.

    Parameters:
    - df: Pandas DataFrame with 'playlist_id' and 'XY' columns.
    - top_k: Required minimum number of 'Y' rows per playlist.
    - set_name: Label of the split, used only in the printed message.

    Returns:
    - None. The verdict is printed to stdout.
    """
    # Count 'Y' rows over *all* playlists. Filtering to 'Y' rows before
    # grouping would hide playlists that have no 'Y' row at all, so they
    # could never fail the check.
    Y_counts = (df['XY'] == 'Y').groupby(df['playlist_id']).sum()

    # `.all()` is True for an empty frame: with no playlists, none is too short.
    if (Y_counts >= top_k).all():
        print(f"All playlists in the {set_name} set have at least {top_k} 'Y' songs.")
    else:
        print(f"Some playlists in the {set_name} set have fewer than {top_k} 'Y' songs.")




In [None]:
# Stack the three splits into a single frame; ignore_index=True discards the
# per-split row labels and renumbers the rows 0..n-1.
merged_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
# Bare expression: displayed as the cell output when run in a notebook.
merged_df.shape

In [None]:
from scipy.sparse.linalg import svds

# 'merged_df' and the three split frames are expected to exist already.
# Each of them receives the same two derived string keys:
#   song_id     = track_clean  + ' - ' + artist_clean
#   playlist_id = playlistname + ' - ' + user_id
for _frame in (merged_df, train_df, val_df, test_df):
    _frame['song_id'] = _frame['track_clean'] + ' - ' + _frame['artist_clean']
    _frame['playlist_id'] = _frame['playlistname'] + ' - ' + _frame['user_id']

# Names of the columns the recommenders work with
columns_to_keep = ['playlist_id', 'song_id', 'XY']


In [None]:
# Report the size of the combined dataset: distinct playlist keys and
# distinct song keys across train, validation and test together.
total_playlists = merged_df['playlist_id'].nunique()
total_songs = merged_df['song_id'].nunique()
print(f"Total unique playlists: {total_playlists}")
print(f"Total unique songs: {total_songs}")

In [None]:
# Minimum number of 'Y' songs a playlist must have to be kept
top_k = 5  # Adjust as needed

# Run the same filter over the three splits, in train / val / test order
train_df_filtered, val_df_filtered, test_df_filtered = (
    filter_playlists_by_Y_count(_split_df, top_k)
    for _split_df in (train_df, val_df, test_df)
)

# Confirm that each filtered split satisfies the threshold
for _label, _filtered in (
    ('Train', train_df_filtered),
    ('Validation', val_df_filtered),
    ('Test', test_df_filtered),
):
    verify_Y_counts(_filtered, top_k, _label)

In [None]:
def generate_mapping(train_data, song_source=None):
    """
    Build ID <-> integer-index lookup tables for playlists and songs.

    Parameters:
    - train_data: DataFrame with a 'playlist_id' column; its distinct
      playlists (in order of first appearance) define the playlist indices.
    - song_source: Optional DataFrame with a 'song_id' column that defines
      the song indices. When omitted, the notebook-level `merged_df` is used
      (the previous, implicit behaviour), so the song index space covers every
      song of every split rather than only those in `train_data`.

    Returns:
    - (playlist_id_to_index, song_id_to_index,
       index_to_playlist_id, index_to_song_id): four dicts.
    """
    if song_source is None:
        # Backward-compatible default: fall back to the global merged frame.
        song_source = merged_df

    unique_playlists = train_data['playlist_id'].unique()
    unique_songs = song_source['song_id'].unique()

    # Forward mappings: ID -> position in the array of unique values
    playlist_id_to_index = {playlist_id: idx for idx, playlist_id in enumerate(unique_playlists)}
    song_id_to_index = {song_id: idx for idx, song_id in enumerate(unique_songs)}

    # Reverse mappings: position -> ID
    index_to_playlist_id = dict(enumerate(unique_playlists))
    index_to_song_id = dict(enumerate(unique_songs))
    return playlist_id_to_index, song_id_to_index, index_to_playlist_id, index_to_song_id
  

## Item Based Collaborative Filtering

In [None]:
from scipy.sparse import coo_matrix, csr_matrix
from scipy.sparse.linalg import svds

def generate_interaction_matrix(train_df, playlist_id_to_index, song_id_to_index):
    """
    Build a sparse playlist x song interaction matrix.

    Parameters:
    - train_df: DataFrame with 'playlist_id' and 'song_id' columns. It is
      only read; no columns are added to it.
    - playlist_id_to_index: Mapping from playlist ID to row index.
    - song_id_to_index: Mapping from song ID to column index.

    Returns:
    - CSR matrix of shape (len(playlist_id_to_index), len(song_id_to_index)).
      Each row of `train_df` contributes 1.0 at its (playlist, song) cell;
      repeated pairs are summed by the COO -> CSR conversion.

    Notes:
    - The supplied mappings are used as given (they were previously ignored
      and rebuilt internally).
    - No filtering on the 'XY' column happens here: every row of `train_df`
      contributes, whichever label it carries.
    """
    # Translate IDs to matrix coordinates without touching the caller's frame
    playlist_index = train_df['playlist_id'].map(playlist_id_to_index)
    song_index = train_df['song_id'].map(song_id_to_index)

    # Rows whose playlist or song is missing from the mappings have no valid
    # coordinate (NaN after .map) and are left out.
    known = playlist_index.notna() & song_index.notna()
    row_indices = playlist_index[known].astype(np.int64).to_numpy()
    col_indices = song_index[known].astype(np.int64).to_numpy()
    data_values = np.ones(len(row_indices))  # presence marker

    num_playlists = len(playlist_id_to_index)
    num_songs = len(song_id_to_index)

    interaction_matrix = coo_matrix(
        (data_values, (row_indices, col_indices)),
        shape=(num_playlists, num_songs)
    ).tocsr()
    return interaction_matrix

In [None]:
# The interaction matrix is large, so it is kept in sparse format throughout;
# converting it to a dense array is not recommended for a dataset of this size.
from sklearn.preprocessing import normalize
# Use the interaction_matrix directly for SVD
# Note: For large sparse matrices, consider using implicit matrix factorization methods

# The number of latent factors is the `k` argument of generate_song_vectors below
def generate_song_vectors(interaction_matrix, k=50):
    """
    Derive latent song vectors from a truncated SVD of the interaction matrix.

    Parameters:
    - interaction_matrix: Sparse playlist x song matrix.
    - k: Number of singular triplets (latent factors) to compute. scipy's
      `svds` requires k to be smaller than both matrix dimensions.

    Returns:
    - (song_factors, song_factors_normalized): `song_factors` is the
      transposed right-singular-vector matrix (one row per song, k columns);
      the second item is the same array passed through sklearn's `normalize`.

    Notes:
    - No mean-centering is applied before the decomposition.
    - The dense U * diag(sigma) * Vt reconstruction that used to be computed
      here was never used and materialised a full playlists x songs array, so
      it has been removed. Only Vt is needed.
    """
    # Only the right singular vectors are used; U and the singular values are discarded.
    _, _, Vt = svds(interaction_matrix, k=k)

    song_factors = Vt.T
    return song_factors, normalize(song_factors)


In [None]:
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have:
# - song_factors: numpy array of shape (num_songs, k)
# - index_to_song_id: mapping from song index to song ID
# - song_id_to_index: mapping from song ID to song index

# Normalize song factors for cosine similarity
# song_factors_normalized = normalize(song_factors)

def recommend_songs_for_playlist(
    X_song_ids,
    song_factors,
    song_factors_normalized,
    song_id_to_index,
    index_to_song_id,
    top_k=25
):
    """
    Generate song recommendations for a playlist given a list of songs in 'X'.

    The playlist is represented by the mean latent vector of its known songs;
    every other song is ranked by cosine similarity to that vector.

    Parameters:
    - X_song_ids: List of song IDs in the playlist's 'X' set.
    - song_factors: Numpy array of song latent factors (num_songs x k).
    - song_factors_normalized: Normalized song factors for similarity computation.
    - song_id_to_index: Mapping from song IDs to indices.
    - index_to_song_id: Mapping from indices to song IDs.
    - top_k: Number of recommendations to generate.

    Returns:
    - recommended_songs: List of recommended song IDs, best match first.
      It is shorter than `top_k` when fewer candidate songs exist, and empty
      when none of `X_song_ids` is known or `top_k` is not positive.
    """
    # Map song IDs to indices, silently skipping IDs without a mapping
    X_song_indices = [song_id_to_index[song_id] for song_id in X_song_ids if song_id in song_id_to_index]

    if not X_song_indices:
        print("No valid songs in the playlist.")
        return []

    # Playlist representation: mean of the latent vectors of its songs
    X_song_factors = song_factors[X_song_indices]
    playlist_vector = np.mean(X_song_factors, axis=0).reshape(1, -1)
    playlist_vector_normalized = normalize(playlist_vector)

    # Cosine similarity between the playlist vector and every song
    similarity_scores = cosine_similarity(playlist_vector_normalized, song_factors_normalized).flatten()

    # Exclude songs already in the playlist
    similarity_scores[X_song_indices] = -np.inf

    # Never request more items than there are unmasked candidates: otherwise
    # argpartition raises for an out-of-range k, or masked (-inf) songs leak
    # into the result.
    num_candidates = similarity_scores.size - len(set(X_song_indices))
    num_to_return = min(top_k, num_candidates)
    if num_to_return <= 0:
        return []

    # Partial selection of the best scores, then sort just those, descending
    top_song_indices = np.argpartition(similarity_scores, -num_to_return)[-num_to_return:]
    top_song_indices = top_song_indices[np.argsort(-similarity_scores[top_song_indices])]

    # Map indices back to song IDs
    recommended_songs = [index_to_song_id[idx] for idx in top_song_indices]

    return recommended_songs


In [None]:
def compute_metrics_for_sets(df, song_factors, song_factors_normalized, song_id_to_index, index_to_song_id, set_name, top_k=25):
    """
    Evaluate the latent-factor recommender on every playlist of `df`.

    For each playlist, the 'X' songs are fed to `recommend_songs_for_playlist`
    and the recommendations are compared with the playlist's 'Y' songs.
    Playlists lacking either 'X' or 'Y' songs are skipped.

    Parameters:
    - df: DataFrame with 'playlist_id', 'song_id' and 'XY' columns.
    - song_factors, song_factors_normalized, song_id_to_index,
      index_to_song_id: Passed through to `recommend_songs_for_playlist`.
    - set_name: Label used in the printed summary.
    - top_k: Number of recommendations requested per playlist; also the
      denominator of precision.

    Returns:
    - (avg_recall, avg_precision): means over the evaluated playlists, or 0
      for each when no playlist was evaluated. The same numbers are printed.
      (This function previously returned None, so callers that stored its
      result kept nothing.)
    """
    recalls = []
    precisions = []

    # One pass over the frame: groupby hands out each playlist's rows without
    # rescanning the whole DataFrame per playlist. sort=False keeps the
    # first-appearance order of playlists.
    for _, playlist_df in tqdm(df.groupby('playlist_id', sort=False)):
        X_songs = playlist_df.loc[playlist_df['XY'] == 'X', 'song_id'].tolist()
        Y_songs = playlist_df.loc[playlist_df['XY'] == 'Y', 'song_id'].tolist()

        if not X_songs or not Y_songs:
            continue

        # Generate recommendations
        recommended_songs = recommend_songs_for_playlist(
            X_song_ids=X_songs,
            song_factors=song_factors,
            song_factors_normalized=song_factors_normalized,
            song_id_to_index=song_id_to_index,
            index_to_song_id=index_to_song_id,
            top_k=top_k
        )

        # Hits = held-out songs that were recommended
        num_Y_in_rec = len(set(Y_songs) & set(recommended_songs))
        recalls.append(num_Y_in_rec / len(Y_songs))
        # Precision is measured against the requested list length
        precisions.append(num_Y_in_rec / top_k)

    avg_recall = np.mean(recalls) if recalls else 0
    avg_precision = np.mean(precisions) if precisions else 0

    print(f"{set_name} Set - Average Recall: {avg_recall:.4f}, Average Precision: {avg_precision:.4f}")

    return avg_recall, avg_precision



In [None]:
# How many times the training-subset experiment below is repeated
NUM_REPETITIONS = 5

In [None]:
# Fractions of the filtered training rows used to fit the model
percentages = [0.01, 0.05, 0.10, 0.25, 0.50]

# One independent result list per repetition. (`[[]] * n` would repeat a
# reference to a single shared list, so every repetition would append to it.)
train_results = [[] for _ in range(NUM_REPETITIONS)]
validation_results = [[] for _ in range(NUM_REPETITIONS)]
test_results = [[] for _ in range(NUM_REPETITIONS)]

# Fit on a random subset of rows and evaluate on the three splits
for i in range(NUM_REPETITIONS):
    for percentage in percentages:
        subset_size = int(len(train_df_filtered) * percentage)
        # Seed varies with the repetition: a constant seed would draw the
        # identical subset every time and make the repetitions redundant.
        # Repetition 0 still uses seed 42.
        subset = train_df_filtered.sample(n=subset_size, random_state=42 + i)
        playlist_id_to_index, song_id_to_index, index_to_playlist_id, index_to_song_id = generate_mapping(subset)
        interaction_matrix = generate_interaction_matrix(subset, playlist_id_to_index, song_id_to_index)
        song_factors, song_factors_normalized = generate_song_vectors(interaction_matrix)
        # `percentage` is a fraction; format it as a percent (0.01 -> "1%")
        print(f"Results on {percentage:.0%} of the training data used")
        train_res = compute_metrics_for_sets(train_df, song_factors, song_factors_normalized, song_id_to_index, index_to_song_id, 'Train', top_k=5)
        val_res = compute_metrics_for_sets(val_df, song_factors, song_factors_normalized, song_id_to_index, index_to_song_id, 'Validation', top_k=5)
        test_res = compute_metrics_for_sets(test_df, song_factors, song_factors_normalized, song_id_to_index, index_to_song_id, 'Test', top_k=5)
        # Store whatever compute_metrics_for_sets returned for this subset size
        train_results[i].append(train_res)
        validation_results[i].append(val_res)
        test_results[i].append(test_res)



## Audio Features Based Recommendation System

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Take the columns at positions 5 through 19 of merged_df as the feature set
# (the slice end, 20, is exclusive).
# NOTE(review): assumes those positions hold the numeric audio features - adjust if the schema differs.
feature_columns = merged_df.columns[5:20]  # Python indexing is end-exclusive

# One row per song: keep the first occurrence of each song_id together with its features
song_features_df = merged_df[['song_id'] + list(feature_columns)].drop_duplicates("song_id").reset_index(drop=True)

# Replace missing feature values with 0
song_features_df[feature_columns] = song_features_df[feature_columns].fillna(0)

In [None]:
# Sanity check: the row count of song_features_df should equal the number of distinct song IDs
print(song_features_df.shape, len(merged_df['song_id'].unique()))

In [None]:

# Standardize the features with scikit-learn's StandardScaler, fitted on all songs
scaler = StandardScaler()
song_features_scaled = scaler.fit_transform(song_features_df[feature_columns])

# Wrap the scaled array in a DataFrame and re-attach the song IDs.
# Both frames carry a default 0..n-1 index, so the assignment aligns row by row.
song_features_scaled_df = pd.DataFrame(song_features_scaled, columns=feature_columns)
song_features_scaled_df['song_id'] = song_features_df['song_id']


In [None]:
# Map song IDs to indices
# Look up each song's integer index.
# NOTE: `song_id_to_index` is not defined in this cell; it is the notebook-level
# variable left behind by an earlier cell, so that cell must have been run first.
song_features_scaled_df['song_index'] = song_features_scaled_df['song_id'].map(song_id_to_index)

# Ensure the order matches the song indices
# NOTE(review): row i equals song index i only if every song received a mapping
# and the indices are exactly 0..n-1 - confirm.
song_features_scaled_df = song_features_scaled_df.sort_values('song_index').reset_index(drop=True)

# Extract the feature matrix (feature columns only, as a numpy array)
song_feature_matrix = song_features_scaled_df[feature_columns].values

# Normalize the feature matrix for cosine similarity
song_feature_matrix_normalized = normalize(song_feature_matrix)


In [None]:
def recommend_songs_with_features(
    X_song_ids,
    song_feature_matrix,
    song_feature_matrix_normalized,
    song_id_to_index,
    index_to_song_id,
    top_k=25
):
    """
    Recommend songs based on pre-existing song features.

    The playlist is represented by the mean feature vector of its known songs;
    every other song is ranked by cosine similarity to that vector.

    Parameters:
    - X_song_ids: List of song IDs in the playlist's 'X' set.
    - song_feature_matrix: Numpy array of song features.
    - song_feature_matrix_normalized: Normalized song features for similarity computation.
    - song_id_to_index: Mapping from song IDs to indices.
    - index_to_song_id: Mapping from indices to song IDs.
    - top_k: Number of recommendations to generate.

    Returns:
    - recommended_songs: List of recommended song IDs, best match first.
      It is shorter than `top_k` when fewer candidate songs exist, and empty
      when none of `X_song_ids` is known or `top_k` is not positive.
    """
    # Map song IDs to indices, silently skipping IDs without a mapping
    X_song_indices = [song_id_to_index[song_id] for song_id in X_song_ids if song_id in song_id_to_index]

    if not X_song_indices:
        print("No valid songs in the playlist.")
        return []

    # Playlist representation: mean of the feature vectors of its songs
    X_song_features = song_feature_matrix[X_song_indices]
    playlist_vector = np.mean(X_song_features, axis=0).reshape(1, -1)
    playlist_vector_normalized = normalize(playlist_vector)

    # Cosine similarity between the playlist vector and every song
    similarity_scores = cosine_similarity(playlist_vector_normalized, song_feature_matrix_normalized).flatten()

    # Exclude songs already in the playlist
    similarity_scores[X_song_indices] = -np.inf

    # Never request more items than there are unmasked candidates: otherwise
    # argpartition raises for an out-of-range k, or masked (-inf) songs leak
    # into the result.
    num_candidates = similarity_scores.size - len(set(X_song_indices))
    num_to_return = min(top_k, num_candidates)
    if num_to_return <= 0:
        return []

    # Partial selection of the best scores, then sort just those, descending
    top_song_indices = np.argpartition(similarity_scores, -num_to_return)[-num_to_return:]
    top_song_indices = top_song_indices[np.argsort(-similarity_scores[top_song_indices])]

    # Map indices back to song IDs
    recommended_songs = [index_to_song_id[idx] for idx in top_song_indices]

    return recommended_songs


In [None]:
def compute_metrics_with_features(df, song_feature_matrix, song_feature_matrix_normalized, song_id_to_index, index_to_song_id, set_name, top_k=25):
    """
    Evaluate the feature-based recommender on every playlist of `df`.

    For each playlist, the 'X' songs are fed to `recommend_songs_with_features`
    and the recommendations are compared with the playlist's 'Y' songs.
    Playlists lacking either 'X' or 'Y' songs are skipped.

    Parameters:
    - df: DataFrame with 'playlist_id', 'song_id' and 'XY' columns.
    - song_feature_matrix, song_feature_matrix_normalized, song_id_to_index,
      index_to_song_id: Passed through to `recommend_songs_with_features`.
    - set_name: Label used in the printed summary.
    - top_k: Number of recommendations requested per playlist; also the
      denominator of precision.

    Returns:
    - (avg_recall, avg_precision): means over the evaluated playlists, or 0
      for each when no playlist was evaluated. The same numbers are printed.
      (This function previously returned None.)
    """
    recalls = []
    precisions = []

    # One pass over the frame: groupby hands out each playlist's rows without
    # rescanning the whole DataFrame per playlist. sort=False keeps the
    # first-appearance order of playlists.
    for _, playlist_df in tqdm(df.groupby('playlist_id', sort=False)):
        X_songs = playlist_df.loc[playlist_df['XY'] == 'X', 'song_id'].tolist()
        Y_songs = playlist_df.loc[playlist_df['XY'] == 'Y', 'song_id'].tolist()

        if not X_songs or not Y_songs:
            continue

        # Generate recommendations
        recommended_songs = recommend_songs_with_features(
            X_song_ids=X_songs,
            song_feature_matrix=song_feature_matrix,
            song_feature_matrix_normalized=song_feature_matrix_normalized,
            song_id_to_index=song_id_to_index,
            index_to_song_id=index_to_song_id,
            top_k=top_k
        )

        # Hits = held-out songs that were recommended
        num_Y_in_rec = len(set(Y_songs) & set(recommended_songs))
        recalls.append(num_Y_in_rec / len(Y_songs))
        # Precision is measured against the requested list length
        precisions.append(num_Y_in_rec / top_k)

    avg_recall = np.mean(recalls) if recalls else 0
    avg_precision = np.mean(precisions) if precisions else 0

    print(f"{set_name} Set - Average Recall: {avg_recall:.4f}, Average Precision: {avg_precision:.4f}")

    return avg_recall, avg_precision


In [None]:

# Evaluate the feature-based recommender on the (unfiltered) train, validation
# and test frames, requesting 5 recommendations per playlist.
compute_metrics_with_features(train_df, song_feature_matrix, song_feature_matrix_normalized, song_id_to_index, index_to_song_id, 'Train', top_k=5)
compute_metrics_with_features(val_df, song_feature_matrix, song_feature_matrix_normalized, song_id_to_index, index_to_song_id, 'Validation', top_k=5)
compute_metrics_with_features(test_df, song_feature_matrix, song_feature_matrix_normalized, song_id_to_index, index_to_song_id, 'Test', top_k=5)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def _draw_train_test_bars(method_names, train_values, test_values, metric):
    """Show one figure with paired train/test bars of `metric` for each method."""
    positions = np.arange(len(method_names))  # one slot per method
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(8, 6))
    train_bars = ax.bar(positions - bar_width/2, train_values, bar_width, label=f'Train {metric}')
    test_bars = ax.bar(positions + bar_width/2, test_values, bar_width, label=f'Test {metric}')

    # Axis labels, ticks, legend and numeric labels on top of the bars
    ax.set_xlabel('Methods', fontsize=12)
    ax.set_ylabel(metric, fontsize=12)
    ax.set_title(f'{metric} Comparison', fontsize=14)
    ax.set_xticks(positions)
    ax.set_xticklabels(method_names, fontsize=10)
    ax.legend()
    for bars in (train_bars, test_bars):
        ax.bar_label(bars, fmt='%.4f', padding=3)
    plt.tight_layout()
    plt.show()


def plot_precision_recall(method1_name, method2_name,
                          train_precision_method1, test_precision_method1,
                          train_recall_method1, test_recall_method1,
                          train_precision_method2, test_precision_method2,
                          train_recall_method2, test_recall_method2):
    """
    Draw two bar charts comparing two methods: first precision, then recall.

    Each chart shows, per method, a train bar next to a test bar with the
    value printed above it to four decimals.

    Parameters:
    - method1_name, method2_name: Tick labels for the two methods.
    - train_precision_method1, test_precision_method1: Method 1 precision (train, test).
    - train_recall_method1, test_recall_method1: Method 1 recall (train, test).
    - train_precision_method2, test_precision_method2: Method 2 precision (train, test).
    - train_recall_method2, test_recall_method2: Method 2 recall (train, test).
    """
    method_names = [method1_name, method2_name]

    # Figure 1: precision
    _draw_train_test_bars(
        method_names,
        [train_precision_method1, train_precision_method2],
        [test_precision_method1, test_precision_method2],
        'Precision',
    )

    # Figure 2: recall
    _draw_train_test_bars(
        method_names,
        [train_recall_method1, train_recall_method2],
        [test_recall_method1, test_recall_method2],
        'Recall',
    )

# Example usage
# The metric values below are literals typed into the call, not read from the
# evaluation functions (presumably copied from an earlier run - verify before reuse).
plot_precision_recall(
    method1_name="Item Based Collaborative Filtering", method2_name="Song Audio Features based Reccomender",
    train_precision_method1=0.0298, test_precision_method1=0.0204,
    train_recall_method1=0.0257, test_recall_method1=0.0193,
    train_precision_method2=0.0026, test_precision_method2=0.003,
    train_recall_method2=0.0026, test_recall_method2=0.0029
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_recommender_performance(knn_precision, knn_recall, percentages, precision_values, recall_values):
    """
    Plots precision and recall values for a KNN-based recommender and item-based collaborative filtering models.

    Two side-by-side bar charts are drawn (precision on the left, recall on
    the right). The first bar of each chart is the KNN model (orange); the
    remaining bars are the item-based CF models (blue), one per entry of
    `percentages`.

    Parameters:
        knn_precision (float): Precision value of the KNN-based model.
        knn_recall (float): Recall value of the KNN-based model.
        percentages (sequence of float): Training data proportions as decimals (0.25 -> "25%").
        precision_values (sequence of float): Precision values for item-based CF models.
        recall_values (sequence of float): Recall values for item-based CF models.

    Raises:
        ValueError: If `percentages`, `precision_values` and `recall_values`
            do not all have the same length.
    """
    if not (len(percentages) == len(precision_values) == len(recall_values)):
        raise ValueError(
            "percentages, precision_values and recall_values must have the same length"
        )

    # Tick labels. The percent format rounds, whereas int(p * 100) truncated
    # floating-point error (e.g. 0.29 * 100 == 28.999... became "28%").
    labels = ['Audio Features (KNN)'] + [f'Item-based CF\n{p:.0%}' for p in percentages]

    # Unpacking accepts any sequence (list, tuple, numpy array); list `+`
    # would fail on tuples and add element-wise with arrays.
    all_precisions = [knn_precision, *precision_values]
    all_recalls = [knn_recall, *recall_values]

    x = np.arange(len(labels))
    bar_width = 0.5
    colors = ['orange'] + ['blue'] * len(percentages)

    fig, axs = plt.subplots(1, 2, figsize=(20, 10))

    def _draw_panel(ax, values, metric):
        # One bar chart with a grid and the value printed above each bar
        bars = ax.bar(x, values, color=colors, width=bar_width)
        ax.set_title(f'Model {metric}')
        ax.set_ylabel(metric)
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        # Keep the historical 0.025 ceiling, but grow it when a value would
        # otherwise be clipped.
        ax.set_ylim(0, max(0.025, 1.15 * max(values)))
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        for bar in bars:
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.0005,
                    f'{bar.get_height():.4f}', ha='center', va='bottom', fontsize=10)

    _draw_panel(axs[0], all_precisions, 'Precision')
    _draw_panel(axs[1], all_recalls, 'Recall')

    # Adjust layout and show plot
    plt.tight_layout()
    plt.show()

# Example usage:
# All numbers below are literals entered by hand, not values read from the
# evaluation functions.
# KNN model precision and recall
knn_precision = 0.003
knn_recall = 0.0029

# Percentages of training data used
# NOTE: this rebinds the notebook-level `percentages` list used by the subset
# experiment, adding a 1.0 (full training data) entry.
percentages = [0.01, 0.05, 0.10, 0.25, 0.50, 1.0]

# Precision and recall values for item-based CF models (one per entry of `percentages`)
precision_values = [0.0012, 0.0024, 0.0032,  0.0081, 0.0155, 0.0204]
recall_values = [0.0007, 0.0015, 0.0023,  0.0068, 0.0140, 0.0193]

# Generate the plot
plot_recommender_performance(knn_precision, knn_recall, percentages, precision_values, recall_values)