In [5]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# Pre-Processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
from sklearn.neighbors import NearestNeighbors

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Recommender function
def make_recommendation(playlist_length, track_name, artists,
                        dataset_path="Resources/dataset.csv"):
    """Recommend the tracks most similar to a seed track.

    The dataset is loaded and cleaned, the audio features are standardised
    (numeric) or one-hot encoded (categorical), and a cosine-distance
    nearest-neighbour search is run around the seed track.

    Parameters
    ----------
    playlist_length : int
        Number of tracks to return. Must be a positive integer; values larger
        than the number of tracks in the dataset are capped to that number.
    track_name : str
        Exact track name of the seed track.
    artists : str
        Exact value of the ``artists`` column for the seed track.
    dataset_path : str, optional
        CSV file to read. Defaults to the path the notebook has always used.

    Returns
    -------
    list[dict] | dict
        On success, a list of records with the keys ``track_id``,
        ``track_name``, ``artists`` and ``distance``, ordered from most to
        least similar. The seed track is part of the fitted data, so it
        typically comes back first with a distance of 0.
        On failure, a dict of the form ``{"error": <message>}``.
    """
    # Reject unusable lengths here, in the function's own error style, instead
    # of letting scikit-learn raise from deep inside the neighbour search.
    if not isinstance(playlist_length, (int, np.integer)) or playlist_length < 1:
        return {"error": "playlist_length must be a positive integer"}

    # Step 1: Load and clean the dataset
    df = pd.read_csv(dataset_path)

    # Remove any rows with missing values
    df = df.dropna(how="any").reset_index(drop=True)

    # Drop columns that are not needed; errors="ignore" tolerates a CSV that
    # was exported without them (e.g. no saved index column).
    df = df.drop(columns=["Unnamed: 0", "album_name", "track_genre"], errors="ignore")

    # Remove duplicate tracks based on track_id
    df = df.drop_duplicates(subset=["track_id"]).reset_index(drop=True)

    # Step 2: Find the seed track before doing any expensive fitting.
    # If several rows match, the most popular one is used.
    matches = df.loc[(df.track_name == track_name) & (df.artists == artists)]
    if matches.empty:
        return {"error": "Track not found in the dataset"}
    track_id = matches.sort_values(by="popularity", ascending=False).track_id.values[0]

    # Step 3: Define the metadata and feature columns
    meta_cols = ['track_id', 'track_name', 'artists']
    numeric_features = ['popularity', 'danceability', 'energy', 'loudness',
                        'speechiness', 'acousticness', 'instrumentalness',
                        'liveness', 'valence', 'tempo']
    categorical_features = ['key', 'mode', 'time_signature']
    # NOTE: 'explicit' used to be listed as a feature but was never routed to a
    # transformer, so the ColumnTransformer dropped it. It is left out here so
    # the feature list matches what the model actually sees (results unchanged).
    feature_cols = numeric_features + categorical_features

    # Step 4: Build the preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
        ('scaler', StandardScaler())])  # Standardize features

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),  # Fill missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])  # Apply one-hot encoding

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Step 5: Fit the preprocessor and transform the full feature matrix
    X_preprocessed = preprocessor.fit_transform(df[feature_cols])

    # Step 6: Fit the Nearest Neighbors model.
    # kneighbors cannot return more neighbours than there are fitted rows.
    k = int(min(playlist_length, len(df)))
    model1 = NearestNeighbors(n_neighbors=k, metric="cosine")
    model1.fit(X_preprocessed)

    # Step 7: Preprocess the seed track's features and query its neighbours
    track_features = df.loc[df.track_id == track_id, feature_cols]
    track_features_preprocessed = preprocessor.transform(track_features)
    distances, indices = model1.kneighbors(track_features_preprocessed)

    # Step 8: Collect the metadata of the neighbours together with their
    # distances. assign() builds a new frame, so nothing is written into a
    # slice of df (the original assignment triggered a chained-assignment
    # warning that was only hidden by the global warnings filter).
    tracks = (
        df.iloc[indices[0]][meta_cols]
        .assign(distance=distances[0])
        .sort_values(by="distance")
    )

    # Step 9: Return the recommended tracks as a list of dictionaries
    return tracks.to_dict(orient="records")


In [7]:
# User input for testing
playlist_length = 5
track_name = "Schism"
artists = "TOOL"

response = make_recommendation(playlist_length, track_name, artists)

# A successful call returns a list of records; a failed one returns a single
# {"error": ...} dict. pd.DataFrame cannot build a frame from a dict of scalars
# (it raises ValueError), so wrap that case in a list to display it as one row.
pd.DataFrame(response if isinstance(response, list) else [response])

Unnamed: 0,track_id,track_name,artists,distance
0,55mJleti2WfWEFNFcBduhc,Schism,TOOL,0.0
1,4OmlsAT8r4q9vPFBvfYgyZ,Is It Really You?,Loathe,0.127763
2,6wBXSf1chpnyXGOSvANs0a,Thalli Pogathey,A.R. Rahman;Sid Sriram;Aaryan Dinesh Kanagaratnam,0.137344
3,4mA8VXKSpyfTZ66uuyQnw7,The Road I'm On,Ranji;Bingo Bango,0.192393
4,4R8BdwRidxAWaYyFNU00P1,War Of Hearts,Ruelle,0.19314
