In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw track table from disk; later cells refer to it as `df`.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Fraction of rows to keep. The similarity matrix built at the end of this cell
# has one row and one column per remaining track, so reduce this value if
# processing takes too long or the kernel dies.
subset_percentage = 1
dfs = df.sample(frac=subset_percentage, random_state=42)

# Drop columns that are not used for recommendation
columns_to_drop = ['track_id', 'album_name', 'duration_ms', 'explicit', 'key', 'mode', 'time_signature']
dfs = dfs.drop(columns=columns_to_drop)
dfs = dfs.loc[:, ~dfs.columns.str.match('^Unnamed')]
dfs = dfs.drop_duplicates(subset=['track_name', 'artists'], keep='first')

# Drop rows with NaN values in 'track_name'
dfs = dfs.dropna(subset=['track_name'])

# Keep only rows whose 'track_name' is pure ASCII. Note that this also removes
# titles written in accented Latin characters, not just non-Latin scripts.
dfs = dfs[dfs['track_name'].astype(str).map(str.isascii).astype(bool)]
dfs = dfs[dfs['popularity'] >= 10]

# Renumber the surviving rows 0..n-1. `sample` shuffles the rows and the filters
# above leave gaps, so without this the row labels of `dfs` would not agree
# with the row positions of `cosine_sim`.
dfs = dfs.reset_index(drop=True)

# Everything that is left besides the descriptive text columns is a feature.
features = dfs.drop(['track_name', 'artists', 'track_genre'], axis=1)

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# cosine_sim[i, j] is the similarity between rows i and j of `dfs`.
cosine_sim = cosine_similarity(features_scaled)

In [None]:
# Function to get top N similar songs
def get_similar_songs(song_index, similarity_matrix, n=5):
    """Return the row positions of the songs most similar to one song.

    Parameters
    ----------
    song_index : int
        Row position of the query song in ``similarity_matrix``.
    similarity_matrix : array-like
        Indexable by row; row ``i`` holds the similarity of song ``i`` to
        every song.
    n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list of int
        Row positions ordered from most to least similar. Equal scores keep
        ascending row order. The query song itself is never included.
    """
    scores = np.asarray(similarity_matrix[song_index], dtype=float)

    # Accept negative positions the same way the row lookup above does.
    if song_index < 0:
        song_index += scores.shape[0]

    # A stable sort of the negated scores gives descending similarity while
    # leaving tied rows in ascending position.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query by position rather than by rank: a duplicate track can
    # tie with it at the top score, so the query is not guaranteed to be first.
    ranked = ranked[ranked != song_index]

    return ranked[:max(n, 0)].tolist()

In [None]:
# Function to recommend songs based on input song name
def recommend_songs(input_song_name, df, similarity_matrix, artist, n=5):
    """Recommend the ``n`` tracks most similar to a given track.

    The query track is the first row of ``df`` whose 'track_name' contains
    ``input_song_name`` and whose 'artists' contains ``artist``. Both checks
    are case-insensitive, literal (not regex) substring matches, and rows with
    missing values never match.

    Parameters
    ----------
    input_song_name : str
        Text to look for in the 'track_name' column.
    df : pandas.DataFrame
        Frame with 'track_name' and 'artists' columns. Row ``i`` (by position)
        must describe the same track as row ``i`` of ``similarity_matrix``.
    similarity_matrix : array-like
        Square track-to-track similarity matrix.
    artist : str
        Text to look for in the 'artists' column.
    n : int, default 5
        Number of recommendations.

    Returns
    -------
    str
        Plain-text table of the recommended 'track_name' / 'artists' pairs.

    Raises
    ------
    IndexError
        If no row matches both the song name and the artist.
    """
    import warnings  # local import: only needed for the alignment check below

    if len(df) != len(similarity_matrix):
        warnings.warn(
            "df has %d rows but similarity_matrix has %d; pass the frame the "
            "matrix was computed from, otherwise the results are meaningless"
            % (len(df), len(similarity_matrix)),
            stacklevel=2,
        )

    title_hit = df['track_name'].str.contains(input_song_name, case=False, regex=False, na=False)
    artist_hit = df['artists'].str.contains(artist, case=False, regex=False, na=False)

    # Work with row positions, not index labels: the similarity matrix knows
    # nothing about the frame's labels.
    hit_positions = np.flatnonzero((title_hit & artist_hit).to_numpy(dtype=bool))
    if hit_positions.size == 0:
        raise IndexError(f"no track matching {input_song_name!r} by {artist!r}")
    input_song_index = int(hit_positions[0])

    similar_songs_indices = get_similar_songs(input_song_index, similarity_matrix, n)
    # Read the result from the frame that was passed in, so the lookup and the
    # output always refer to the same table.
    output = df.iloc[similar_songs_indices][['track_name', 'artists']].to_string(index=False)
    return output

In [None]:
# Function to return full artist list for output
def artist_search(a, song_name=None, data=None):
    """Build the "<track> by <artists>: " label for the first matching track.

    A row matches when its 'track_name' contains ``song_name`` and its
    'artists' contains ``a``. Both checks are case-insensitive, literal (not
    regex) substring matches, and rows with missing values never match.

    Parameters
    ----------
    a : str
        Text to look for in the 'artists' column.
    song_name : str, optional
        Text to look for in the 'track_name' column. When omitted, the
        notebook-level variable ``input_song_name`` is used.
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    str
        ``"<track_name> by <artists>: "`` for the first matching row.

    Raises
    ------
    IndexError
        If no row matches both the song name and the artist.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if song_name is None:
        song_name = input_song_name
    if data is None:
        data = df

    title_hit = data['track_name'].str.contains(song_name, case=False, regex=False, na=False)
    artist_hit = data['artists'].str.contains(a, case=False, regex=False, na=False)
    matches = data.loc[title_hit & artist_hit, ['track_name', 'artists']]
    if matches.empty:
        raise IndexError(f"no track matching {song_name!r} by {a!r}")

    first = matches.iloc[0]
    return f"{first['track_name']} by {first['artists']}: "

In [None]:
# Example usage
input_song_name = "Anybody"
artist = "Burna Boy"
# `cosine_sim` was computed from `dfs`, so its rows follow the row order of
# `dfs`, not of the raw `df`. Pass `dfs` renumbered 0..n-1 so that the row found
# by the lookup is the same row the similarity matrix describes.
recommendations = recommend_songs(input_song_name, dfs.reset_index(drop=True), cosine_sim, artist)
artist_return = artist_search(artist)
print(f"Top 5 songs similar to {artist_return}")
print("\n\n")
print(recommendations)