# Recommender.

In [31]:
import pandas as pd
import numpy as np

import string
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

## Read in the data with the clusters.

In [27]:
# Location of the cleaned song table; the notebook filters it on its 'km cluster' column.
CLEAN_DATA_PATH = '../Data/clean/final.csv'
spot = pd.read_csv(CLEAN_DATA_PATH)

In [28]:
# Preview the first five rows to sanity-check the loaded columns.
spot.head(n=5)

Unnamed: 0,song title,artist,top genre,year,added,bpm,energy,dance,dB,live,valence,dur,acous,spch,popularity,km cluster
0,Chop Suey!,System Of A Down,alternative metal,2001,1969‑12‑31,127,93,42,-4,13,29,210,0,12,86,2
1,Just Pretend,Bad Omens,american metalcore,2022,1969‑12‑31,140,83,46,-5,10,20,205,0,8,81,2
2,Monster,Skillet,alternative metal,2009,1969‑12‑31,135,96,64,-2,8,69,178,4,7,80,2
3,Popular Monster,Falling In Reverse,alternative metal,2019,1969‑12‑31,165,90,46,-4,10,36,221,1,9,78,2
4,Watch The World Burn,Falling In Reverse,alternative metal,2023,1969‑12‑31,162,78,40,-6,21,35,204,1,13,78,2


## Make the clusters into separate sections

In [22]:
# The recommender was built with help from ChatGPT and several articles.
# No code from the article below was used, but it gave a lot of pointers on how to proceed.
# https://asdkazmi.medium.com/ai-movies-recommendation-system-with-clustering-based-k-means-algorithm-f04467e02fcd
#
# Split the songs by their 'km cluster' label. `.copy()` makes each cluster an
# independent DataFrame, so adding columns to it later is not an assignment on
# a filtered slice of `spot` (which pandas reports as SettingWithCopyWarning).
cluster_0_songs = spot[spot['km cluster'] == 0].copy()
cluster_1_songs = spot[spot['km cluster'] == 1].copy()
cluster_2_songs = spot[spot['km cluster'] == 2].copy()

## Tokenize

In [29]:
# Text normalisation helper applied to song titles.
def preprocess_text(text):
    """Lowercase ``text``, drop ASCII punctuation and split it into tokens.

    Parameters
    ----------
    text : str
        Raw text, e.g. a song title.

    Returns
    -------
    The result of ``word_tokenize`` on the cleaned text.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    return word_tokenize(cleaned)

# Tokenize the song titles of each cluster.
# `assign` returns a new DataFrame carrying the extra 'song_tokens' column
# instead of writing into a filtered slice in place, so no chained-assignment
# warning is triggered and the cell can be re-run safely.
cluster_0_songs = cluster_0_songs.assign(
    song_tokens=cluster_0_songs['song title'].apply(preprocess_text)
)
cluster_1_songs = cluster_1_songs.assign(
    song_tokens=cluster_1_songs['song title'].apply(preprocess_text)
)
cluster_2_songs = cluster_2_songs.assign(
    song_tokens=cluster_2_songs['song title'].apply(preprocess_text)
)


## TF-IDF to look at term frequency in titles.

In [32]:
# TfidfVectorizer expects one string per document, so glue each title's
# tokens back together with single spaces first.
title_docs_0 = cluster_0_songs['song_tokens'].map(' '.join)
title_docs_1 = cluster_1_songs['song_tokens'].map(' '.join)
title_docs_2 = cluster_2_songs['song_tokens'].map(' '.join)

# One vectorizer per cluster, each fitted only on that cluster's titles.
tfidf_vectorizer_0 = TfidfVectorizer()
tfidf_matrix_0 = tfidf_vectorizer_0.fit_transform(title_docs_0)

tfidf_vectorizer_1 = TfidfVectorizer()
tfidf_matrix_1 = tfidf_vectorizer_1.fit_transform(title_docs_1)

tfidf_vectorizer_2 = TfidfVectorizer()
tfidf_matrix_2 = tfidf_vectorizer_2.fit_transform(title_docs_2)


## Calculate Cosine Similarity between the songs in each cluster.

In [25]:
# Pairwise title similarity inside each cluster: entry [i, j] compares the
# i-th and j-th rows of that cluster's TF-IDF matrix.
cosine_sim_matrix_0, cosine_sim_matrix_1, cosine_sim_matrix_2 = (
    cosine_similarity(tfidf_matrix)
    for tfidf_matrix in (tfidf_matrix_0, tfidf_matrix_1, tfidf_matrix_2)
)

## Recommender

In [26]:
def recommend_songs(song_title, cluster, cosine_sim_matrix, n=10):
    """Return up to ``n`` songs from ``cluster`` most similar to ``song_title``.

    Parameters
    ----------
    song_title : str
        Exact title to look up in ``cluster['song title']``. If several rows
        share the title, the first one is used as the query.
    cluster : pandas.DataFrame
        Songs of a single cluster; must provide the 'song title', 'artist'
        and 'energy' columns.
    cosine_sim_matrix : 2-D array-like
        Pairwise similarities in which row/column ``i`` corresponds to the
        ``i``-th row of ``cluster`` *by position*.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The 'song title', 'artist' and 'energy' columns of the most similar
        songs, best match first, or the message
        "Song not found in this cluster." when the title is absent.
    """
    title_matches = (cluster['song title'] == song_title).to_numpy()
    if not title_matches.any():
        return "Song not found in this cluster."

    # The similarity matrix is indexed by row position within the cluster,
    # not by the DataFrame's index labels (which come from the full dataset),
    # so locate the query by position. argmax gives the first True entry.
    song_position = int(title_matches.argmax())

    # Drop the query row explicitly rather than assuming it sorts first:
    # another title with an identical TF-IDF vector also scores 1.0.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[song_position])
        if position != song_position
    ]
    # sorted() is stable, so equally scored songs keep their cluster order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [position for position, _ in candidates[:max(n, 0)]]
    return cluster.iloc[top_positions][['song title', 'artist', 'energy']]

# Title to look up; it has to match the 'song title' text exactly.
input_song_title = 'Monster'

# Each cluster frame paired with the similarity matrix computed from it.
cluster_lookup = (
    (cluster_0_songs, cosine_sim_matrix_0),
    (cluster_1_songs, cosine_sim_matrix_1),
    (cluster_2_songs, cosine_sim_matrix_2),
)

# Use the first cluster that contains the title; keep the fallback message otherwise.
recommended_songs = "Song not found in any cluster."
for candidate_cluster, candidate_sim_matrix in cluster_lookup:
    if input_song_title in candidate_cluster['song title'].values:
        recommended_songs = recommend_songs(
            input_song_title, candidate_cluster, candidate_sim_matrix
        )
        break

# Show the recommendations (title, artist and energy level).
print("Top 10 recommended songs:")
print(recommended_songs)


Top 10 recommended songs:
                      song title              artist  energy
1276               I'm A Monster               Lucha      91
3                Popular Monster  Falling In Reverse      90
0                     Chop Suey!    System Of A Down      93
1                   Just Pretend           Bad Omens      83
4           Watch The World Burn  Falling In Reverse      78
5                         Bodies       Drowning Pool      93
6     THE DEATH OF PEACE OF MIND           Bad Omens      71
7              Voices In My Head  Falling In Reverse      90
8           Animal I Have Become    Three Days Grace      85
9                  The Summoning         Sleep Token      70
