In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances, pairwise_distances
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from collections import Counter

class MusicRecommender:
    """Content-based song recommender built on scaled audio features.

    ``fit`` loads a track table from CSV, standardises the numerical feature
    columns and precomputes pairwise distance matrices, KMeans cluster labels
    and PCA / t-SNE embeddings.  ``recommend`` looks up the nearest neighbours
    of a seed song under each of those views, and ``get_top_recommendations``
    merges the per-method lists by vote count.

    Every song index handled by this class is *positional* (a 0-based row
    position in ``self.data``), because the same integer is used to index
    NumPy arrays and the DataFrame via ``.iloc``.
    """

    def __init__(self, random_state=42):
        """Create an unfitted recommender.

        Parameters
        ----------
        random_state : int or None, default 42
            Seed used for KMeans, PCA, t-SNE and for sampling songs from the
            seed song's cluster.  ``None`` makes the cluster sampling
            non-deterministic.
        """
        self.random_state = random_state
        # method name -> (n_songs, n_songs) distance matrix, filled by fit()
        self.similarity_scores = {}
        # cluster id per row of self.data, filled by fit()
        self.kmeans_labels = None
        # PCA / t-SNE embeddings of the scaled features, filled by fit()
        self.X_pca = None
        self.X_tsne = None
        # cleaned table with the feature columns standardised in place
        self.data = None
        # de-duplicated table exactly as read from disk (unscaled)
        self.raw_data = None
        # numerical columns used as the feature space everywhere in the class
        self.features = ['acousticness', 'danceability', 'energy',
                         'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity', 'duration_ms', 'release_year']

    def _require_fitted(self):
        """Raise ``RuntimeError`` if ``fit`` has not populated the model yet."""
        fitted_state = (self.data, self.kmeans_labels, self.X_pca, self.X_tsne)
        if any(part is None for part in fitted_state):
            raise RuntimeError("MusicRecommender is not fitted yet; call fit() first.")

    @staticmethod
    def _nearest(distances, song_index, n):
        """Return the positions of the ``n`` smallest distances, seed excluded.

        The seed is filtered out by position rather than by assuming it sorts
        first, so exact duplicates of the seed (distance 0) cannot push the
        seed itself into the result.  A stable sort keeps ties deterministic.
        """
        order = np.argsort(distances, kind='stable')
        return order[order != song_index][:n]

    def fit(self, data_path='top_tracks_data.csv'):
        """Load the track table and precompute everything ``recommend`` needs.

        Parameters
        ----------
        data_path : str, default 'top_tracks_data.csv'
            CSV file containing at least an ``id`` column, a ``track_name``
            column and every column listed in ``self.features``.

        Raises
        ------
        KeyError
            If one of the feature columns is missing from the file.
        """
        # Step 1: Data Preprocessing (the file is read once and shared)
        raw = pd.read_csv(data_path).drop_duplicates(subset=['id'])
        self.raw_data = raw

        # Drop the stray CSV index column and incomplete rows, then renumber so
        # that index labels coincide with row positions.
        data = (
            raw.drop(columns=['Unnamed: 0'], errors='ignore')
            .dropna()
            .reset_index(drop=True)
        )

        missing = [column for column in self.features if column not in data.columns]
        if missing:
            raise KeyError(f"Missing feature columns in {data_path!r}: {missing}")

        # Scale numerical features
        scaler = StandardScaler()
        data[self.features] = scaler.fit_transform(data[self.features])
        self.data = data

        # Step 2: Feature Selection (all numerical features are used)
        feature_matrix = data[self.features].to_numpy()

        # Step 3: Similarity Calculation
        # NOTE: pairwise_distances defaults to the Euclidean metric, so it
        # repeats 'euclidean_distances'; both keys are kept so the keys of the
        # dict returned by recommend() stay unchanged.
        similarity_functions = {
            'euclidean_distances': euclidean_distances,
            'manhattan_distances': manhattan_distances,
            'cosine_distances': cosine_distances,
            'pairwise_distances': pairwise_distances
        }
        self.similarity_scores = {
            similarity_name: similarity_func(feature_matrix)
            for similarity_name, similarity_func in similarity_functions.items()
        }

        # Fit KMeans
        self.kmeans_labels = KMeans(n_clusters=8, random_state=self.random_state).fit_predict(feature_matrix)

        # Apply PCA and t-SNE
        pca = PCA(n_components=7, random_state=self.random_state)
        self.X_pca = pca.fit_transform(feature_matrix)
        tsne = TSNE(n_components=3, random_state=self.random_state)
        self.X_tsne = tsne.fit_transform(feature_matrix)

    def get_song_index_by_name(self, song_name):
        """Return the row position of the first track called ``song_name``.

        The result is a position (usable with ``.iloc`` and NumPy arrays), not
        an index label, so it stays valid even when the DataFrame index is not
        a clean ``0..n-1`` range.

        Raises
        ------
        IndexError
            If no track with that name exists.
        """
        matches = np.flatnonzero((self.data['track_name'] == song_name).to_numpy())
        if matches.size == 0:
            raise IndexError(f"No track named {song_name!r} in the fitted data")
        return int(matches[0])

    def recommend(self, song_name, n=10, verbose=True):
        """Recommend up to ``n`` songs per method for the given seed song.

        Parameters
        ----------
        song_name : str
            Exact ``track_name`` of the seed song.
        n : int, default 10
            Maximum number of recommendations per method.
        verbose : bool, default True
            Print the seed song and each method's recommended track names.

        Returns
        -------
        dict
            Maps a method name (each key of ``similarity_scores`` plus
            ``"KMeans"``, ``"PCA"`` and ``"t-SNE"``) to an array of row
            positions in ``self.data``.  The seed song is never included.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        IndexError
            If ``song_name`` is not found.
        """
        self._require_fitted()
        song_index = self.get_song_index_by_name(song_name)

        recommendations = {}
        for similarity_name, similarity_scores in self.similarity_scores.items():
            recommendations[similarity_name] = self._nearest(similarity_scores[song_index], song_index, n)

        # KMeans recommendation: sample other members of the seed's cluster.
        cluster_label = self.kmeans_labels[song_index]
        cluster_indices = np.flatnonzero(self.kmeans_labels == cluster_label)
        cluster_indices = cluster_indices[cluster_indices != song_index]
        # Small clusters cannot supply n distinct songs; take what is there.
        sample_size = min(n, cluster_indices.size)
        if sample_size > 0:
            rng = np.random.default_rng(self.random_state)
            kmeans_recommendations = rng.choice(cluster_indices, size=sample_size, replace=False)
        else:
            kmeans_recommendations = np.empty(0, dtype=cluster_indices.dtype)
        recommendations["KMeans"] = kmeans_recommendations

        # PCA recommendation
        distances_pca = np.linalg.norm(self.X_pca - self.X_pca[song_index], axis=1)
        recommendations["PCA"] = self._nearest(distances_pca, song_index, n)

        # t-SNE recommendation
        distances_tsne = np.linalg.norm(self.X_tsne - self.X_tsne[song_index], axis=1)
        recommendations["t-SNE"] = self._nearest(distances_tsne, song_index, n)

        if verbose:
            print("Chosen Song", self.data.iloc[song_index]["track_name"])
            for similarity_name, recommended_song_indices in recommendations.items():
                print(f"\nUsing {similarity_name}:")
                recommended_songs = self.data.iloc[recommended_song_indices]
                print(recommended_songs['track_name'])

        return recommendations

    def get_top_recommendations(self, song_name, n=10, verbose=True):
        """Merge the per-method recommendations into one ranked list.

        Each method casts one vote for every song it recommends; songs are
        ranked by vote count (ties keep first-seen order) and the best ``n``
        are returned.

        Parameters
        ----------
        song_name : str
            Exact ``track_name`` of the seed song.
        n : int, default 10
            Number of candidates requested per method and maximum length of
            the merged result.
        verbose : bool, default True
            Print the per-method lists and the merged recommendations.

        Returns
        -------
        list of dict
            One ``{track_name: artist_name}`` dict per recommended song, best
            first.
        """
        per_method = self.recommend(song_name=song_name, n=n, verbose=verbose)

        votes = Counter()
        for indices in per_method.values():
            votes.update(int(index) for index in indices)

        top_indices = [index for index, _ in votes.most_common()][:n]

        top_recommendations = []
        if verbose:
            print("Chosen Song:", song_name)
        for index in top_indices:
            recommended_song = self.data.iloc[index]
            top_recommendations.append({recommended_song['track_name']: recommended_song['artist_name']})
            if verbose:
                print("Song:", recommended_song['track_name'])
                print("Artist:", recommended_song['artist_name'])
                print()  # Blank line between recommendations for readability

        return top_recommendations

    def generate_plots(self):
        """Render the exploratory and diagnostic figures for the fitted model.

        Shows, in order: release-year histogram (raw values), per-feature
        histograms, feature correlation heatmap, boxplots, KMeans clusters in
        PCA and t-SNE space, a KMeans elbow curve and the cumulative explained
        variance of a full PCA.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        self._require_fitted()

        # Timeline of Music by Release Year (unscaled values from the raw table)
        plt.figure(figsize=(12, 6))
        sns.histplot(data=self.raw_data, x='release_year', bins=30, kde=False)
        plt.title('Timeline of Music by Release Year')
        plt.xlabel('Release Year')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.show()

        # Per-feature distributions; the grid stays 4x4 for up to 16 features
        # and gains rows beyond that instead of failing.
        unscaled_features = {'key', 'mode', 'time_signature'}
        n_rows = max(4, -(-len(self.features) // 4))
        plt.figure(figsize=(16, 12))
        for i, feature in enumerate(self.features):
            plt.subplot(n_rows, 4, i + 1)
            sns.histplot(data=self.data[[feature]], x=feature, kde=True)
            if feature in unscaled_features:
                plt.title(f'{feature} Distribution')
            else:
                plt.title(f'{feature} Distribution (Scaled)')
        plt.tight_layout()
        plt.show()

        plt.figure(figsize=(12, 6))
        sns.heatmap(self.data[self.features].corr(), annot=True, cmap='coolwarm', fmt=".2f")
        plt.title('Correlation Matrix Heatmap')
        plt.show()

        plt.figure(figsize=(12, 6))
        self.data.boxplot(rot=45)
        plt.title('Boxplots of Numerical Features')
        plt.show()

        # Clustering Analysis
        plt.figure(figsize=(12, 6))
        plt.scatter(self.X_pca[:, 0], self.X_pca[:, 1], c=self.kmeans_labels, cmap='viridis', alpha=0.5)
        plt.title('PCA Clusters')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.colorbar(label='Cluster')
        plt.show()

        plt.figure(figsize=(12, 6))
        plt.scatter(self.X_tsne[:, 0], self.X_tsne[:, 1], c=self.kmeans_labels, cmap='viridis', alpha=0.5)
        plt.title('t-SNE Clusters')
        plt.xlabel('t-SNE Dimension 1')
        plt.ylabel('t-SNE Dimension 2')
        plt.colorbar(label='Cluster')
        plt.show()

        # Model Evaluation: elbow curve computed on the PCA embedding
        cluster_counts = range(1, 11)
        inertia = []
        for k in cluster_counts:
            kmeans = KMeans(n_clusters=k, random_state=self.random_state)
            kmeans.fit(self.X_pca)
            inertia.append(kmeans.inertia_)
        plt.figure(figsize=(8, 6))
        plt.plot(cluster_counts, inertia, marker='o')
        plt.title('Elbow Method for Optimal K')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Inertia')
        plt.xticks(cluster_counts)
        plt.grid(True)
        plt.show()

        pca = PCA().fit(self.data[self.features])
        plt.figure(figsize=(8, 6))
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance')
        plt.title('Explained Variance Ratio for PCA')
        plt.grid(True)
        plt.show()

# Example usage: build the recommender and fit it on the track dataset.
# After fitting, model.generate_plots() renders the exploratory figures.
model = MusicRecommender()
model.fit(data_path='top_tracks_data_new.csv')

In [7]:
# Title of the seed track for which we want recommendations
song_name = "Don't Rain On My Parade"

# Per-method recommendations (row positions keyed by method name)
print("Recommended Songs:")
recommendations = model.recommend(song_name)

# Merged ranking across all methods
print("\nTop Recommendation:")
top_recommendations = model.get_top_recommendations(song_name=song_name)

Recommended Songs:
Chosen Song Don't Rain On My Parade

Using euclidean_distances:
1406                       Mujeres Divinas
85      We Will Rock You - Remastered 2011
508                            Usted Abusó
1409                         La Diferencia
1002                           La Farsante
1517                           That's Life
1523    Come Fly With Me - Remastered 1998
666                  Abhi Na Jao Chhod Kar
86          Killer Queen - Remastered 2011
773                Changes - 2015 Remaster
Name: track_name, dtype: object

Using manhattan_distances:
508                            Usted Abusó
1518                Strangers In The Night
1406                       Mujeres Divinas
85      We Will Rock You - Remastered 2011
108                              Perdóname
1409                         La Diferencia
1517                           That's Life
1519     The World We Knew (Over And Over)
849        You Are The Sunshine Of My Life
851                        My Cherie Amo