<a href="https://colab.research.google.com/github/ryxen2/CSE_303/blob/main/Spotify_Music_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install spotipy

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install python-dotenv

# **Run Code**

In [None]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive at /content/drive so the .env file referenced in the
# following cell can be read. This import is specific to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path (on the mounted Drive) of the .env file that is passed to load_dotenv.
file_path = '/content/drive/My Drive/Colab Notebooks/secret.env'

In [None]:
load_dotenv(file_path)

# Confirm the credentials were loaded WITHOUT echoing their values: anything
# printed here is stored in the notebook's cell output and would leak the
# client secret as soon as the notebook is shared or committed.
for var_name in ("SPOTIFY_CLIENT_ID", "SPOTIFY_CLIENT_SECRET"):
    status = "set" if os.getenv(var_name) else "MISSING"
    print(f"{var_name}: {status}")

In [None]:
# Download the "vatsalmavani/spotify-dataset" Kaggle dataset; `path` holds the
# value returned by dataset_download and is used as the prefix of every
# read_csv call below.
import kagglehub

path = kagglehub.dataset_download("vatsalmavani/spotify-dataset")

print("Path to dataset files:", path)

In [None]:
# Track-level table; every later cell that references `data` uses this frame.
data = pd.read_csv(path + "/data/data.csv")

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Number of missing values in each column of the track table.
data.isnull().sum()

In [None]:
genre_data = pd.read_csv(path + "/data/data_by_genres.csv")
# info() prints directly and returns None; print(info()) added a stray "None".
genre_data.info()

In [None]:
# Number of missing values in each column of the per-genre table.
genre_data.isnull().sum()

In [None]:
year_data = pd.read_csv(path + '/data/data_by_year.csv')
# Preview the first rows with the notebook's rich display instead of dumping
# the whole frame as plain text.
year_data.head()

In [None]:
# Number of missing values in each column of the per-year table.
year_data.isnull().sum()

In [None]:
features = ['danceability', 'energy', 'tempo']

# One 30-bin histogram with a KDE overlay per feature, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(12, 6))
for ax, feature in zip(axes, features):
    sns.histplot(data[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'{feature.capitalize()} Distribution')

plt.show()


In [None]:
# Numeric columns of `data` used for the correlation-with-popularity heatmap.
feature_names = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'duration_ms',
    'explicit',
    'key',
    'mode',
    'year',
]

In [None]:
# The 100 rows of `data` with the highest popularity.
Trending_songs = data.sort_values(by='popularity', ascending=False).iloc[:100]

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(Trending_songs['year'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title('Year Distribution of Top 100 Popular Tracks', fontsize=16)
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Energy of the top-100 tracks, one box per release year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=Trending_songs, x='year', y='energy', color='lightgreen', ax=ax)

ax.set_title('Energy Distribution of Top 100 Popular Tracks by Year', fontsize=16)
ax.set_xlabel('Year')
ax.set_ylabel('Energy')

plt.xticks(rotation=45)
plt.show()

In [None]:
# Floor each release year to the start of its decade (e.g. 1987 -> 1980).
data['decade'] = data['year'].floordiv(10).mul(10)

In [None]:
# Top 10 songs with artists and year
top_10_songs = Trending_songs.iloc[:10]
top_10_columns = ['name', 'artists', 'year']
print(top_10_songs[top_10_columns].to_string())

In [None]:
# For every decade keep the 5 rows with the highest popularity, then flatten
# the grouped result into a single frame with a fresh 0..n-1 index.
# Requires the 'decade' column added to `data` in an earlier cell.
# NOTE(review): this relies on groupby.apply passing the grouping column
# ('decade') through to the lambda's frame; recent pandas releases deprecate
# that behaviour - confirm against the pandas version in use.
top_5_songs_each_decade = data.groupby('decade').apply(lambda x: x.nlargest(5, 'popularity')).reset_index(drop=True)
print(top_5_songs_each_decade[['name', 'artists', 'decade']].to_string())


In [None]:
# The 25 rows of the per-genre table with the highest popularity.
trending_genres = genre_data.sort_values(by='popularity', ascending=False).iloc[:25]

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=trending_genres, x='genres', y='popularity', color='blue', ax=ax)
ax.set_xlabel('Genres')
ax.set_ylabel('Popularity')
ax.set_title('Top 25 Genres by Popularity')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlations between the selected features and popularity.
correlation_matrix = data[feature_names + ['popularity']].corr()

# Keep only the popularity column, ordered from most positive to most negative.
popularity_corr = correlation_matrix[['popularity']].sort_values(by='popularity', ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(popularity_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation with Popularity")
plt.show()

In [None]:
# Number of tracks per decade (uses the 'decade' column added to `data` above).
ax = sns.countplot(x='decade', data=data)
ax.set_title("Count of Songs per Decade")
ax.set_xlabel("Decade")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Standardize the three clustering inputs over the whole dataset; X_scaled is
# what the elbow-method loop in the next cell fits on.
# Note: this rebinds `features`, which earlier held the histogram feature list.
features = ['year', 'energy', 'loudness']
X = data[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method: fit KMeans for k = 1..10 on X_scaled and record each model's
# inertia_. random_state is fixed so repeated runs use the same seed.
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

In [None]:
# Inertia against k; the x-range matches the k values fitted in the loop above.
plt.plot(range(1, 11), inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
# Render the figure explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [None]:
# Hold out 20% of the rows as a test split; random_state fixes the shuffle seed.
train_data_kmeans,test_data_kmeans=train_test_split(data,test_size=0.2,random_state=42)

print("Training Data Size:", len(train_data_kmeans))
print("Testing Data Size:", len(test_data_kmeans))

In [None]:
clusterable_features = ['year', 'energy', 'loudness']

scaler = StandardScaler()

# Fit the scaler on the training split only and reuse those statistics for the
# test split. Re-fitting on the test rows (fit_transform) would scale them with
# a different mean/variance than the data the clusters were learned on, so
# kmeans.predict would assign clusters in a shifted feature space.
X_train_scaled = scaler.fit_transform(train_data_kmeans[clusterable_features])
X_test_scaled = scaler.transform(test_data_kmeans[clusterable_features])

# k=3 clusters; fit on the training rows, then label the held-out rows.
kmeans = KMeans(n_clusters=3, random_state=42)
train_data_kmeans['song_clusters'] = kmeans.fit_predict(X_train_scaled)
test_data_kmeans['song_clusters'] = kmeans.predict(X_test_scaled)


print("Train Data Head:")
print(train_data_kmeans.head())
print("\nTest Data Head:")
print(test_data_kmeans.head())

In [None]:
# Columns 0 and 1 of X_train_scaled are the standardized 'year' and 'energy'
# (the first two entries of clusterable_features).
sns.scatterplot(x=X_train_scaled[:, 0], y=X_train_scaled[:, 1], hue=train_data_kmeans['song_clusters'], palette='rainbow')

# Label the axes so the figure can be read on its own.
plt.xlabel("Year (standardized)")
plt.ylabel("Energy (standardized)")
plt.title("Song Clustering")
plt.show()

In [None]:
# Reduce the scaled features to 2 components. PCA is fitted on the training
# rows; the test rows are only projected with the fitted components.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# Attach the two principal components to each split as PCA1 / PCA2 columns.
for frame, components in ((train_data_kmeans, X_train_pca), (test_data_kmeans, X_test_pca)):
    frame['PCA1'] = components[:, 0]
    frame['PCA2'] = components[:, 1]

# Training rows in PCA space, coloured by their KMeans cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=train_data_kmeans,
    x='PCA1',
    y='PCA2',
    hue='song_clusters',
    palette='rainbow',
    ax=ax,
)

ax.set_title("KMeans Clusters (Reduced to 2D with PCA)")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
plt.show()



# **API**

In [None]:
import requests
import base64

# Spotify app credentials read from the process environment. os.environ[...]
# raises KeyError when a variable is absent, so the cell that calls
# load_dotenv must have run successfully first.
CLIENT_ID = os.environ["SPOTIFY_CLIENT_ID"]
CLIENT_SECRET = os.environ["SPOTIFY_CLIENT_SECRET"]

def get_token():
    """Request an access token using the client-credentials grant.

    Returns:
        The 'access_token' value from the JSON response, or None when the
        token endpoint does not answer with HTTP 200.

    Raises:
        requests.RequestException: on network failure, or when no response
            arrives within the timeout.
    """
    url = 'https://accounts.spotify.com/api/token'
    payload = {'grant_type': 'client_credentials'}
    response = requests.post(
        url,
        data=payload,
        # requests builds the HTTP Basic Authorization header from this pair,
        # replacing the hand-rolled base64 encoding.
        auth=(CLIENT_ID, CLIENT_SECRET),
        # Without a timeout requests waits indefinitely on a stalled
        # connection, which would hang the notebook cell.
        timeout=10,
    )
    if response.status_code != 200:
        return None
    return response.json().get('access_token')

def search_song(query, type, limit=1):
    """Query the Spotify search endpoint.

    Args:
        query: Search text, sent as the 'q' parameter.
        type: Item type to search for (e.g. 'track'), sent as 'type'. The name
            shadows the builtin but is kept so keyword callers keep working.
        limit: Maximum number of results to request (default 1).

    Returns:
        The decoded JSON response, or None when no token could be obtained or
        the search endpoint does not answer with HTTP 200.

    Raises:
        requests.RequestException: on network failure, or when no response
            arrives within the timeout.
    """
    token = get_token()
    if not token:
        return None
    url = 'https://api.spotify.com/v1/search'
    headers = {'Authorization': f'Bearer {token}'}
    params = {'q': query, 'type': type, 'limit': limit}
    # A timeout keeps a stalled connection from hanging the notebook cell.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    if response.status_code != 200:
        return None
    return response.json()

def search_playlist(query, type, limit=10):
    """Query the Spotify search endpoint, requesting up to 10 results by default.

    This issues exactly the same request as search_song; only the default
    `limit` differs, so it delegates instead of duplicating the HTTP code.

    Args:
        query: Search text.
        type: Item type to search for (e.g. 'track').
        limit: Maximum number of results to request (default 10).

    Returns:
        The decoded JSON response, or None when no token could be obtained or
        the search endpoint does not answer with HTTP 200.
    """
    return search_song(query, type, limit=limit)


def format_data1(data):
    """Flatten a track-search response into a detailed, one-row-per-track DataFrame.

    Reads data['tracks']['items'] and extracts 20 fields per track. Fields that
    are missing fall back to 'Unknown' (or an empty dict / empty string for the
    dict- and list-valued ones). Keys that are present but null, an empty
    'artists' list, and null entries in 'items' are tolerated instead of
    raising.

    Args:
        data: Decoded JSON dict as returned by the search helpers.

    Returns:
        pandas.DataFrame with one row per track; empty when there are no items.
    """
    # `or` fallbacks also cover keys that exist but hold None.
    raw_items = (data.get('tracks') or {}).get('items') or []
    items = [item for item in raw_items if item]
    num_items = len(items)  # Get the number of items in the list
    print(f"Number of items: {num_items}")  # Print the number of items for debugging
    if items:
        print("Keys in item:", items[0].keys())

    result = []
    for item in items:
        album_info = item.get('album') or {}
        # An empty artists list previously raised IndexError on [0].
        lead_artist = (item.get('artists') or [{}])[0] or {}
        external_urls = item.get('external_urls') or {}
        is_playable = item.get('is_playable')

        track_data = {
            'Track': item.get('name', 'Unknown'),
            'Artist': lead_artist.get('name', 'Unknown'),
            'Album': album_info.get('name', 'Unknown'),
            'Album URL': (album_info.get('external_urls') or {}).get('spotify', 'Unknown'),
            'Release': album_info.get('release_date', 'Unknown'),
            'Track URL': external_urls.get('spotify', 'Unknown'),
            'Popularity': item.get('popularity', 'Unknown'),
            'Disc Number': item.get('disc_number', 'Unknown'),
            'Duration (ms)': item.get('duration_ms', 'Unknown'),
            # Booleans become 0/1; a null value counts as False.
            'Explicit': int(bool(item.get('explicit'))),
            'External IDs': item.get('external_ids') or {},
            'External URLs': external_urls,
            'Spotify Href': item.get('href', 'Unknown'),
            'Track ID': item.get('id', 'Unknown'),
            'Is Local': int(bool(item.get('is_local'))),
            # Absent/null playability stays distinguishable from False.
            'Is Playable': int(is_playable) if is_playable is not None else 'Unknown',
            'Preview URL': item.get('preview_url', 'Unknown'),
            'Track Number': item.get('track_number', 'Unknown'),
            'URI': item.get('uri', 'Unknown'),
            # List of market codes flattened to a comma-separated string.
            'Available Markets': ", ".join(item.get('available_markets') or []),
        }
        result.append(track_data)

    return pd.DataFrame(result)


def format_data2(data):
    """Flatten a track-search response into a compact, one-row-per-track DataFrame.

    Reads data['tracks']['items'] and keeps seven fields per track: Track,
    Artist, Album, Release, Popularity, Duration (ms) and Explicit. Missing
    fields fall back to 'Unknown'. Keys that are present but null, an empty
    'artists' list, and null entries in 'items' are tolerated instead of
    raising.

    Args:
        data: Decoded JSON dict as returned by the search helpers.

    Returns:
        pandas.DataFrame with one row per track; empty when there are no items.
    """
    # `or` fallbacks also cover keys that exist but hold None.
    items = (data.get('tracks') or {}).get('items') or []

    result = []
    for item in items:
        if not item:
            continue
        album_info = item.get('album') or {}
        # An empty artists list previously raised IndexError on [0].
        lead_artist = (item.get('artists') or [{}])[0] or {}

        result.append({
            'Track': item.get('name', 'Unknown'),
            'Artist': lead_artist.get('name', 'Unknown'),
            'Album': album_info.get('name', 'Unknown'),
            'Release': album_info.get('release_date', 'Unknown'),
            'Popularity': item.get('popularity', 'Unknown'),
            'Duration (ms)': item.get('duration_ms', 'Unknown'),
            # Boolean becomes 0/1; a null value counts as False.
            'Explicit': int(bool(item.get('explicit'))),
        })

    return pd.DataFrame(result)


In [None]:
# Demo: look up one track (search_song's default limit is 1) and show every
# field extracted by format_data1.
query = 'Shape of You'
search_results = search_song(query, 'track')
if search_results:
    formatted_data = format_data1(search_results)
    print(formatted_data)
else:
    print("No results found.")

In [None]:
# Demo: same query via search_playlist (default limit 10), shown in the compact
# format_data2 layout. Rebinds `query`, `search_results` and `formatted_data`
# from the previous cell.
query = 'Shape of You'
search_results = search_playlist(query, 'track')
if search_results:
    formatted_data = format_data2(search_results)
    print(formatted_data)
else:
    print("No results found.")

In [None]:
# Function to get content-based recommendations
def content_based_recommendations(df, song_index, top_n=5):
    """Rank the other rows of `df` by TF-IDF text similarity to one song.

    Each row is turned into a text string made of Track, Artist,
    Duration (ms) and Popularity; rows are compared with cosine similarity
    over their TF-IDF vectors.

    Args:
        df: DataFrame with 'Track', 'Artist', 'Duration (ms)' and 'Popularity'
            columns. It is not modified.
        song_index: Row position of the input song (negative values count from
            the end).
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with up to `top_n` rows, most similar first, including the
        'Feature_engineering_column' text used for the comparison.

    Raises:
        IndexError: if `song_index` is outside the rows of `df`.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    df = df.copy()
    # Normalise negative positions and fail early on out-of-range ones.
    song_index = range(len(df))[song_index]

    df['Feature_engineering_column'] = df['Track'] + ' ' + df['Artist'] + ' ' + df['Duration (ms)'].map(str) + ' ' + df['Popularity'].map(str)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['Feature_engineering_column'])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    scores = cosine_sim[song_index]

    # Exclude the input song by position rather than by dropping the top-ranked
    # entry: with tied scores (e.g. duplicate rows) the input song is not
    # guaranteed to sort first, and would then be recommended to itself.
    candidates = [i for i in range(len(df)) if i != song_index]
    candidates.sort(key=lambda i: scores[i], reverse=True)
    return df.iloc[candidates[:top_n]]




In [None]:
def recommend_songs(query, top_n=5):
    """Search Spotify for `query` and print content-based recommendations.

    The first search result is treated as the input song; the remaining
    results are ranked against it by content_based_recommendations.

    Args:
        query: Song name to search for.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. All output is printed.
    """
    # Fetch data from Spotify
    print(f"Fetching data for query: {query}")
    search_results = search_playlist(query, 'track')
    if not search_results:
        print("No results found.")
        return

    # Format data into a DataFrame
    df = format_data2(search_results)
    # A successful response can still contain zero tracks; ranking an empty
    # frame would fail inside the TF-IDF step, so stop here instead.
    if df.empty:
        print("No results found.")
        return
    print("Fetched Data:")
    print(df)

    # Get content-based recommendations
    song_index = 0  # Assuming the first result is the input song
    content_recs = content_based_recommendations(df, song_index, top_n)
    print("\nContent-Based Recommendations:")
    print(content_recs)


# Example usage
# input() waits for the user to type a song name, so this cell pauses an
# unattended "Run All" until a value is entered.
query = input("Enter your preferred song name: ")
recommend_songs(query)

In [None]:
def create_utility_matrix(df):
    """Pivot track data into a Track x User matrix of Popularity values.

    Args:
        df: DataFrame with 'Track', 'User' and 'Popularity' columns.

    Returns:
        DataFrame indexed by Track with one column per User. Repeated
        (Track, User) pairs are aggregated by pivot_table's default (mean);
        combinations with no data are filled with 0.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    required = ('Track', 'User', 'Popularity')
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Same exception type pandas would raise, but naming what is missing.
        raise KeyError(
            f"create_utility_matrix requires column(s) {missing}; got {list(df.columns)}"
        )
    return df.pivot_table(index='Track', columns='User', values='Popularity', fill_value=0)

def collaborative_filtering(df, song_index, top_n=5):
    """Recommend tracks whose per-user popularity profile resembles one song's.

    Builds the Track x User utility matrix, computes cosine similarity between
    track rows, and returns the rows of `df` belonging to the most similar
    tracks.

    Args:
        df: DataFrame with 'Track', 'User' and 'Popularity' columns.
        song_index: Row position in `df` of the input song.
        top_n: Maximum number of distinct tracks to recommend.

    Returns:
        The rows of `df` (in their original order) whose 'Track' is among the
        recommended tracks; an empty frame if the input track is absent from
        the utility matrix.
    """
    utility_matrix = create_utility_matrix(df)
    similarity_matrix = cosine_similarity(utility_matrix)

    song_list = list(utility_matrix.index)
    # The pivot's rows are ordered by track name, not by df row position, so
    # song_index must be translated via the track name before indexing the
    # similarity matrix.
    target_song = df.iloc[song_index]['Track']
    if target_song not in song_list:
        return df.iloc[0:0]
    target_pos = song_list.index(target_song)
    scores = similarity_matrix[target_pos]

    # Exclude the input song by position; with tied scores it is not
    # guaranteed to be the top-ranked entry.
    candidates = [pos for pos in range(len(song_list)) if pos != target_pos]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    recommended_tracks = [song_list[pos] for pos in candidates[:top_n]]
    return df[df['Track'].isin(recommended_tracks)]

def recommend_songs_collaborative(query, top_n=5):
    """Search Spotify for `query` and print collaborative-filtering recommendations.

    Collaborative filtering pivots on a 'User' column. The frame built by
    format_data2 has no such column, so this function reports that and returns
    instead of failing with a KeyError inside the pivot.

    Args:
        query: Song name to search for.
        top_n: Maximum number of recommendations to print.

    Returns:
        None. All output is printed.
    """
    print(f"Fetching data for query: {query}")
    search_results = search_playlist(query, 'track')
    if not search_results:
        print("No results found.")
        return

    df = format_data2(search_results)
    # A successful response can still contain zero tracks.
    if df.empty:
        print("No results found.")
        return
    print("Fetched Data:")
    print(df)

    if 'User' not in df.columns:
        print("\nCollaborative filtering needs a 'User' column (per-user data), "
              "which the fetched search results do not contain.")
        return

    song_index = 0  # Assuming the first result is the input song
    collab_recs = collaborative_filtering(df, song_index, top_n)
    print("\nCollaborative Filtering Recommendations:")
    print(collab_recs)

# Example usage
# input() waits for the user to type a song name, so this cell pauses an
# unattended "Run All" until a value is entered. Rebinds `query`.
query = input("Enter your preferred song name: ")
recommend_songs_collaborative(query)