# 📘 IMPORTS & DATA LOADING

In [None]:
# 1. Libraries and Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings('ignore')

# Read the track table from 'data.csv' (path is resolved against the working directory)
df = pd.read_csv(filepath_or_buffer='data.csv')
# Peek at the first rows of the freshly loaded frame
df.head()

# 📊 PREPROCESSING & EDA

In [None]:
# 2. Preprocessing and EDA
# Remove exact duplicate rows, then every row that has at least one missing value.
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
# Dropping rows leaves gaps in the index. Rebuild a contiguous 0..n-1 index so
# that index labels and row positions agree: the arrays produced by the scaler
# below are positional, and code further down in this file takes labels from
# `df.index` and uses them to address those arrays and `df.iloc`.
df.reset_index(drop=True, inplace=True)

# Convert release_date to datetime
# errors='coerce' turns unparseable values into NaT; this runs after dropna,
# so rows whose date could not be parsed are kept.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Audio feature columns
audio_features = ['valence', 'acousticness', 'danceability', 'duration_ms',
                  'energy', 'instrumentalness', 'key', 'liveness',
                  'loudness', 'mode', 'popularity', 'speechiness', 'tempo', 'year']

# Standardize features
# Row i of `scaled_features` corresponds to the i-th row of `df` (positional).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[audio_features])

# EDA: Popularity distribution
plt.figure(figsize=(10, 4))
sns.histplot(df['popularity'], bins=50, kde=True)
plt.title('Popularity Distribution')
plt.show()

# 🎯 GENRE CLUSTERING VISUALIZATION WITH PCA

In [None]:
# 3. Genre Clustering Visualization with PCA
# Project the standardized features onto two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Partition the same standardized features into 10 k-means clusters (fixed seed).
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit(scaled_features).labels_

# Store the cluster label and both projected coordinates next to each track.
df['cluster'] = clusters
df['pca_one'], df['pca_two'] = pca_result[:, 0], pca_result[:, 1]

# Scatter the 2-D projection, coloured by cluster label.
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='pca_one',
    y='pca_two',
    hue='cluster',
    palette='tab10',
    s=10,
).set_title('PCA Clustering Visualization')
plt.show()

# 🤖 CONTENT BASED FILTERING (COSINE SIMILARITY)

In [None]:
# Columns that make up each track's feature vector for content-based similarity
audio_features = [
    'valence', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'instrumentalness', 'key', 'liveness',
    'loudness', 'mode', 'popularity', 'speechiness', 'tempo', 'year',
]

# Fit a fresh scaler on those columns, then standardize them;
# row i of `content_scaled` is the i-th row of `df`.
scaler = StandardScaler()
scaler.fit(df[audio_features])
content_scaled = scaler.transform(df[audio_features])

# Brute-force cosine nearest-neighbour index over the standardized vectors
# (default of 11 neighbours; callers may override n_neighbors per query).
nn_model_content = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=11)
nn_model_content.fit(content_scaled)

# Recommendation function using NearestNeighbors (memory-efficient)
def get_content_recommendations(song_name, top_n=10):
    """Return the tracks whose audio features are closest to ``song_name``.

    The song is looked up case-insensitively in ``df['name']``; when several
    rows match, the first one is used as the query.

    Args:
        song_name: Title of the query song.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the ``name``, ``artists`` and ``popularity`` columns
        of up to ``top_n`` neighbouring tracks, or a message string when the
        song is not present in ``df``.
    """
    # Work with row *positions*, not index labels: `content_scaled` is a plain
    # array and `df.iloc` is positional, while `df.index` labels can have gaps
    # after rows were dropped.
    is_match = (df['name'].str.lower() == song_name.lower()).to_numpy()
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        return f"'{song_name}' not found in dataset."
    query_pos = int(match_positions[0])

    # Ask for one extra neighbour so the query row can be removed, but never
    # more rows than the model holds (kneighbors raises in that case).
    n_neighbors = min(top_n + 1, content_scaled.shape[0])
    _, indices = nn_model_content.kneighbors(
        content_scaled[query_pos].reshape(1, -1), n_neighbors=n_neighbors
    )

    # Drop the query explicitly instead of assuming it is the first hit:
    # rows with identical feature vectors tie at distance 0 in any order.
    rec_positions = [pos for pos in indices[0] if pos != query_pos][:top_n]
    return df.iloc[rec_positions][['name', 'artists', 'popularity']]

# 📈 COLLABORATIVE FILTERING WITH NEAREST NEIGHBORS (COSINE SIMILARITY)

In [None]:
# 5. Create the User-Song Interaction Matrix (pivot table) for Collaborative Filtering
# Keep the most popular row for each song name, then the 10,000 most popular songs.
top_songs_df = df.sort_values('popularity', ascending=False).drop_duplicates('name').head(10000)
top_songs_df = top_songs_df[['name', 'popularity']].copy()

# Simulate user interactions
# NOTE(review): when exactly 10,000 songs are selected, the three stacked copies
# of a song sit 10,000 rows apart and `% 5000` maps all of them to the same
# user_id, so every song ends up with a single simulated listener. Confirm
# whether that is the intended simulation.
top_songs_df = pd.concat([top_songs_df] * 3, ignore_index=True)
top_songs_df['user_id'] = top_songs_df.index % 5000

# Group and pivot
# Rows are users, columns are song names, values are mean popularity (0 = no interaction).
grouped_df = top_songs_df.groupby(['user_id', 'name']).agg({'popularity': 'mean'}).reset_index()
pivot_df = grouped_df.pivot(index='user_id', columns='name', values='popularity').fillna(0)
sparse_matrix = csr_matrix(pivot_df.values)

# Fit KNN model
# The recommender queries this model with one song column of `pivot_df`
# (a vector with one entry per user) and maps the returned indices back through
# `pivot_df.columns`. The model therefore has to be fitted on the song-by-user
# matrix (one row per song), i.e. the transpose of the user-by-song matrix above.
song_user_matrix = csr_matrix(pivot_df.values.T)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(song_user_matrix)

# 🎵 COLLABORATIVE RECOMMENDATIONS

In [None]:
# 6. Recommendation Functions
def get_collab_recommendations(song_name, top_n=10):
    """Return songs whose interaction vectors are closest to ``song_name``.

    The song is matched against the columns of ``pivot_df``: an exact match is
    preferred, otherwise the first case-insensitive match is used (the same
    matching rule as the content-based recommender).

    ``model_knn`` must have been fitted on rows that line up with
    ``pivot_df.columns`` (one row per song), because the neighbour indices it
    returns are mapped back to song names through ``pivot_df.columns``.

    Args:
        song_name: Title of the query song.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with a single ``'Recommended Songs'`` column, or a message
        string when the song is not a column of the interaction matrix.
    """
    if song_name in pivot_df.columns:
        column = song_name
    else:
        wanted = song_name.lower()
        column = next(
            (col for col in pivot_df.columns if str(col).lower() == wanted), None
        )
        if column is None:
            return f"'{song_name}' not found in interaction matrix."

    song_vector = pivot_df[column].values.reshape(1, -1)

    # One extra neighbour leaves room to drop the query song itself; the cap
    # avoids the error kneighbors raises when asked for more rows than it holds.
    n_neighbors = min(top_n + 1, model_knn.n_samples_fit_)
    _, indices = model_knn.kneighbors(song_vector, n_neighbors=n_neighbors)

    # Remove the query by name rather than by position: songs with identical
    # interaction vectors tie at distance 0, so the query is not guaranteed
    # to be the first neighbour.
    neighbour_names = pivot_df.columns[indices[0]]
    similar_songs = [name for name in neighbour_names if name != column][:top_n]
    return pd.DataFrame({'Recommended Songs': similar_songs})

def recommend(song_name, method='content', top_n=10):
    """Dispatch a recommendation request to the chosen recommender.

    Args:
        song_name: Title of the query song.
        method: ``'content'`` for the audio-feature recommender or
            ``'collab'`` for the interaction-matrix recommender.
        top_n: Number of recommendations to request.

    Returns:
        Whatever the selected recommender returns, or a message string when
        ``method`` is neither ``'content'`` nor ``'collab'``.
    """
    # Guard clauses: each recognised method returns immediately.
    if method == 'content':
        return get_content_recommendations(song_name, top_n)
    if method == 'collab':
        return get_collab_recommendations(song_name, top_n)
    return "Method must be either 'content' or 'collab'."

# Example song for testing
song_name = 'China'  # Replace with a real song name from your dataset

# Run both recommenders for the same song, printing a heading before each result
for heading, method in (
    ("Content-based Recommendations:", 'content'),
    ("Collaborative-based Recommendations:", 'collab'),
):
    print(heading)
    print(recommend(song_name, method=method, top_n=5))