In [None]:
!pip install spotipy

In [None]:
import spotipy
import pandas as pd
import json
from spotipy.oauth2 import SpotifyClientCredentials
import config

## Cleaning the songs dataset

In [None]:
# Machine-specific absolute location of the curated audio-features CSV.
dataset_path = r'C:\Users\samya\PyCharmProject\song-recommendations-\Dataset\audio_features_dataset_curated.csv'
song_arc_df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
song_arc_df.head()

In [None]:
# Count missing values per column in the freshly loaded frame.
song_arc_df.isnull().sum()

In [None]:
# (rows, columns) of the frame before any cleaning.
song_arc_df.shape

In [None]:
# Summarize column dtypes and non-null counts.
song_arc_df.info()

In [None]:
# Display the frame as loaded, before any columns or rows are dropped.
song_arc_df

In [None]:
# List the column names available before dropping any.
song_arc_df.columns

In [None]:
# Columns that are removed before clustering; the drop happens in place.
unused_columns = ['explicit', 'mode', 'key', 'time_signature', 'popularity']
song_arc_df.drop(columns=unused_columns, inplace=True)

In [None]:
# Confirm which columns remain after the drop.
song_arc_df.columns

In [None]:
# Display the frame after the unused columns were removed.
song_arc_df

In [None]:
# (rows, columns) after the column drop.
song_arc_df.shape

In [None]:
# Remove, in place, every row that has a missing value in any column.
song_arc_df.dropna(inplace=True)

In [None]:
# Re-count missing values per column; every count should now be zero.
song_arc_df.isnull().sum()

In [None]:
# (rows, columns) after rows with missing values were removed.
song_arc_df.shape

In [None]:
# Summary statistics for the numeric columns of the cleaned frame.
song_arc_df.describe()

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
def ms_to_mmss(ms):
    """Format a duration given in milliseconds as a zero-padded ``MM:SS`` string.

    Args:
        ms: Duration in milliseconds. Integers and floats are both accepted;
            fractional seconds are truncated.

    Returns:
        A string such as ``"03:05"``. Minutes are not capped at 59, so a
        100-minute duration yields ``"100:00"``.
    """
    # Avoid naming a local `min`, which would shadow the builtin.
    minutes, seconds = divmod(ms // 1000, 60)
    # The ".0f" spec renders float inputs (e.g. 3.0) as "03" instead of "3.0",
    # while formatting integer inputs exactly as before.
    return f"{minutes:02.0f}:{seconds:02.0f}"
# Derive a human-readable MM:SS label for every track from its millisecond duration.
duration_labels = song_arc_df['duration_ms'].map(ms_to_mmss)
song_arc_df['duration_mmss'] = duration_labels

In [None]:
# Display the frame, now including the derived duration_mmss column.
song_arc_df

In [None]:
# Number of distinct values in each column.
song_arc_df.nunique()

In [None]:
# Re-check dtypes and non-null counts after cleaning and adding duration_mmss.
song_arc_df.info()

In [None]:

# Keep only rows where the identifying text columns are all present.
essential_columns = ["artists", "album_name", "track_name"]
df_cleaned = song_arc_df.dropna(subset=essential_columns)


In [None]:
# Display the frame that the clustering steps below operate on.
df_cleaned

In [None]:

# Audio-feature columns used as clustering inputs; the order here fixes the
# column order of the feature matrix built from this list.
features = [
    "duration_ms",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]


In [None]:
# Echo the selected feature names.
features

In [None]:
# Feature matrix: one row per song, one column per entry in `features`.
X=df_cleaned[features]

In [None]:
# StandardScaler's public home is sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module happens to
# import it for its own use.
from sklearn.preprocessing import StandardScaler


# Standardize every feature to zero mean and unit variance so that no single
# column dominates the distance computations used by the clustering below.
scaler= StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Inspect the standardized feature array returned by the scaler.
X_scaled

In [None]:
# Fit a 15-cluster KMeans model on the standardized features, then assign
# every song to its nearest centroid.
kmeans = KMeans(n_clusters=15, random_state=42).fit(X_scaled)
pred = kmeans.predict(X_scaled)

In [None]:
from sklearn.decomposition import PCA


# Project the standardized features onto their first two principal components
# so the clusters can be drawn in 2D.
pca =PCA(n_components=2)

X_pca = pca.fit_transform(X_scaled)

# Plot the 2D PCA scatter plot with cluster labels
plt.figure(figsize=(8, 6))
# tab20 instead of tab10: the model has 15 clusters, and a 10-colour map would
# force several clusters to share a colour.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=pred, cmap='tab20', alpha=0.6)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("KMeans Clustering of Songs (PCA-reduced Features)")
plt.colorbar(label='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

# Scatter plot using the first two standardized features
plt.figure(figsize=(16, 16))
# tab20 instead of tab10: 15 clusters need more than 10 distinct colours.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=pred, cmap='tab20', alpha=0.6)
# Columns 0 and 1 of X_scaled follow the order of `features`, so the axis
# labels are taken from that list rather than hard-coded.
plt.xlabel(f"Standardized {features[0]}")
plt.ylabel(f"Standardized {features[1]}")
plt.title("KMeans Clustering of Songs (15 Clusters)")
plt.colorbar(label='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# ward linkage tends to produce relatively equally sized clusters
from sklearn import cluster


agglomerative = cluster.AgglomerativeClustering(n_clusters=3,linkage='ward')
# Cluster the standardized array, consistent with the KMeans cells: ward
# linkage is distance based, so unscaled columns with large magnitudes
# (presumably duration_ms, in milliseconds) would dominate the result.
# NOTE(review): agglomerative clustering memory grows quickly with row count;
# confirm the dataset size is manageable or cluster a sample.
pred = agglomerative.fit_predict(X_scaled)

plt.figure(figsize=(5, 5))
# X is a DataFrame, so positional slicing like X[:, 0] raises; index the
# NumPy array X_scaled instead.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=pred,alpha=0.5)
plt.show()

In [None]:
# Elbow method: record the within-cluster sum of squares (inertia) for
# k = 1..15 and look for the point where adding clusters stops paying off.
k_range = range(1, 16)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, inertia, marker='o')
ax.set_title("Elbow Method for Optimal k")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Build the final 15-cluster model explicitly instead of reusing whatever
# `kmeans` object an earlier cell (e.g. the last elbow-loop iteration) left
# behind, so the stored labels do not depend on cell execution order.
kmeans = KMeans(n_clusters=15, random_state=42)
df_cleaned["cluster"] = kmeans.fit_predict(X_scaled)

In [None]:
# Display the frame with the new cluster column attached.
df_cleaned

In [None]:
from sklearn.metrics import silhouette_score
import pickle
import os

# Directory that receives one pickled KMeans model per cluster count
directory_name = "pickles"

# Create the directory (no error if it already exists)
os.makedirs(directory_name, exist_ok=True)

print(f"Directory '{directory_name}' created successfully.")

cluster_range= range(5, 25)
silhouette_scores= []


for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init = 5)
    cluster_labels = kmeans.fit_predict(X_scaled)
    # Persist the fitted model under directory_name (previously the path
    # hard-coded "pickles/", so changing directory_name had no effect here).
    filename = f"{directory_name}/kmeans_{n_clusters}.pickle"
    with open(filename, "wb") as f:
        pickle.dump(kmeans,f)
    # NOTE(review): silhouette_score works on pairwise distances and can be
    # slow on large datasets; its sample_size argument can bound the cost.
    score = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(score)




In [None]:
# Plot the silhouette score obtained for each candidate cluster count.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_range, silhouette_scores, marker='o', linestyle='-', color='b')
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Score")
ax.set_title("Silhouette Score for Different Cluster Counts")
ax.set_xticks(cluster_range)
ax.grid(True)
plt.show()

In [None]:
def load(filename = "filename.pickle", directory = "pickles"):
    """Load a pickled object from ``directory``.

    Args:
        filename: Name of the pickle file inside ``directory``.
        directory: Folder to read from. Defaults to ``"pickles"``, the folder
            the original implementation hard-coded.

    Returns:
        The unpickled object, or ``None`` (after printing a message) when the
        file does not exist. Callers should check for ``None`` before use.
    """
    path = f"{directory}/{filename}"
    try:
        with open(path, "rb") as f:
            # pickle.load can execute arbitrary code embedded in the file;
            # only load pickles that were produced by a trusted source.
            return pickle.load(f)

    except FileNotFoundError:
        print("File not found!")
        return None

In [None]:
# Reload the pickled 12- and 15-cluster models.
kmeans12, kmeans15 = load("kmeans_12.pickle"), load("kmeans_15.pickle")

In [None]:
# Cluster labels from the reloaded 12-cluster model for every row of X_scaled.
kmeans12.predict(X_scaled)

In [None]:
# Cluster labels from the reloaded 15-cluster model for every row of X_scaled.
kmeans15.predict(X_scaled)