In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Load dataset
df = pd.read_csv("SpotifyAudioFeaturesApril2019.csv")

df["popular"] = df["popularity"].apply(lambda x: 1 if x >= 60 else 0)


In [7]:
# Drop non-numeric and unnecessary columns
X = df.drop(columns=[
    "artist_name", "track_id", "track_name",
    "popularity", "popular"
], errors="ignore")

X = X.select_dtypes(include=[np.number])
y = df["popular"]


In [19]:
# Train
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Validation + Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)


In [11]:
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5]
}

rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best parameters:", grid.best_params_)


Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [13]:
X_final = pd.concat([X_train, X_val])
y_final = pd.concat([y_train, y_val])

best_model.fit(X_final, y_final)


In [15]:
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9460714285714286

Confusion Matrix:
 [[18514     8]
 [ 1049    29]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     18522
           1       0.78      0.03      0.05      1078

    accuracy                           0.95     19600
   macro avg       0.87      0.51      0.51     19600
weighted avg       0.94      0.95      0.92     19600

