In [None]:
!pip install kagglehub --quiet

import kagglehub
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
# Fetch the Spotify tracks dataset through KaggleHub; the call returns the
# local location the CSV path is built from.
path = kagglehub.dataset_download("amitanshjoshi/spotify-1million-tracks")
file_path = f"{path}/spotify_data.csv"

# Read the CSV into a DataFrame, report the row count and preview the first rows.
df = pd.read_csv(file_path)
print("Dataset loaded. Total rows:", df.shape[0])
df.head()


Downloading from https://www.kaggle.com/api/v1/datasets/download/amitanshjoshi/spotify-1million-tracks?dataset_version_number=1...


100%|██████████| 77.1M/77.1M [00:00<00:00, 96.8MB/s]

Extracting files...





Dataset loaded. Total rows: 1159764


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


In [None]:
# Restrict the data to the eight most frequent genres.
top_8 = df['genre'].value_counts().head(8).index

# Filter to those genres, renumber the rows from zero, then report the new shape.
df = df.loc[df['genre'].isin(top_8)].reset_index(drop=True)
print("Filtered dataset size:", df.shape)


Filtered dataset size: (168309, 20)


In [None]:
# Encode the genre strings as integer class labels. The encoded values live in
# `y` only, so `df['genre']` keeps the original names and this cell yields the
# same labels (and the same `label_encoder.classes_`) when it is re-run.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df['genre']),
    index=df.index,
    name='genre',
)

# Drop the target plus the text/identifier columns and 'Unnamed: 0'
# (presumably a saved row index — confirm). errors='ignore' skips any of
# these names that are not present in the frame.
X = df.drop(columns=['genre', 'artist_name', 'track_name', 'track_id', 'Unnamed: 0'], errors='ignore')

# Stratified 80/20 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Integer and float columns of X; these are the columns handed to the scaler.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns


In [None]:
# Preprocessing step: standardise the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_cols)],
)

# Baseline model: preprocessing followed by a seeded, depth-limited
# Extra Trees ensemble.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42),
        ),
    ],
)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
pipeline.fit(X_train, y_train)


In [None]:
# Score the fitted baseline pipeline on the held-out split.
y_pred = pipeline.predict(X_test)

accuracy, f1 = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='weighted'),
)

print(f"Extra Trees Classifier - Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")


Extra Trees Classifier - Accuracy: 0.5947, F1-score: 0.5891


In [None]:
# NOTE(review): this cell repeats the evaluation cell directly above it and
# prints the same numbers; it looks safe to delete.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

print(f"Extra Trees Classifier - Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")


Extra Trees Classifier - Accuracy: 0.5947, F1-score: 0.5891


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the 'classifier' step of `pipeline`; the
# `classifier__` prefix addresses that step's parameters.
param_dist = dict(
    classifier__n_estimators=[100, 150, 200],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5, 10],
)

# Randomised search: 5 sampled combinations, each scored by 3-fold
# cross-validated accuracy, with a fixed seed for the sampling.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the best cross-validated score and the settings that produced it.
print("Best Accuracy:", random_search.best_score_)
print("Best Params:", random_search.best_params_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits




Best Accuracy: 0.6775642800704333
Best Params: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}


In [None]:
# Feature importances of the classifier inside the best pipeline found by the search.
best_model = random_search.best_estimator_.named_steps['classifier']
importances = best_model.feature_importances_

# Label the importances with the columns held by the fitted preprocessor
# instead of `X.columns`: a later cell adds columns to `X` in place, after
# which `X.columns` no longer matches `importances` in length and re-running
# this cell would raise a ValueError.
fitted_preprocessor = random_search.best_estimator_.named_steps['preprocessor']
model_columns = fitted_preprocessor.transformers_[0][2]

# Rank features from most to least important and show the top ten.
top_features = pd.Series(importances, index=model_columns).sort_values(ascending=False)
print(top_features.head(10))


popularity          0.131656
instrumentalness    0.119342
acousticness        0.113324
danceability        0.090083
energy              0.087786
valence             0.083168
loudness            0.072456
duration_ms         0.062785
year                0.061528
speechiness         0.043284
dtype: float64


In [None]:
# Interaction features: element-wise products of two pairs of existing
# columns, written into X in place.
X['energy_dance'] = X['energy'].mul(X['danceability'])
X['valence_tempo'] = X['valence'].mul(X['tempo'])


In [None]:
# Step 1: strip the "classifier__" step prefix from the tuned parameter names
# so they can be passed straight to ExtraTreesClassifier.
best_params_clean = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}

# Step 2: build the PCA experiment as one pipeline fitted on the existing
# training split. The previous version of this cell read X_train_pca,
# y_train_pca, X_test_pca and y_test_pca, none of which are defined anywhere
# in the notebook, so it raised NameError on a fresh kernel.
# NOTE(review): the original PCA settings are not recorded in the notebook;
# keeping the components that explain 95% of the variance is an assumption —
# confirm the intended number of components.
pipeline_pca = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, svd_solver='full')),
    # Seeded like the baseline classifier so the score is reproducible.
    ('classifier', ExtraTreesClassifier(**best_params_clean, random_state=42)),
])

pipeline_pca.fit(X_train, y_train)

# Step 3: Evaluate on the held-out split.
print("PCA Accuracy:", pipeline_pca.score(X_test, y_test))


PCA Accuracy: 0.6338898461172836


In [None]:
# Pull the fitted preprocessing step out of the best pipeline from the search.
preprocessor = random_search.best_estimator_.named_steps['preprocessor']

# Last element of the first transformer entry: the columns given to the scaler.
numeric_features = preprocessor.transformers_[0][-1]

# Pair each importance with its column name, highest first.
top_features = (
    pd.Series(importances, index=numeric_features)
    .sort_values(ascending=False)
)

# Names of the 15 highest-ranked features.
# NOTE(review): the filtered frame has 20 columns and 5 are dropped before
# modelling, so the model appears to see only 15 features and this cut-off
# keeps all of them — confirm whether a smaller number was intended.
top_15_features = top_features.index[:15].tolist()


In [None]:
# Feature matrix restricted to the selected columns (the next cell repeats
# this assignment before splitting).
X_top = X.loc[:, top_15_features]


In [None]:
# Feature matrix restricted to the selected columns.
X_top = X[top_15_features]

# Re-split with the same seed, test size and stratification as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, random_state=42, stratify=y
)

# Tuned hyper-parameters from the search. random_state is fixed (the baseline
# classifier is seeded with 42 as well) so the score is reproducible and
# differences between experiments are not just seed noise.
pipeline_top = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', ExtraTreesClassifier(**best_params_clean, random_state=42))
])

pipeline_top.fit(X_train, y_train)
y_pred = pipeline_top.predict(X_test)

print("Accuracy (Top Features):", accuracy_score(y_test, y_pred))


Accuracy (Top Features): 0.6832333194700255


In [None]:
# Class balance check: fraction of rows belonging to each encoded genre label.
y.value_counts(normalize=True)


Unnamed: 0_level_0,proportion
genre,Unnamed: 1_level_1
3,0.129833
5,0.12846
2,0.127082
0,0.125347
1,0.124283
4,0.12385
6,0.122293
7,0.118853


In [None]:
from imblearn.over_sampling import SMOTE

# Only use top 15 selected features
X_top = X[top_15_features]

X_train, X_test, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): the class-share output above shows every genre at roughly
# 12-13% of the rows, so the classes are already close to balanced — confirm
# that SMOTE is needed here. It is also applied before scaling.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("After SMOTE:", pd.Series(y_resampled).value_counts())

# Retrain on the resampled data. random_state is fixed (the baseline
# classifier is seeded with 42 as well) so the metrics are reproducible and
# comparable with the other experiments.
pipeline_smote = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', ExtraTreesClassifier(**best_params_clean, random_state=42))
])

pipeline_smote.fit(X_resampled, y_resampled)
y_pred = pipeline_smote.predict(X_test)

# accuracy_score and f1_score come from the notebook's import cell.
print("SMOTE Accuracy:", accuracy_score(y_test, y_pred))
print("SMOTE F1 Score:", f1_score(y_test, y_pred, average='weighted'))


After SMOTE: genre
0    17482
3    17482
1    17482
6    17482
2    17482
4    17482
5    17482
7    17482
Name: count, dtype: int64
SMOTE Accuracy: 0.6816291367120195
SMOTE F1 Score: 0.680323076084586


In [None]:
# NOTE(review): same two interaction columns as the earlier feature cell;
# re-running simply overwrites them with identical values.
X['energy_dance'] = np.multiply(X['energy'], X['danceability'])
X['valence_tempo'] = np.multiply(X['valence'], X['tempo'])


In [None]:
# Work on a copy of X and (re)create the two interaction columns on it, so
# this cell does not depend on whether they were already added to X.
X_new = X.copy()
X_new['energy_dance'] = X_new['energy'] * X_new['danceability']
X_new['valence_tempo'] = X_new['valence'] * X_new['tempo']

# Split with the same seed, test size and stratification as before.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42, stratify=y)

# Fit on original (non-SMOTE) data just to get importances. random_state is
# fixed so the importance ranking, and therefore `top_15_features`, is the
# same on every run.
pipeline_new = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', ExtraTreesClassifier(**best_params_clean, random_state=42))
])

pipeline_new.fit(X_train, y_train)

# Rank the importances by training column name and keep the 15 highest.
model = pipeline_new.named_steps['classifier']
importances = model.feature_importances_
top_features = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
top_15_features = top_features.head(15).index.tolist()
print(top_features.head(15))


popularity          0.126915
instrumentalness    0.114737
acousticness        0.103820
energy              0.079434
danceability        0.072952
valence             0.061328
energy_dance        0.060465
loudness            0.060048
year                0.059301
duration_ms         0.057827
valence_tempo       0.043736
speechiness         0.039419
liveness            0.033776
tempo               0.030334
key                 0.027223
dtype: float64


In [None]:
# Select new top features
X_top = X_new[top_15_features]

# Split with the same seed and stratification, then oversample the training
# split only, reusing the `smote` object created in the earlier SMOTE cell.
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42, stratify=y)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Retrain with the tuned hyper-parameters. random_state is fixed (the baseline
# classifier is seeded with 42 as well) so the final metrics are reproducible
# and comparable with the earlier experiments.
pipeline_final = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', ExtraTreesClassifier(**best_params_clean, random_state=42))
])

pipeline_final.fit(X_resampled, y_resampled)
y_pred = pipeline_final.predict(X_test)

# Evaluate on the untouched test split.
print("Final Accuracy:", accuracy_score(y_test, y_pred))
print("Final F1 Score:", f1_score(y_test, y_pred, average='weighted'))


Final Accuracy: 0.6788663775176758
Final F1 Score: 0.6780282327762525
