In [509]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL.ImageColor import colormap
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, make_scorer, mean_squared_error, r2_score
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)
from sklearn.metrics import precision_recall_curve
from imblearn.over_sampling import RandomOverSampler, SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter


In [510]:
# MLflow configuration: tracking server, experiment name, and scikit-learn
# autologging. Values are named so they are easy to spot and change.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "part-2 | Spotify | begin"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

In [511]:
# Load the raw track table, then keep only rows whose year is not 2023 and
# whose popularity is non-zero (both conditions applied in a single mask).
df = pd.read_csv('data/spotify_data.csv')
df = df.loc[(df['year'] != 2023) & (df['popularity'] != 0)]

In [512]:
# Derive the binary target and a duration feature.
#
# * `is_pop`   - True when popularity is strictly greater than 50.
# * `duration` - track length in minutes. `duration_ms` is in milliseconds
#   (per its name), so the divisor is 60_000; the previous `/ 60` produced a
#   value that was neither seconds nor minutes.
#
# `assign` builds a new frame instead of writing into the boolean-filtered
# `df`, which can emit SettingWithCopyWarning on pandas versions without
# copy-on-write. Re-running the cell yields the same result.
df = df.assign(
    is_pop=df['popularity'] > 50,
    duration=df['duration_ms'] / 60_000,
)
# df['pop_temp'] = (df['tempo'] >= 80) & (df['tempo'] <= 140)

In [513]:
# Candidate feature columns.
# NOTE(review): this list is not referenced by the visible cells, and it
# differs from `numerical` (it has 'danceability' and 'year' but no 'energy')
# - confirm which feature set is intended.
include = [
    # audio descriptors
    'danceability',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration',
    # track metadata
    'year',
    'genre',
    'key',
    'time_signature',
    'mode',
]

In [514]:
# Columns routed through the numeric branch of the ColumnTransformer
# (median imputation followed by standard scaling).
numerical = [
    'tempo',
    'duration',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
]

In [515]:
# Columns routed through the categorical branch of the ColumnTransformer
# (most-frequent imputation followed by one-hot encoding).
categorical = [
    'genre',
    'key',
    'time_signature',
    'mode',
]

In [516]:
# Build the feature matrix and target, then carve out a 20% hold-out set.
X = df[numerical + categorical]
y = df['is_pop']

# Cast integer-typed feature columns to float64.
# NOTE(review): presumably done so the MLflow input example / signature uses
# doubles rather than integers - confirm.
X = X.astype({col: 'float64' for col in X.select_dtypes(include='int').columns})

# Stratify on the target so train and test keep the same positive-class rate;
# the search tunes `scale_pos_weight` for class imbalance, and an unstratified
# split can shift that rate between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Single-row sample logged alongside the model.
input_example = X_train.iloc[:1]

In [517]:
# Preprocessing: numeric and categorical columns get separate sub-pipelines,
# combined by a ColumnTransformer. Step names are part of the parameter paths
# used elsewhere (e.g. `estimator__...`), so they are kept as-is.

# Numeric branch: fill gaps with the median, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Any column not listed in either group is passed through untouched.
column_branches = [
    ('num', numerical_pipeline, numerical),
    ('cat', categorical_pipeline, categorical),
]
ct = ColumnTransformer(transformers=column_branches, remainder='passthrough')

In [518]:
# Hyper-parameter search space for the XGBoost step. Keys are written without
# the pipeline prefix here and get the `estimator__` prefix added below so
# they address the 'estimator' step of the pipeline.
_xgb_search_space = {
    'n_estimators': [100, 200],          # number of trees
    'max_depth': [3, 5, 7],              # low to moderate tree complexity
    'learning_rate': [0.01, 0.1],        # small and standard learning rates
    'subsample': [0.8, 1.0],             # row sampling (slight regularization)
    'colsample_bytree': [0.5, 0.8],      # column sampling to reduce overfitting
    'scale_pos_weight': [3, 5, 7],       # positive-class weight for imbalance
    'reg_alpha': [0, 1],                 # L1 regularization
    'reg_lambda': [1, 10],               # L2 regularization
}

param_grid = {
    f'estimator__{name}': candidates
    for name, candidates in _xgb_search_space.items()
}

In [519]:
# End-to-end model: column preprocessing followed by an XGBoost classifier
# with default settings (hyper-parameters are supplied by the search).
# Earlier variants kept for reference:
#   ('ros', RandomOverSampler(random_state=42)) inserted before the estimator
#   ('estimator', XGBClassifier(subsample=0.8, learning_rate=0.1, n_estimators=500, reg_alpha=1, max_depth=7, colsample_bytree=0.6,reg_lambda=10, gamma=0, scale_pos_weight=5))
pipeline = Pipeline(steps=[
    ('ct', ct),
    ('estimator', XGBClassifier()),
])

In [520]:
# Randomized search over `param_grid`: 50 sampled candidates, 5-fold CV,
# scored by F1, run on all available cores with a fixed sampling seed.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=50,
    scoring="f1",
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Alternatives kept for reference:
# model = GridSearchCV(estimator=pipeline, scoring='average_precision', param_grid=param_grid, n_jobs=-1, cv=5, verbose=2)
# model = pipeline

In [None]:
with mlflow.start_run() as run:
    # Run the hyper-parameter search on the training split.
    model.fit(X_train, y_train)

    # Log the refit best pipeline and build its run-relative URI.
    mlflow.sklearn.log_model(
        sk_model=model.best_estimator_,
        artifact_path="best_model",
        input_example=input_example,
    )
    model_uri = f"runs:/{run.info.run_id}/best_model"

    # Evaluation frame: test features (integer columns cast to float64)
    # plus the ground-truth labels in a 'target' column.
    int_cols = X_test.select_dtypes(include='int').columns
    eval_data = X_test.astype({col: 'float64' for col in int_cols}).assign(target=y_test)
    mlflow.evaluate(
        model=model_uri,
        model_type='classifier',
        data=eval_data,
        targets='target',
    )

    # Confusion matrix of the best estimator's predictions on the test split,
    # saved locally and attached to the run as an artifact.
    y_pred = model.best_estimator_.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap='Blues')
    disp.ax_.set_title("Confusion Matrix")
    disp.figure_.savefig("nonnormal_confusion_matrix.png")
    mlflow.log_artifact("nonnormal_confusion_matrix.png")
    plt.show()

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END estimator__colsample_bytree=0.5, estimator__learning_rate=0.1, estimator__max_depth=5, estimator__n_estimators=200, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator__scale_pos_weight=3, estimator__subsample=0.8; total time=  10.6s
[CV] END estimator__colsample_bytree=0.8, estimator__learning_rate=0.1, estimator__max_depth=5, estimator__n_estimators=100, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator__scale_pos_weight=3, estimator__subsample=0.8; total time=   6.7s
[CV] END estimator__colsample_bytree=0.8, estimator__learning_rate=0.01, estimator__max_depth=7, estimator__n_estimators=100, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator__scale_pos_weight=3, estimator__subsample=0.8; total time=   8.0s
[CV] END estimator__colsample_bytree=0.5, estimator__learning_rate=0.1, estimator__max_depth=7, estimator__n_estimators=100, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator



[CV] END estimator__colsample_bytree=0.5, estimator__learning_rate=0.1, estimator__max_depth=5, estimator__n_estimators=200, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator__scale_pos_weight=3, estimator__subsample=0.8; total time=  11.4s
[CV] END estimator__colsample_bytree=0.8, estimator__learning_rate=0.1, estimator__max_depth=5, estimator__n_estimators=100, estimator__reg_alpha=1, estimator__reg_lambda=10, estimator__scale_pos_weight=3, estimator__subsample=0.8; total time=   6.4s
[CV] END estimator__colsample_bytree=0.8, estimator__learning_rate=0.01, estimator__max_depth=7, estimator__n_estimators=100, estimator__reg_alpha=1, estimator__reg_lambda=1, estimator__scale_pos_weight=7, estimator__subsample=0.8; total time=   8.7s
[CV] END estimator__colsample_bytree=0.5, estimator__learning_rate=0.01, estimator__max_depth=3, estimator__n_estimators=200, estimator__reg_alpha=0, estimator__reg_lambda=1, estimator__scale_pos_weight=7, estimator__subsample=1.0; total time=   7

In [505]:
    # # print('Best Parameters:', model.best_params_)
    # # print('Best CV score:', model.best_score_)
    #
    # # mlflow.log_params(model.best_params_)
    # # mlflow.sklearn.log_model(model.best_estimator_, "best_model", input_example=input_example)
    #
    # # Get predicted probabilities for the positive class
    # y_proba = model.predict_proba(X_test)[:, 1]
    #
    # # Compute precision, recall, thresholds
    # precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
    #
    # # Calculate F1 scores at each threshold
    # f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)  # Add epsilon to avoid divide-by-zero
    # best_threshold = thresholds[np.argmax(f1_scores)]
    #
    # print("Best Threshold:", best_threshold)
    #
    # y_pred_optimal = (y_proba >= best_threshold).astype(int)
    #
    # mlflow.log_metric("best_threshold", best_threshold)
    # mlflow.log_metric("test_f1", f1_score(y_test, y_pred_optimal))
    # mlflow.log_metric("test_precision", precision_score(y_test, y_pred_optimal))
    # mlflow.log_metric("test_recall", recall_score(y_test, y_pred_optimal))
    # mlflow.log_metric("test_accuracy", accuracy_score(y_test, y_pred_optimal))
    # mlflow.log_metric("test_roc_auc", roc_auc_score(y_test, y_proba))
    #
    # cm = confusion_matrix(y_test, y_pred_optimal)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    # disp.plot(cmap='Blues')
    # plt.title("Confusion Matrix")
    # plt.savefig("confusion_matrix.png")
    # mlflow.log_artifact("confusion_matrix.png")
    # plt.show()