#   [Overview] Machine Learning Model for determining a hit song
###     This is a quick overview of the required steps in order to create this model for binary classification via linear regression
1. Import packages, read data
2. Prepare Data
3. Process Data
4. Train Model
5. Evaluate Model
6. Tune hyper-parameters
***

## Import Packages
Here we will install the required packages, we will be using `pandas` to visualize data. `sklearn` is the our library in order to use machine learning on Python. 

Important notes, if using google collab to train the data please run this script first.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/{folder_name}
%pwd

After running these scripts, go ahead and follow through with the rest of the notebook.

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Prepare Data, and reading it in.
I decided on the arbitrary number `75` to consider if a song is currently popular or not. Modify to fit needs such as class imbalances.


In [1]:
from datetime import datetime

def calculate_song_age(release_year, release_month, release_day):
    # Define the end date as December 31, 2022
    end_date = datetime(2023, 1, 1)
    
    # Check if only the year is given (no month and day), then use January 1st of that year as the release date
    if release_month is None or release_day is None:
        release_date = datetime(release_year, 1, 1)
    else:
        # Create a datetime object for the release date
        release_date = datetime(release_year, release_month, release_day)
    
    # Calculate the difference in days between the release date and the end date
    age_in_days = (end_date - release_date).days
    return age_in_days

In [4]:
#importing and reading Data
popLimit = 73

data = pd.read_csv("spotifyInfoFinal.csv")

# adding the popular tag to data
data['isPopular'] = data['popularity'] > popLimit
data.value_counts('isPopular')
## While there is a slight class inbalance, I think it is acceptable enough for what I'm working towards
data['explicit'] = data['explicit'].astype(int)
data['isPopular'] = data['isPopular'].astype(int)

def convert_date(entry):
    if isinstance(entry, int) or (isinstance(entry, str) and entry.isdigit()):
        # if the entry is an integer or a string of digits, assume it's a year and create a full date with January 1st
        return pd.to_datetime(f'{entry}-01-01')
    else:
        # otherwise, use pandas to parse the date normally
        return pd.to_datetime(entry, errors='coerce')  # 'coerce' will set invalid parsing as NaT

data['date'] = data['date'].apply(convert_date)
data['releaseDate'] = data['releaseDate'].apply(convert_date)
data['year'] = data['date'].dt.year
data['release_year'] = data['releaseDate'].dt.year
data['release_month'] = data['releaseDate'].dt.month
data['release_day'] = data['releaseDate'].dt.day
# Iterative Approach where I try some stuff out
data['comboEnergyDance'] = data['energy'] * data['danceability']
data['songAge'] = data.apply(lambda row: calculate_song_age(row['release_year'], 
                                                            row['release_month'], 
                                                            row['release_day']), axis=1) 

data.drop(['date', 'mode', 'timeSig', 'releaseDate', 'name', 'id', 'album_Name', 'author'], axis=1, inplace=True)
print(data.value_counts('isPopular'))


categorical_features = ['albumType']
onehot_encoder = OneHotEncoder(sparse_output=False) 

ct = ColumnTransformer(
    [('onehot', onehot_encoder, categorical_features)],
    remainder='passthrough'  # Keeps the rest of the columns that are not transformed
)

encoded_features = ct.fit_transform(data)

encoded_feature_names = ct.named_transformers_['onehot'].get_feature_names_out(input_features=categorical_features)

# Get the rest of the column names for columns that were not transformed
# This does not include the categorical columns we one-hot encoded
remainder_column_names = [col for col in data.columns if col not in categorical_features]

# Combine one-hot encoded feature names with remainder column names
all_feature_names = list(encoded_feature_names) + remainder_column_names

data_encoded = pd.DataFrame(encoded_features, columns=all_feature_names)

isPopular
1    867
0    831
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split

X = data_encoded.drop(['isPopular', 'popularity'], axis=1)
y = data_encoded['isPopular']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

In [6]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression classifier
model = LogisticRegression(max_iter = 1000)

In [7]:
model.fit(X_train, y_train)
print("Number of iterations:", model.n_iter_)

Number of iterations: [40]


In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Predicting the test set results
y_pred = model.predict(X_test)

# Calculating the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculating precision, recall, and F1 score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Calculating the ROC-AUC score
# For ROC-AUC, you'll need the probabilities of the positive class
y_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

print(f'ROC-AUC Score: {roc_auc:.2f}')

Accuracy: 0.67
Precision: 0.66
Recall: 0.72
F1 Score: 0.69
ROC-AUC Score: 0.72


In [None]:
from sklearn.utils import shuffle

# Shuffle the target labels
y_shuffled = shuffle(y_train, random_state=42)

# Train the model on the shuffled target
model.fit(X_train, y_shuffled)

# Evaluate on the original test set
y_pred_shuffled = model.predict(X_test)
accuracy_shuffled = accuracy_score(y_test, y_pred_shuffled)
print(f'Accuracy on shuffled labels: {accuracy_shuffled:.2f}')


## XGBOOST MODEL
I decided to mess around with other models to see if data complexity is issue or if linear regression was too simple

In [9]:
import xgboost as xgb

In [24]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)



# Set up the XGBoost parameters
# Start with default parameters, then you can tune them later
params = {
    'device': 'cuda',
    'max_depth': 6,
    'eta': 0.01,
    'objective': 'binary:hinge',  # Use 'binary:logistic' for binary classification problems
    'eval_metric': 'logloss',        # Use 'logloss' for evaluation metric of binary classification
}

# Train the model
# 'num_boost_round' is equivalent to the number of trees
bst = xgb.train(params, dtrain, num_boost_round=100)

y_pred_proba = bst.predict(dtest)
y_pred = (y_pred_proba > 0.5).astype(int)  # Thresholding at 0.5

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)  # Notice that we use y_pred_proba here

# Print out the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")


Accuracy: 0.62
Recall: 0.90
Precision: 0.58
F1 Score: 0.71
ROC-AUC Score: 0.61


In [25]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

search_space = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 2, 3],
    'n_estimators': [100, 200, 300],
}

# Define the grid search
grid = GridSearchCV(
    estimator=model,
    param_grid=search_space,
    scoring='roc_auc',
    cv=3,
    verbose=1,
)

# Fit the grid search
result = grid.fit(X_train, y_train)

# Summarize results
print(f"Best: {result.best_score_} using {result.best_params_}")

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


KeyboardInterrupt: 