In [143]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

data = pd.read_csv('KaggleDataset.csv')
data.shape

(114000, 21)

In [145]:
# Drop rows with missing values
data = data.dropna()
data.shape

(113999, 21)

In [167]:
# Define target
data['is_hit'] = (data['popularity'] > 75).astype(int)

# Drop unneccesary columns to speed up model training
X = pd.get_dummies(data.drop(columns=['popularity', 'is_hit', 'track_id', 'album_name', 'track_name', 'track_genre', 'artists'], errors='ignore'))
y = data['is_hit']

# Training and testing sets (because dataset is large, I'll use 5% for test data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=20, stratify=y)

print("Training Class Distribution:", y_train.value_counts())
print("Test Class Distribution:", y_test.value_counts())

Training Class Distribution: is_hit
0    106006
1      2293
Name: count, dtype: int64
Test Class Distribution: is_hit
0    5579
1     121
Name: count, dtype: int64


### Random Forest Model

In [200]:
# Initialize and train the model
model = RandomForestClassifier(random_state=21, n_estimators=100)
model.fit(X_train, y_train)

# Predict on the test set
#y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
threshold = 0.03  # Adjust to a lower threshold
y_pred_thresholded = (y_pred_proba >= threshold).astype(int)

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba) # For finding optimal threshold
# Evaluate the model
classification_rep = classification_report(y_test, y_pred_thresholded)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

# Display
print("Classification Report:")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90      5579
           1       0.09      0.87      0.17       121

    accuracy                           0.82      5700
   macro avg       0.55      0.84      0.53      5700
weighted avg       0.98      0.82      0.88      5700

AUC-ROC: 0.93


Recall: 87%
Precision: 9%

In [204]:
# Finding threshold suitable for my goal (recall >= 0.85)
# If the goal to successfully predict song popularity score over 75 was lower than 85% of the time
# then precision would have been drastically better
for p, r, t in zip(precision, recall, thresholds):
    if r >= 0.6:
        print("Threshold:", round(t, 2), "Recall:", round(r, 2), "Precision:", round(p, 2))

Threshold: 0.0 Recall: 1.0 Precision: 0.02
Threshold: 0.01 Recall: 0.96 Precision: 0.04
Threshold: 0.02 Recall: 0.9 Precision: 0.06
Threshold: 0.03 Recall: 0.87 Precision: 0.09
Threshold: 0.04 Recall: 0.83 Precision: 0.14
Threshold: 0.05 Recall: 0.81 Precision: 0.2
Threshold: 0.06 Recall: 0.77 Precision: 0.26
Threshold: 0.07 Recall: 0.74 Precision: 0.32
Threshold: 0.08 Recall: 0.72 Precision: 0.37
Threshold: 0.09 Recall: 0.72 Precision: 0.47
Threshold: 0.1 Recall: 0.71 Precision: 0.53
Threshold: 0.11 Recall: 0.71 Precision: 0.58
Threshold: 0.12 Recall: 0.71 Precision: 0.63
Threshold: 0.13 Recall: 0.7 Precision: 0.69
Threshold: 0.14 Recall: 0.69 Precision: 0.72
Threshold: 0.15 Recall: 0.69 Precision: 0.74
Threshold: 0.16 Recall: 0.68 Precision: 0.75
Threshold: 0.17 Recall: 0.66 Precision: 0.76
Threshold: 0.18 Recall: 0.65 Precision: 0.79
Threshold: 0.19 Recall: 0.65 Precision: 0.81
Threshold: 0.2 Recall: 0.65 Precision: 0.81
Threshold: 0.21 Recall: 0.65 Precision: 0.82
Threshold: 0.22 R

### Random Forest Model with balanced data and scaled features

In [185]:
# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [187]:
# Scale features for better model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [206]:
# Initialize and train the model
model = RandomForestClassifier(random_state=21, n_estimators=100)
model.fit(X_train_scaled, y_train_resampled)

# Predict on the test set
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
threshold = 0.1  # Adjust to a lower threshold
y_pred_thresholded = (y_pred_proba >= threshold).astype(int)

# Evaluate the model
classification_rep = classification_report(y_test, y_pred_thresholded)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

# Display metrics and confusion matrix
print("Classification Report:")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.75      0.86      5579
           1       0.07      0.86      0.13       121

    accuracy                           0.75      5700
   macro avg       0.53      0.80      0.49      5700
weighted avg       0.98      0.75      0.84      5700

AUC-ROC: 0.89


Recall: 86%
Precision: 7%

### Linear SVM

Trying to balance out the dataset.
Using class_weight to distribute weight between classes.

**Formula of Class Weights:**
wj=n_samples / (n_classes * n_samplesj)

+ wj is the weight for each class(j signifies the class)
+ n_samplesis the total number of samples or rows in the dataset
+ n_classesis the total number of unique classes in the target
+ n_samplesjis the total number of rows of the respective class

Popularity under 75: 113 999 / 2 * 111 585 = 113 999 / 223 170 = 0.51

Popularity under 75: 113 999 / 2 * 2414 = 113 999 / 4828 = 23.61

Can't put weight for class 1 too high as the model will then heavily favor predicting class 1.

In [208]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [210]:
# Standardize features and use LinearSVC
svm_model = make_pipeline(StandardScaler(), LinearSVC(class_weight={0: 0.51, 1: 23.61}, random_state=42, max_iter=10000))

# Fit the model on the training data
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test) # Not useful to adjust threshold

# Linear SVM does not provide probabilities directly, but we can use the decision function
y_scores = svm_model.decision_function(X_test)

classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_scores)

print("Classification Report (Linear SVM):")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report (Linear SVM):
              precision    recall  f1-score   support

           0       0.99      0.60      0.75      5579
           1       0.04      0.78      0.08       121

    accuracy                           0.61      5700
   macro avg       0.52      0.69      0.41      5700
weighted avg       0.97      0.61      0.74      5700

AUC-ROC: 0.74


Recall: 78%
Precision: 4%

### Linear SVM with balanced data and scaled features

In [217]:
# Standardize features and use LinearSVC
svm_model = make_pipeline(StandardScaler(), LinearSVC(class_weight='balanced', random_state=42, max_iter=10000))

# Fit the model on the training data
svm_model.fit(X_train_scaled, y_train_resampled)

# Predict on the test set
y_pred = svm_model.predict(X_test_scaled)

# Linear SVM does not provide probabilities directly, but we can use the decision function
y_scores = svm_model.decision_function(X_test_scaled)

# Evaluate the model
classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_scores)

print("Classification Report (Linear SVM):")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report (Linear SVM):
              precision    recall  f1-score   support

           0       0.99      0.72      0.83      5579
           1       0.04      0.55      0.08       121

    accuracy                           0.72      5700
   macro avg       0.51      0.63      0.45      5700
weighted avg       0.97      0.72      0.82      5700

AUC-ROC: 0.7


Recall: 55%
Precision: 4%

### K-nearest neighbors

In [222]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [224]:
# Standardize features and use KNN
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5, weights='distance'))

# Fit the model on the training data
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_model.predict(X_test) # Threshold does not help enough

# Predict probabilities (for AUC-ROC)
y_pred_proba = knn_model.predict_proba(X_test)[:, 1]

classification_rep = classification_report(y_test, y_pred_thresholded)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("Classification Report (K-Nearest Neighbors):")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report (K-Nearest Neighbors):
              precision    recall  f1-score   support

           0       1.00      0.75      0.86      5579
           1       0.07      0.86      0.13       121

    accuracy                           0.75      5700
   macro avg       0.53      0.80      0.49      5700
weighted avg       0.98      0.75      0.84      5700

AUC-ROC: 0.77


Recall: 86%
Precision: 7%

### K-nearest neighbors with balanced data and scaled features¶

In [227]:
# Standardize features and use KNN
knn_model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5, weights='distance')) # Closer neighbors are given more influence than farther ones

# Fit the model on the training data
knn_model.fit(X_train_scaled, y_train_resampled)

# Predict on the test set
y_pred = knn_model.predict(X_test_scaled) #

# Predict probabilities (for AUC-ROC)
y_pred_proba = knn_model.predict_proba(X_test_scaled)[:, 1]

classification_rep = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("Classification Report (K-Nearest Neighbors):")
print(classification_rep)
print("AUC-ROC:", round(roc_auc, 2))

Classification Report (K-Nearest Neighbors):
              precision    recall  f1-score   support

           0       0.99      0.87      0.92      5579
           1       0.08      0.50      0.13       121

    accuracy                           0.86      5700
   macro avg       0.53      0.69      0.53      5700
weighted avg       0.97      0.86      0.91      5700

AUC-ROC: 0.78


Recall: 50%
Precision: 8%

# Results
### Models in order based on performance

1. Random Forest (R: 87%, P: 9%)
2. Random Forest (Balanced & Scaled) (R: 86%, P: 7%)
3. K-nearest neighbors (R: 86%, P: 7%)
4. Linear SVM (R: 78%, P: 4%)
5. Linear SVM (Balanced & Scaled) (R: 55%, P: 4%)
7. K-nearest neighbors (Balanced & Scaled) (R: 50%, P: 8%)