In [9]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score, fbeta_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from scipy.stats import percentileofscore
from xgboost import XGBClassifier

# import warnings
# warnings.filterwarnings("ignore")

In [10]:
# Location of the dataset CSV. Set the DATASET_PATH environment variable to
# run the notebook on another machine; the original local path is kept as the
# default so existing setups keep working unchanged.
DATASET_PATH = os.environ.get(
    'DATASET_PATH',
    r'C:\Users\rayva\Desktop\HWs & Assignments\Machine Learning\Final Project\dataset.csv',
)

# The first row of the file holds the column names.
data = pd.read_csv(DATASET_PATH, header=0)

In [11]:
# Remove the columns considered irrelevant for the rest of the analysis.
data = data.drop(columns=['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name'])


In [12]:
# Count the missing values in each remaining column.
data.isna().sum()

popularity           0
duration_ms          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
track_genre          0
artists.1           19
dtype: int64

##### Finding the popularity threshold for the 20th percentile

In [13]:
# Percentile rank of a popularity score of 10 within the 'popularity' column.
# A result close to 20 means a score of 10 marks (roughly) the 20th percentile.
percentileofscore(data['popularity'], 10, kind='rank')

20.25964912280702

In [14]:
# Binary target: True when a track's popularity is strictly greater than 10,
# the score found to sit at roughly the 20th percentile of popularity.
# The comparison already produces booleans, so wrapping it in
# np.where(..., True, False) is unnecessary.
data['is_above_20'] = data['popularity'] > 10
data.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,artists.1,is_above_20
0,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,Gen Hoshino,True
1,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,Ben Woodward,True
2,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,Ingrid Michaelson;ZAYN,True
3,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,Kina Grannis,True
4,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,Chord Overstreet,True


In [15]:
# Features: every column except the target flag, the raw popularity score the
# flag is derived from, and the 'artists.1' column.
x = data.drop(columns=['popularity', 'artists.1', 'is_above_20'])
# Target: the boolean popularity flag.
y = data['is_above_20']

In [16]:
# Preview the first rows of the feature frame.
x.head()

Unnamed: 0,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [17]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=47,
)

In [18]:
# Fit the one-hot encoder on the training genres only, so the test set is
# later encoded with categories learned from the training data.
encoder = OneHotEncoder()
encoder.fit(X_train[['track_genre']])
genre_list = np.array(encoder.get_feature_names_out(['track_genre']))

# Dense indicator columns, labelled with the training row index so they line
# up with X_train when concatenated.
X_train_encoded = pd.DataFrame(
    encoder.transform(X_train[['track_genre']]).toarray(),
    index=X_train.index,
    columns=genre_list,
)

# Swap the raw genre column for its one-hot columns (appended at the end).
X_train = pd.concat([X_train.drop(columns=['track_genre']), X_train_encoded], axis=1)

In [19]:
# Sanity check: (rows, columns) of the training features after one-hot encoding.
X_train.shape

(91200, 128)

In [20]:
# Encode the test genres with the encoder fitted on the training set (transform
# only, no refit) and label the columns with the same genre feature names.
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[['track_genre']]).toarray(),
    index=X_test.index,
    columns=genre_list,
)

# Swap the raw genre column for its one-hot columns (appended at the end).
X_test = pd.concat([X_test.drop(columns=['track_genre']), X_test_encoded], axis=1)

In [21]:
# Sanity check: (rows, columns) of the test features after one-hot encoding.
X_test.shape

(22800, 128)

In [22]:
# Rescale every feature to the [0, 1] range. The scaler is fitted on the
# training set only and then applied unchanged to the test set.
scaler = MinMaxScaler()

# The scaler returns plain arrays, so rebuild the DataFrames with BOTH the
# original column names and the original row index. Keeping the index means
# X_train / X_test stay label-aligned with y_train / y_test.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [23]:
# Summary statistics (count, mean, std, quartiles, min/max) of the raw popularity score.
data['popularity'].describe()

count    114000.000000
mean         33.238535
std          22.305078
min           0.000000
25%          17.000000
50%          35.000000
75%          50.000000
max         100.000000
Name: popularity, dtype: float64

### Logistic Regression

In [None]:
# Logistic Regression:
# The saga solver is used because it supports the l1, l2 and elastic-net
# penalties searched below. It shuffles the data internally, so a fixed
# random_state keeps the results repeatable.
logistic_regression = LogisticRegression(max_iter=5000, solver='saga', random_state=47)

# Inverse regularisation strengths shared by every penalty.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

# The elastic-net penalty requires an explicit l1_ratio; without one every
# elastic-net candidate fails to fit and is scored as NaN by the grid search.
# It therefore gets its own sub-grid, so l1_ratio is only passed where it is used.
# Add more l1_ratio values here to search the L1/L2 mix more finely.
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': c_values},
    {'penalty': ['elasticnet'], 'C': c_values, 'l1_ratio': [0.5]},
]

# 3-fold cross-validated search, selecting on accuracy and using all CPU cores.
grid_search = GridSearchCV(estimator=logistic_regression, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### Random Forest

In [None]:
# Random Forest:
# Forest construction is randomised (bootstrap samples, feature sub-sampling),
# so a fixed random_state is needed for repeatable scores.
random_forest = RandomForestClassifier(random_state=47)

# Forest size and tree-complexity settings to search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
# 3-fold cross-validated search selecting on accuracy. n_jobs=-1 runs the
# candidate fits in parallel on all cores, as the other grid searches in this
# notebook do; the grid has 108 candidates, so a serial search is very slow.
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### Multi-Layer Perceptron

In [None]:
# Multi-Layer Perceptron (MLP):
# Weight initialisation and mini-batch shuffling are random, so a fixed
# random_state is needed for repeatable scores.
mlp_classifier = MLPClassifier(random_state=47)

# Network architectures, activations, L2 penalty (alpha) and iteration budgets to search.
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 20, 10)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [100, 200, 300]
}

# 3-fold cross-validated search selecting on accuracy. n_jobs=-1 runs the
# candidate fits in parallel on all cores, as the other grid searches in this
# notebook do.
grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### SVM

In [None]:
# Support Vector Machine (SVM):
# probability=True enables predict_proba (needed for ROC-AUC below). The
# probability estimates come from an internal, randomised cross-validation,
# so random_state is fixed to make them repeatable.
# NOTE(review): max_iter=500 is a hard cap on solver iterations; fits that hit
# the cap stop before converging, so treat these scores with care.
svm_classifier = SVC(probability=True, max_iter=500, random_state=47)  # Setting probability to True for calculating ROC-AUC

# Regularisation strengths and kernels to search.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# 3-fold cross-validated search selecting on accuracy. n_jobs=-1 runs the
# candidate fits in parallel on all cores, as the other grid searches in this
# notebook do.
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out test set.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### XGBoost

In [24]:
# XGBoost:
# Gradient-boosted trees tuned with a 3-fold cross-validated grid search.
xgb_classifier = XGBClassifier()

# Number of boosting rounds, tree depth, shrinkage and minimum child weight to search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
}

# Candidates are ranked by accuracy; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# The best parameter combination, refitted on the full training set.
best_model = grid_search.best_estimator_

# Hard class labels and positive-class probabilities for the held-out test set.
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]

# Threshold-based metrics use the labels; ROC-AUC uses the probabilities.
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)

for label, value in (("Accuracy:", accuracy), ("ROC-AUC Score:", roc_auc), ("F1 Score:", f1)):
    print(label, value)

Accuracy: 0.8870614035087719
ROC-AUC Score: 0.9188935195158616
F1 Score: 0.9310224746189495
