In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import percentileofscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, f1_score, fbeta_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Location of the raw dataset. Set the DATASET_CSV environment variable to run
# the notebook on another machine; when it is unset, the original local path
# is used so existing runs behave exactly as before.
DATASET_CSV = os.environ.get(
    'DATASET_CSV',
    r'C:\Users\rayva\Desktop\HWs & Assignments\Machine Learning\Final Project\dataset.csv',
)

# header=0: the first row of the file supplies the column names.
data = pd.read_csv(DATASET_CSV, header=0)

In [None]:
# Remove the columns considered irrelevant for the analysis.
columns_to_drop = ['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name']
data = data.drop(columns=columns_to_drop)


In [None]:
# Count the missing values in each column.
data.isna().sum()

##### Finding the threshold for the 50th percentile

In [None]:
# Percentile rank of a popularity score of 35 within the 'popularity' column.
popularity_scores = data['popularity']
percentileofscore(popularity_scores, 35, kind='rank')

In [None]:
# Binary target: True when a track's popularity is strictly greater than 35.
data['is_above_50'] = data['popularity'] > 35
data.head()

In [None]:
# Target vector: the binary popularity label.
y = data['is_above_50']

# Feature matrix: every column except the raw popularity score, the label
# derived from it, and the 'artists.1' column.
non_feature_columns = ['popularity', 'artists.1', 'is_above_50']
x = data.drop(columns=non_feature_columns)

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=47,
    stratify=y,
)

In [None]:
# One-hot encode the categorical 'track_genre' column, fitting the encoder on
# the training split only.
# handle_unknown='ignore' makes a genre that was not seen during fitting encode
# as an all-zero row instead of raising when the encoder is applied later.
encoder = OneHotEncoder(handle_unknown='ignore')

X_train_encoded = encoder.fit_transform(X_train[['track_genre']])
# get_feature_names_out already returns an ndarray of the generated column names.
genre_list = encoder.get_feature_names_out(['track_genre'])
X_train_encoded = pd.DataFrame(X_train_encoded.toarray(), columns=genre_list, index=X_train.index)

# Replace the raw genre column with its indicator columns (no in-place mutation).
X_train = pd.concat([X_train.drop(columns=['track_genre']), X_train_encoded], axis=1)

In [None]:
# Show the (rows, columns) of the training features after one-hot encoding.
X_train.shape

In [None]:
# Encode the test genres with the already-fitted encoder, reusing the column
# names generated from the training split.
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[['track_genre']]).toarray(),
    columns=genre_list,
    index=X_test.index,
)

# Replace the raw genre column with its indicator columns.
X_test = pd.concat([X_test.drop(columns=['track_genre']), X_test_encoded], axis=1)

In [None]:
# Show the (rows, columns) of the test features after one-hot encoding.
X_test.shape

In [None]:
# Min-max scale the features. The scaler is fitted on the training split only,
# and the same fitted mapping is then applied to the test split.
scaler = MinMaxScaler()

# Keep the original row index: fit_transform/transform return plain arrays, and
# rebuilding the frames without `index=` would reset them to 0..n-1, leaving
# them misaligned with y_train / y_test for any label-based pandas operation.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Summary statistics of the raw popularity score.
data['popularity'].describe()

### Logistic Regression

In [None]:
# Logistic Regression:
# The 'saga' solver is stochastic, so the seed is fixed to make the search and
# the reported metrics repeatable.
logistic_regression = LogisticRegression(max_iter=5000, solver='saga', random_state=47)

# 'elasticnet' is only valid together with an explicit l1_ratio, so it gets its
# own sub-grid. Without l1_ratio every elastic-net candidate fails to fit and
# GridSearchCV scores it as NaN, silently wasting a third of the search.
regularization_strengths = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': regularization_strengths},
    {'penalty': ['elasticnet'], 'C': regularization_strengths, 'l1_ratio': [0.25, 0.5, 0.75]},
]

# 3-fold cross-validated search ranked by accuracy, using all available cores.
grid_search = GridSearchCV(estimator=logistic_regression, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### Random Forest

In [None]:
# Random Forest:
# Forest construction is randomized; fixing the seed makes the grid search and
# the reported test metrics repeatable across runs.
random_forest = RandomForestClassifier(random_state=47)

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
# 3-fold cross-validated search ranked by accuracy, using all available cores.
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### Multi-Layer Perceptron

In [None]:
# Multi-Layer Perceptron (MLP):
# Weight initialisation and batch shuffling are random; fixing the seed makes
# the grid search and the reported test metrics repeatable across runs.
mlp_classifier = MLPClassifier(random_state=47)

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 20, 10)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [100, 200, 300]
}

# 3-fold cross-validated search ranked by accuracy. n_jobs=-1 runs the fits in
# parallel, consistent with the other grid searches in this notebook.
grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### SVM

In [None]:
# Support Vector Machine (SVM):
# probability=True is needed for predict_proba (used for ROC-AUC below); the
# probability calibration is randomized, so the seed is fixed for repeatable
# results. max_iter=500 caps the solver's iterations per fit.
svm_classifier = SVC(probability=True, max_iter=500, random_state=47)

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# 3-fold cross-validated search ranked by accuracy. n_jobs=-1 runs the fits in
# parallel, consistent with the other grid searches in this notebook.
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
probabilities = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)

### XGBoost

In [None]:
# XGBoost:
xgb_classifier = XGBClassifier()

# Candidate hyper-parameter values; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    min_child_weight=[1, 3, 5],
)

# 3-fold cross-validated search ranked by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best candidate on the held-out test split.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
# Second column of predict_proba holds the positive-class probabilities.
probabilities = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, predictions)
roc_auc = roc_auc_score(y_test, probabilities)
f1 = f1_score(y_test, predictions)

print("Accuracy:", accuracy)
print("ROC-AUC Score:", roc_auc)
print("F1 Score:", f1)