In [3]:
import json
import pandas as pd

def load_and_merge_data(filepath1: str, filepath2: str) -> dict:
    """Load JSON data from two files and merge them."""
    with open(filepath1, 'r') as f1, open(filepath2, 'r') as f2:
        data1 = json.load(f1)
        data2 = json.load(f2)
    # Merge the dictionaries, with data2 overwriting data1 in case of conflicts
    merged_data = {**data1, **data2}
    return merged_data

def create_ml_dataframe(data: dict) -> pd.DataFrame:
    """Create a DataFrame with the specified features for ML analysis."""
    records = []
    
    for artist, attributes in data.items():
        # Basic attributes
        popularity = attributes.get('popularity', 0)
        followers = attributes.get('followers', 0)
        
        # Collaborator-based features
        collaborators = attributes.get('collaborators', {})
        num_collaborators = len(collaborators)
        
        sum_collab_popularity = 0
        sum_collab_in_playlist = 0
        for collab_name, collab_info in collaborators.items():
            # Sum collaborator popularity if available in the dataset
            collab_popularity = data.get(collab_name, {}).get('popularity', 0)
            sum_collab_popularity += collab_popularity
            # Count if collaborator is in the playlist
            collab_in_playlist = data.get(collab_name, {}).get('in_playlist', False)
            sum_collab_in_playlist += int(collab_in_playlist)
        
        # Add each artist's record to the dataset
        records.append({
            'name': artist,
            'popularity': popularity,
            'followers': followers,
            'num_collaborators': num_collaborators,
            'sum_collaborators_popularity': sum_collab_popularity,
            'sum_collaborators_in_playlist': sum_collab_in_playlist,
            'in_playlist': int(attributes.get('in_playlist', False)),
        })
    
    # Convert to DataFrame
    df = pd.DataFrame(records)
    return df

# Load and merge data from the two JSON files
data = load_and_merge_data('11_08_playlist_layer2.json', '10_26_random_2layer.json')

# Create the ML DataFrame
df_ml = create_ml_dataframe(data)

# Save to CSV for ML analysis
df_ml.to_csv('ml_dataset.csv', index=False)
print("ML dataset created and saved to 'ml_dataset.csv'")


ML dataset created and saved to 'ml_dataset.csv'


In [4]:
df_ml.head()

Unnamed: 0,name,popularity,followers,num_collaborators,sum_collaborators_popularity,sum_collaborators_in_playlist,in_playlist
0,ROSÉ,84,7766971,5,326,2,1
1,Bruno Mars,94,60240128,25,1816,3,1
2,Billie Eilish,96,100730714,13,899,1,1
3,Sabrina Carpenter,95,13640801,20,1325,3,1
4,Sevdaliza,77,704987,15,929,2,1


In [6]:
df_ml.describe()

Unnamed: 0,popularity,followers,num_collaborators,sum_collaborators_popularity,sum_collaborators_in_playlist,in_playlist
count,3857.0,3857.0,3857.0,3857.0,3857.0,3857.0
mean,39.244491,1230998.0,18.742546,296.782214,0.387348,0.035261
std,24.011799,6364674.0,22.512729,450.290474,0.67431,0.184462
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,605.0,6.0,75.0,0.0,0.0
50%,40.0,9202.0,13.0,162.0,0.0,0.0
75%,57.0,220849.0,22.0,319.0,1.0,0.0
max,100.0,124431100.0,230.0,5687.0,6.0,1.0


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

# Load your data
df = pd.read_csv('ml_dataset.csv')

# Separate features and target variable
X = df[['popularity', 'followers', 'num_collaborators', 'sum_collaborators_popularity', 'sum_collaborators_in_playlist']]
y = df['in_playlist']

# Split into training (90%) and testing (10%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Classifier': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    'LightGBM': LGBMClassifier(),
    'Neural Network (MLP)': MLPClassifier(max_iter=300)
}

# Cross-validation and model evaluation
for model_name, model in models.items():
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1')
    print(f"{model_name} - Cross-Validation F1 Score: {cv_scores.mean():.2f}")
    
    # Fit model and make predictions
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Print results
    print(f"{model_name} - Test Set Performance:")
    print(f"  Accuracy: {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")
    print(f"  F1 Score: {f1:.2f}")
    print("-" * 30)


Logistic Regression - Cross-Validation F1 Score: 0.53
Logistic Regression - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.50
  Recall: 0.56
  F1 Score: 0.53
------------------------------
Random Forest - Cross-Validation F1 Score: 0.58
Random Forest - Test Set Performance:
  Accuracy: 0.97
  Precision: 0.60
  Recall: 0.56
  F1 Score: 0.58
------------------------------
Support Vector Classifier - Cross-Validation F1 Score: 0.54
Support Vector Classifier - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.50
  Recall: 0.56
  F1 Score: 0.53
------------------------------
Decision Tree - Cross-Validation F1 Score: 0.47
Decision Tree - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.57
  Recall: 0.50
  F1 Score: 0.53
------------------------------
Gradient Boosting - Cross-Validation F1 Score: 0.58
Gradient Boosting - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.55
  Recall: 0.38
  F1 Score: 0.44
------------------------------


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost - Cross-Validation F1 Score: 0.56
XGBoost - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.56
  Recall: 0.62
  F1 Score: 0.59
------------------------------


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 96, number of negative: 2680
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 704
[LightGBM] [Info] Number of data points in the train set: 2776, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034582 -> initscore=-3.329224
[LightGBM] [Info] Start training from score -3.329224
[LightGBM] [Info] Number of positive: 96, number of negative: 2681
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 2777, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034570 -> initscore=-3.329597
[LightGBM] [Info



Neural Network (MLP) - Cross-Validation F1 Score: 0.56
Neural Network (MLP) - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.53
  Recall: 0.50
  F1 Score: 0.52
------------------------------


In [14]:
from sklearn.model_selection import GridSearchCV

# Set up the parameter grid
param_grid = {
    'learning_rate': [0.15, 0.2, 0.5],
    'max_depth': [2, 3, 4],
    'n_estimators': [80, 100, 150],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Initialize XGBoost model
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Evaluate the tuned model on the test set
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Tuned XGBoost - Test Set Performance:")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 80, 'subsample': 1.0}
Best F1 Score: 0.6228338753313475
Tuned XGBoost - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.53
  Recall: 0.50
  F1 Score: 0.52


Parameters: { "use_label_encoder" } are not used.



In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [30, 50, 70],          # Number of trees
    'max_depth': [None],         # Maximum depth of trees
    'min_samples_split': [8, 10, 15],         # Minimum samples to split a node
    'min_samples_leaf': [1],           # Minimum samples at a leaf node
    'max_features': [None]   # Number of features to consider for splits
}

# Initialize RandomForest model
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and the best F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# Evaluate the tuned model on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Tuned Random Forest - Test Set Performance:")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")


Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 70}
Best F1 Score: 0.6030461599245794
Tuned Random Forest - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.50
  Recall: 0.50
  F1 Score: 0.50


In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train Random Forest and XGBoost on the SMOTE-resampled data
models = {
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(eval_metric='logloss', use_label_encoder=False)
}

for model_name, model in models.items():
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"{model_name} with SMOTE - Test Set Performance:")
    print(f"  Accuracy: {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall: {recall:.2f}")
    print(f"  F1 Score: {f1:.2f}")
    print("-" * 30)


Random Forest with SMOTE - Test Set Performance:
  Accuracy: 0.96
  Precision: 0.54
  Recall: 0.94
  F1 Score: 0.68
------------------------------
XGBoost with SMOTE - Test Set Performance:
  Accuracy: 0.97
  Precision: 0.58
  Recall: 0.94
  F1 Score: 0.71
------------------------------


Parameters: { "use_label_encoder" } are not used.



In [25]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Initialize stratified cross-validation
stratified_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on SMOTE-resampled data
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False, learning_rate=0.1, max_depth=5, n_estimators=150)
cv_scores = cross_val_score(xgb, X_resampled, y_resampled, cv=stratified_kf, scoring='f1')

print("Stratified K-Fold CV F1 Score:", cv_scores.mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Stratified K-Fold CV F1 Score: 0.9906680623752017


Parameters: { "use_label_encoder" } are not used.



In [28]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Initialize XGBoost model with initial parameters
final_xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Train the final model on SMOTE-resampled data
final_xgb.fit(X_resampled, y_resampled)

# Make predictions on the original test set
y_pred = final_xgb.predict(X_test_scaled)

# Evaluate final model performance on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Final XGBoost Model with SMOTE - Test Set Performance:")
print(f"  Accuracy: {accuracy:.2f}")
print(f"  Precision: {precision:.2f}")
print(f"  Recall: {recall:.2f}")
print(f"  F1 Score: {f1:.2f}")

# Save the final model
joblib.dump(final_xgb, "final_xgb_model_with_smote.pkl")
print("Model saved as 'final_xgb_model_with_smote.pkl'")


Final XGBoost Model with SMOTE - Test Set Performance:
  Accuracy: 0.97
  Precision: 0.58
  Recall: 0.94
  F1 Score: 0.71
Model saved as 'final_xgb_model_with_smote.pkl'


Parameters: { "use_label_encoder" } are not used.

