# Spotify Song Popularity - Model Comparison

We'll compare different models (four from scikit-learn, plus XGBoost, which is a separate library) to improve our prediction accuracy:
1. Random Forest (baseline)
2. Gradient Boosting
3. Support Vector Classification
4. Neural Network
5. XGBoost

In [None]:
# Import additional models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Plotting defaults for this notebook: a default figure size of [12, 6]
# and grid lines enabled on axes.
plt.rcParams.update({
    'figure.figsize': [12, 6],
    'axes.grid': True,
})

In [None]:
# Load and prepare data (same as before)
# The path is relative, so it resolves against the notebook's working directory.
songs_csv_path = '../Untitled/Resources/spotify_songs.csv'
df = pd.read_csv(songs_csv_path)

# Create popularity categories
def categorize_popularity(x):
    """Return the popularity bucket ('High', 'Medium' or 'Low') for a score.

    A score of 67 or more is 'High'; a score of at least 34 but below 67 is
    'Medium'; anything else (including values for which both comparisons
    are false, such as NaN) is 'Low'.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for lower_bound, label in ((67, 'High'), (34, 'Medium')):
        if x >= lower_bound:
            return label
    return 'Low'

# Label every track with its popularity bucket ('High' / 'Medium' / 'Low').
df['popularity_category'] = df['track_popularity'].apply(categorize_popularity)

# Feature engineering
# Interaction term: product of energy and danceability.
df['energy_danceability'] = df['energy'] * df['danceability']
# Min-max rescale loudness using the column's own minimum and maximum.
loudness_min = df['loudness'].min()
loudness_max = df['loudness'].max()
df['loudness_scaled'] = (df['loudness'] - loudness_min) / (loudness_max - loudness_min)
# Express tempo as a fraction of the largest tempo in the data.
df['tempo_scaled'] = df['tempo'] / df['tempo'].max()

# One-hot encode genre
genre_dummies = pd.get_dummies(df['playlist_genre'], prefix='genre')
df = pd.concat([df, genre_dummies], axis=1)

# Select features
audio_features = [
    'danceability', 'energy', 'key', 'loudness_scaled', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo_scaled', 'energy_danceability'
]

# Take the genre columns directly from the encoder output. Scanning
# df.columns for the 'genre_' prefix would also select any unrelated column
# in the CSV that happens to start with the same prefix.
genre_columns = list(genre_dummies.columns)
features = audio_features + genre_columns

# Prepare data
X = df[features]
y = df['popularity_category']

# Split and scale
# Hold out 20% of the rows for testing, stratified on the target label,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Fit the scaler on the training split only and reuse it for the test split,
# so the test rows do not contribute to the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Comparison

We'll train different models and compare their performance:

In [None]:
# Candidate classifiers, keyed by the display name used in the printed
# output and in the plots. Insertion order determines reporting order.
models = {}
models['Random Forest'] = RandomForestClassifier(n_estimators=200, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=200, random_state=42)
models['Neural Network'] = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
models['SVM'] = SVC(kernel='rbf', random_state=42)

# Fit each model on the scaled training split, score it on the scaled test
# split, and keep both the accuracy and the text report for later cells.
results = {}
for name in models:
    model = models[name]
    print(f"Training {name}...")

    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {'accuracy': accuracy, 'report': report}

    print(f"{name} Accuracy: {accuracy:.4f}\n")
    print("Classification Report:")
    print(report)
    print("-"*50)

## Visualize Results

In [None]:
# Bar chart of test-set accuracy, one bar per model in `results` order.
model_names = list(results)
accuracies = [entry['accuracy'] for entry in results.values()]

plt.figure(figsize=(10, 6))
plt.bar(model_names, accuracies)
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Report the model with the highest accuracy as a (name, result-dict) pair;
# on a tie the model that appears first in `results` is kept.
best_name = max(results, key=lambda model_name: results[model_name]['accuracy'])
best_model = (best_name, results[best_name])
print(f"\nBest Model: {best_model[0]} with accuracy: {best_model[1]['accuracy']:.4f}")

## Cross-Validation Check

Let's verify our results with cross-validation:

In [None]:
# Perform cross-validation
# Imported here because only this cell needs it: the scaler must be
# re-fitted inside every fold, which requires a scaler+model pipeline.
from sklearn.pipeline import make_pipeline

cv_results = {}
for name, model in models.items():
    # X_train_scaled was standardized with statistics from the whole
    # training split, so cross-validating on it lets each validation fold
    # influence the scaling of its own training folds. Scoring a pipeline on
    # the unscaled X_train keeps preprocessing inside each fold instead.
    # cross_val_score works on clones of the estimator it is given, so the
    # already-fitted entries in `models` are left untouched.
    fold_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_pipeline, X_train, y_train, cv=5)
    cv_results[name] = {
        'mean': scores.mean(),
        'std': scores.std()
    }
    # NOTE: the printed interval is two standard deviations wide on each
    # side, whereas the error bars below show one standard deviation.
    print(f"{name} CV Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Plot cross-validation results: mean fold accuracy per model, with the
# fold-to-fold standard deviation as the error bar.
plt.figure(figsize=(10, 6))
means = [cv_results[model]['mean'] for model in cv_results]
stds = [cv_results[model]['std'] for model in cv_results]
plt.bar(list(cv_results), means, yerr=stds)
plt.title('Cross-Validation Results')
plt.xlabel('Model')
plt.ylabel('CV Accuracy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()