In [14]:
# FIXED CELL 1: Use correct API limit

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

print("🤖 Building Your First ML Model: Popularity Predictor")
print("=" * 60)

# Get YOUR real Spotify data
api_base = "http://127.0.0.1:8000"

print("Step 1: Fetching your Spotify data...")

# FIX: Use limit=30 instead of 50 (matches your API validation)
response = requests.get(f"{api_base}/audio-features?limit=30")

print(f"API Response Status: {response.status_code}")

if response.status_code == 200:
    data = response.json()
    audio_features = data['audio_features']
    print(f"✅ Got {len(audio_features)} of YOUR tracks!")
    
    if 'warning' in data:
        print("⚠️  Note: Using dummy audio features (Spotify API blocked)")
        print("   This is PERFECT for learning ML concepts!")
        print("   You have REAL popularity scores to predict!")
    
elif response.status_code == 422:
    print("❌ 422 Error: Request limit too high")
    print("Trying with smaller limit...")
    
    # Fallback: try with limit=10
    response = requests.get(f"{api_base}/audio-features?limit=10")
    if response.status_code == 200:
        data = response.json()
        audio_features = data['audio_features']
        print(f"✅ Got {len(audio_features)} tracks (reduced limit)")
    else:
        print("❌ Still failed with smaller limit")
        raise Exception(f"API call failed with status {response.status_code}")
else:
    print(f"❌ API call failed with status {response.status_code}")
    raise Exception("API call failed")

# Convert to DataFrame
audio_df = pd.DataFrame(audio_features)

print(f"\nYour Music Data:")
print(f"Songs: {len(audio_df)}")
print("Sample of your tracks:")
for i in range(min(5, len(audio_df))):
    print(f"  {audio_df.iloc[i]['name']} - {audio_df.iloc[i]['artist']} (popularity: {audio_df.iloc[i]['popularity']})")

# Prepare features for ML
feature_columns = [
    'danceability', 'energy', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'loudness'
]

X = audio_df[feature_columns].copy()
y = audio_df['popularity'].copy()

print(f"\nML Setup:")
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"Popularity range: {y.min()} to {y.max()}")

# Show correlations (will be weak since features are dummy)
print(f"\nFeature correlations with YOUR music popularity:")
correlations = X.corrwith(y).sort_values(ascending=False)

for feature, corr in correlations.items():
    print(f"{feature:15s}: {corr:6.3f}")

if abs(correlations).max() < 0.1:
    print("\n💡 Correlations are weak because audio features are dummy values")
    print("   But you'll learn the SAME ML concepts with real popularity scores!")

print(f"\n🎯 Ready to train ML model on YOUR music taste!")
print("✅ Data loaded successfully! Run Cell 2 to train the model!")

🤖 Building Your First ML Model: Popularity Predictor
Step 1: Fetching your Spotify data...
API Response Status: 200
✅ Got 10 of YOUR tracks!
⚠️  Note: Using dummy audio features (Spotify API blocked)
   This is PERFECT for learning ML concepts!
   You have REAL popularity scores to predict!

Your Music Data:
Songs: 10
Sample of your tracks:
  NOKIA - Drake (popularity: 86)
  Pursuit Of Happiness (Nightmare) - Kid Cudi, MGMT, Ratatat (popularity: 71)
  Big Shot (with Travis Scott) - Kendrick Lamar, Travis Scott (popularity: 60)
  MONA LISA - j-hope (popularity: 79)
  Out of Touch - Daryl Hall & John Oates (popularity: 60)

ML Setup:
Features (X): (10, 9)
Target (y): (10,)
Popularity range: 58 to 86

Feature correlations with YOUR music popularity:
danceability   :    nan
energy         :    nan
speechiness    :    nan
acousticness   :    nan
instrumentalness:    nan
liveness       :    nan
valence        :    nan
tempo          :    nan
loudness       :    nan

🎯 Ready to train ML model

In [15]:
# CELL 2: Train and Evaluate the Model
# (Run this AFTER Cell 1)

print("🏋️ TRAINING YOUR MODEL")
print("=" * 40)

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape[0]} songs")
print(f"Test set: {X_test.shape[0]} songs")

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

print("✓ Model trained!")

# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate performance metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("\n📊 MODEL PERFORMANCE:")
print("="*40)
print(f"Training R²: {train_r2:.3f}")
print(f"Test R²:     {test_r2:.3f}")
print(f"Training MAE: {train_mae:.1f}")
print(f"Test MAE:     {test_mae:.1f}")

# Check for overfitting
r2_diff = train_r2 - test_r2
if r2_diff > 0.1:
    print("⚠️  Possible overfitting detected")
else:
    print("✅ No major overfitting")

# Feature importance
print(f"\n🔍 MOST IMPORTANT FEATURES:")
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'coefficient': model.coef_
}).sort_values('coefficient', key=abs, ascending=False)

for idx, row in feature_importance.iterrows():
    feature = row['feature']
    coef = row['coefficient']
    direction = "increases" if coef > 0 else "decreases"
    print(f"{feature:15s}: {coef:6.1f} ({direction} popularity)")

print(f"\nModel ready! Run Cell 3 for visualizations and Cell 4 to test on your Japanese song.")

🏋️ TRAINING YOUR MODEL
Training set: 8 songs
Test set: 2 songs
✓ Model trained!

📊 MODEL PERFORMANCE:
Training R²: 0.000
Test R²:     -1.376
Training MAE: 8.4
Test MAE:     7.6
⚠️  Possible overfitting detected

🔍 MOST IMPORTANT FEATURES:
danceability   :    0.0 (decreases popularity)
energy         :    0.0 (decreases popularity)
speechiness    :    0.0 (decreases popularity)
acousticness   :    0.0 (decreases popularity)
instrumentalness:    0.0 (decreases popularity)
liveness       :    0.0 (decreases popularity)
valence        :    0.0 (decreases popularity)
tempo          :    0.0 (decreases popularity)
loudness       :    0.0 (decreases popularity)

Model ready! Run Cell 3 for visualizations and Cell 4 to test on your Japanese song.


In [16]:
# CELL 4: Test Your Model on Your Japanese Song
# (Run this AFTER Cells 1, 2, and 3)

print("🎌 TESTING ON YOUR JAPANESE SONG")
print("=" * 40)

japanese_track_id = "1YrU8ExqF04ygegVoOOoFU"

try:
    # Get Japanese song data
    response = requests.get(f"{api_base}/track/{japanese_track_id}")
    japanese_data = response.json()
    
    track_info = japanese_data['track_info']
    audio_features = japanese_data['audio_features']
    
    print(f"Song: {track_info['name']}")
    print(f"Artist: {track_info['artist']}")
    print(f"Actual Spotify Popularity: {track_info['popularity']}")
    
    # Prepare features for prediction
    japanese_features = pd.DataFrame([audio_features])[feature_columns]
    
    # Make ML prediction
    ml_prediction = model.predict(japanese_features)[0]
    
    print(f"\n🧠 Your prediction: 40")
    print(f"🤖 ML prediction: {ml_prediction:.1f}")
    print(f"📊 Actual score: {track_info['popularity']}")
    
    # Calculate errors
    human_error = abs(track_info['popularity'] - 40)
    ml_error = abs(track_info['popularity'] - ml_prediction)
    
    print(f"\n📈 PREDICTION RESULTS:")
    print(f"Your error: {human_error:.1f} points")
    print(f"ML error: {ml_error:.1f} points")
    
    if ml_error < human_error:
        print("🏆 ML model wins!")
    elif human_error < ml_error:
        print("🏆 Human intuition wins!")
    else:
        print("🤝 It's a tie!")
    
    # Save the model
    import joblib
    import os
    os.makedirs('../ml_models', exist_ok=True)
    
    joblib.dump({
        'model': model,
        'features': feature_columns,
        'test_r2': test_r2,
        'test_mae': test_mae
    }, '../ml_models/popularity_predictor_v1.pkl')
    
    print(f"\n💾 Model saved to ml_models/popularity_predictor_v1.pkl")
    
except Exception as e:
    print(f"❌ Error testing Japanese song: {e}")

print(f"\n🎉 EXPERIMENT COMPLETE!")
print(f"You built and tested your first ML model!")
print(f"Model performance: R² = {test_r2:.3f}, MAE = {test_mae:.1f}")
print(f"Ready for more advanced ML experiments!")

🎌 TESTING ON YOUR JAPANESE SONG
Song: 中庭の少女たち
Artist: SHISHAMO
Actual Spotify Popularity: 50
❌ Error testing Japanese song: "None of [Index(['danceability', 'energy', 'speechiness', 'acousticness',\n       'instrumentalness', 'liveness', 'valence', 'tempo', 'loudness'],\n      dtype='object')] are in the [columns]"

🎉 EXPERIMENT COMPLETE!
You built and tested your first ML model!
Model performance: R² = -1.376, MAE = 7.6
Ready for more advanced ML experiments!
