# In [None]:
# binge_predictor.py
# 🎬 BingeCast: Predict Your Next Netflix Marathon

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

print("\n[Step 1]: Starting BingeCast Project ðŸš€\n")

# ---------------------------
# Step 2: Generate Synthetic Data
# ---------------------------
# Seed NumPy's global RNG so every run draws the same dataset.
np.random.seed(42)

n_samples = 1000

# NOTE: the order of the RNG calls below (ages, moods, genres, times, noise,
# base count) determines the generated values; do not reorder them.
ages = np.random.randint(15, 60, n_samples)
moods = np.random.choice(['Happy', 'Sad', 'Bored', 'Stressed', 'Excited'], n_samples)
genres = np.random.choice(['Comedy', 'Drama', 'Action', 'Sci-Fi', 'Romance'], n_samples)
times = np.random.choice(['Morning', 'Afternoon', 'Evening', 'Late Night'], n_samples)

# Target variable (episodes watched) is the sum of five additive terms:
#   age / 50, Gaussian noise (sd 0.5), +2 when bored, +1 for evening or
#   late-night viewing, and a random integer base of 1-3 episodes.
age_term = ages / 50
noise_term = np.random.normal(0, 0.5, n_samples)
bored_bonus = np.where(moods == 'Bored', 2, 0)
late_bonus = np.where(np.isin(times, ['Evening', 'Late Night']), 1, 0)
base_episodes = np.random.randint(1, 4, n_samples)

# Summed left to right in the same order as the terms were drawn.
episodes = age_term + noise_term + bored_bonus + late_bonus + base_episodes

# Assemble the dataset; the target is rounded to one decimal place.
data = pd.DataFrame({
    'Age': ages,
    'Mood': moods,
    'Preferred_Genre': genres,
    'Time_of_Day': times,
    'Episodes_Watched': episodes.round(1),
})

print("[Step 2]: Synthetic dataset created âœ…\n")
print(data.head())

# ---------------------------
# Step 3: Exploratory Data Analysis (EDA)
# ---------------------------
# Draws two figures from the `data` DataFrame, one after the other:
#   1. a histogram (with KDE overlay) of the target column, and
#   2. a box plot of the target grouped by mood.
print("\n[Step 3]: Generating EDA plots ðŸ“Š (close the plots to continue)\n")

# Distribution of the target. The DataFrame and the column name are passed
# explicitly; the earlier call handed seaborn the list literal
# ['Episodes_Binged'], so the dataset itself was never plotted (and no column
# of that name exists -- the target column is 'Episodes_Watched').
plt.figure()
sns.histplot(data=data, x='Episodes_Watched', bins=20, kde=True)
plt.title('Distribution of Episodes Watched')
plt.xlabel('Episodes')
plt.ylabel('Frequency')
plt.show()

# Spread of the target for each mood category.
plt.figure()
sns.boxplot(x='Mood', y='Episodes_Watched', data=data)
plt.title('Mood vs Episodes Watched')
plt.show()

# ---------------------------
# Step 4: Data Preprocessing
# ---------------------------
print("\n[Step 4]: Preprocessing data ðŸ”„\n")

# Column groups: categoricals get one-hot encoded, numerics pass through.
categorical_features = ['Mood', 'Preferred_Genre', 'Time_of_Day']
numeric_features = ['Age']

# Feature matrix (Age first, then the categorical columns) and target.
X = data[numeric_features + categorical_features]
y = data['Episodes_Watched']

# handle_unknown='ignore' makes the encoder tolerate categories at predict
# time that were not present when it was fitted.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('cat', one_hot_encoder, categorical_features),
    ('num', 'passthrough', numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------
# Step 5: Train Models
# ---------------------------
print("\n[Step 5]: Training models ðŸ¤–\n")

# Candidate regressors, keyed by display name. Insertion order matters:
# it is the order in which models are trained and reported.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Maps model name -> {'R2': ..., 'RMSE': ..., 'Model': fitted pipeline}.
results = {}

for model_name, estimator in models.items():
    # NOTE: every pipeline wraps the same `preprocessor` object, so it is
    # refit (on the same training data) each time through the loop.
    fitted_pipeline = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', estimator),
        ]
    ).fit(X_train, y_train)

    # Score on the held-out test split.
    test_preds = fitted_pipeline.predict(X_test)
    r2_value = r2_score(y_test, test_preds)
    mse_value = mean_squared_error(y_test, test_preds)
    rmse_value = np.sqrt(mse_value)

    results[model_name] = {
        'R2': r2_value,
        'RMSE': rmse_value,
        'Model': fitted_pipeline,
    }

    print(f"{model_name} â†’ RÂ²: {r2_value:.3f} | RMSE: {rmse_value:.3f}")

# ---------------------------
# Step 6: Evaluate Models
# ---------------------------
print("\n[Step 6]: Evaluating models ðŸ§®\n")

# Choose the entry of `results` with the largest 'R2' value. On a tie, max()
# keeps the first entry in the dict's insertion order.
best_model_name, best_entry = max(
    results.items(),
    key=lambda item: item[1]['R2'],
)
best_model = best_entry['Model']
print(f"âœ… Best Model: {best_model_name}\n")

# ---------------------------
# Step 7: Visualize Predictions and Feature Importances
# ---------------------------
print("[Step 7]: Generating final plots ðŸ“ˆ (close the plots to finish)\n")

# Predictions of the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

# Prediction vs Actual
plt.figure()
sns.scatterplot(x=y_test, y=y_pred)
plt.title(f"{best_model_name}: Actual vs Predicted")
plt.xlabel("Actual Episodes Watched")
plt.ylabel("Predicted Episodes Watched")
plt.show()

# Feature Importance (only for Random Forest)
if best_model_name == 'Random Forest':
    model = best_model.named_steps['model']

    # Rebuild column names in the ColumnTransformer's output order: the
    # one-hot columns from the fitted 'cat' encoder first, then the
    # passthrough numeric columns.
    fitted_encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
    encoded_names = fitted_encoder.get_feature_names_out(categorical_features).tolist()
    feature_names = encoded_names + numeric_features

    importances = model.feature_importances_
    feat_importances = pd.Series(importances, index=feature_names)
    feat_importances = feat_importances.sort_values(ascending=False)

    # Horizontal bars, most important feature at the top.
    plt.figure(figsize=(8, 5))
    sns.barplot(x=feat_importances, y=feat_importances.index)
    plt.title("Feature Importance")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()

print("\nðŸŽ‰ [End]: BingeCast script complete! Close the final plot window to finish.\n")
