In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Train several regressors to predict FatigueLevel from age, usage times and
# primary platform, keep the one with the highest R² on the held-out split,
# persist it with joblib, and run a single sanity-check prediction.

# Load Data
df = pd.read_csv('social_media_data.csv')

# Select only relevant columns for modeling.
# NOTE: despite its name, `input_features` also contains the target column
# ('FatigueLevel'); the target is separated from the inputs further below.
input_features = ['Age', 'SocialMediaTime', 'ScreenTime', 'PrimaryPlatform', 'FatigueLevel']
df = df[input_features]

# Drop every row that has a missing value in any of the selected columns
df = df.dropna()

# Define Features and Target
numerical_features = ['Age', 'SocialMediaTime', 'ScreenTime']
categorical_features = ['PrimaryPlatform']
target = 'FatigueLevel'

# Preprocessing template: standardize the numeric columns and one-hot encode
# the platform. This object is only a template; each pipeline below gets its
# own clone, so `preprocessor` itself stays unfitted. Use
# best_model.named_steps['preprocessor'] to reach the fitted transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': platforms not seen during fit do not raise
        # at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Candidate regressors; random_state is fixed where the estimator is
# randomized so that repeated runs give the same scores.
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
}

# Split data: 80% train / 20% held-out, with a fixed seed
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_model = None
best_r2 = -np.inf

# Train and compare models.
# NOTE: the winner is chosen on the same held-out split that is reported, so
# the printed score of the best model is an optimistic estimate.
for model_name, model in models.items():
    # Pipeline does not copy its steps. Without clone() all three pipelines
    # (and therefore the pickled best model) would share one transformer
    # object that every later fit silently refits.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', model),
    ])

    # Train model
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = pipeline.predict(X_test)
    current_r2 = r2_score(y_test, y_pred)
    current_mse = mean_squared_error(y_test, y_pred)

    print(f"\n{model_name} Performance:")
    print(f"R² Score: {current_r2:.4f}")
    print(f"MSE: {current_mse:.4f}")

    # Update best model (strict '>' keeps the earlier model on ties; a NaN
    # score never compares greater, so such a model is never selected)
    if current_r2 > best_r2:
        best_r2 = current_r2
        best_model = pipeline

# Fail loudly instead of pickling None when no model could be ranked
# (e.g. every R² was NaN because the held-out split was too small).
if best_model is None:
    raise RuntimeError("No model produced a comparable R² score; nothing to save.")

# Save best model (the whole pipeline, preprocessing included)
joblib.dump(best_model, 'fatigue_model.pkl')
print(f"\nBest model: {type(best_model.named_steps['regressor']).__name__}")

# Example prediction (for verification): a single row holding the same input
# columns the pipeline was trained on, without the target.
test_sample = pd.DataFrame([{
    'Age': 52,
    'SocialMediaTime': 12.35,
    'ScreenTime': 20.33,
    'PrimaryPlatform': 'Instagram',
}])
prediction = best_model.predict(test_sample)
print(f"\nSample Prediction: {prediction[0]:.2f}")


RandomForest Performance:
R² Score: -0.0862
MSE: 7.2487

GradientBoosting Performance:
R² Score: -0.0007
MSE: 6.6778

LinearRegression Performance:
R² Score: -0.0000
MSE: 6.6736

Best model: LinearRegression

Sample Prediction: 4.96
