In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

# Read the dataset CSV (path is relative to the working directory) into a DataFrame
csv_path = 'social_media_data.csv'
df = pd.read_csv(csv_path)

In [2]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0,Age,Gender,Country,SocialMediaTime,EntertainmentTime,PlatformsUsed,PrimaryPlatform,MessagingTime,VideoTime,GamingTime,...,FatigueLevel,NewsTime,AdInteraction,LearningTime,TechSavviness,PreferredDevice,DigitalWellbeing,SleepQuality,SocialIsolation,EntertainmentSpending
0,32,Other,Germany,4.35,4.08,5,TikTok,0.35,5.43,4.68,...,2,2.82,20,4.11,9,Tablet,Moderate,7,8,33.04
1,62,Other,India,4.96,4.21,2,YouTube,2.55,4.22,0.41,...,5,0.32,26,4.59,9,PC,Low,8,2,497.78
2,51,Female,USA,6.78,1.77,4,Facebook,2.09,1.09,4.38,...,3,1.25,47,0.66,9,Tablet,High,5,3,71.72
3,44,Female,India,5.06,9.21,3,YouTube,3.69,4.8,4.82,...,1,2.65,22,3.44,7,Tablet,Low,9,9,129.62
4,21,Other,Germany,2.57,1.3,4,TikTok,3.97,2.74,4.4,...,4,1.44,42,4.14,7,Smart TV,Low,5,9,35.9


In [3]:
# (rows, columns) of the loaded frame
df.shape

(300000, 34)

In [4]:
# Partition the column names by dtype: object (string) columns are treated as
# categorical, 64-bit integer/float columns as numerical.
object_columns = df.select_dtypes(include=['object']).columns
number_columns = df.select_dtypes(include=['int64', 'float64']).columns

categorical_cols = list(object_columns)
numerical_cols = list(number_columns)

In [5]:
# The target must not be fed to the model as an input feature.
# list.remove raises ValueError when the name is absent; that case is a no-op here.
try:
    numerical_cols.remove('FatigueLevel')
except ValueError:
    pass

In [6]:
# Separate the target column from the feature columns
y = df['FatigueLevel']
X = df.drop(columns=['FatigueLevel'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)


In [7]:
# Preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets the encoder accept categories at
# predict time that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


## 2. Random Forest Model

In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Define model pipeline.
# Pipeline does not copy its steps, so passing `preprocessor` itself would make every
# pipeline built from it hold (and refit) the very same ColumnTransformer object.
# clone() gives this pipeline its own unfitted copy with identical settings.
rf_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    # n_jobs=-1 builds the trees on all CPU cores; with a fixed random_state the
    # fitted forest is the same as the single-core one, it just trains faster.
    ('model', RandomForestRegressor(random_state=123, n_jobs=-1))
])

# Train on the training split, then score on the held-out test split
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
print(f"Random Forest MAE: {rf_mae:.4f}")

## 3. Gradient Boosting Model

In [None]:
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor

# Pipeline does not copy its steps, so passing `preprocessor` itself would make every
# pipeline built from it hold (and refit) the very same ColumnTransformer object.
# clone() gives this pipeline its own unfitted copy with identical settings.
gb_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', GradientBoostingRegressor(random_state=123))
])

# Train on the training split, then score on the held-out test split
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_pred)
print(f"Gradient Boosting MAE: {gb_mae:.4f}")


## 4. Linear Regression Model

In [None]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Pipeline does not copy its steps, so passing `preprocessor` itself would make every
# pipeline built from it hold (and refit) the very same ColumnTransformer object.
# clone() gives this pipeline its own unfitted copy with identical settings.
lr_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', LinearRegression())
])

# Train on the training split, then score on the held-out test split
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, lr_pred)
print(f"Linear Regression MAE: {lr_mae:.4f}")


## 5. Support Vector Regression (SVR)

In [None]:
from sklearn.base import clone
from sklearn.svm import SVR

# Kernel SVR fit time grows faster than quadratically with the number of rows, so
# fitting it on the full training split (hundreds of thousands of rows) is not
# practical. Cap the rows it sees; smaller training sets are used unchanged.
SVR_MAX_TRAIN_SAMPLES = 20_000

# Pipeline does not copy its steps, so passing `preprocessor` itself would make every
# pipeline built from it hold (and refit) the very same ColumnTransformer object.
# clone() gives this pipeline its own unfitted copy with identical settings.
svr_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', SVR())
])

if len(X_train) > SVR_MAX_TRAIN_SAMPLES:
    # Seeded positional sample so features and target stay aligned and the
    # subset is reproducible across runs.
    svr_rows = np.random.RandomState(123).choice(
        len(X_train), size=SVR_MAX_TRAIN_SAMPLES, replace=False
    )
    X_train_svr, y_train_svr = X_train.iloc[svr_rows], y_train.iloc[svr_rows]
else:
    X_train_svr, y_train_svr = X_train, y_train

# NOTE: when the cap applies, this MAE comes from a model trained on fewer rows
# than the other candidates; it is still scored on the same full test split.
svr_model.fit(X_train_svr, y_train_svr)
svr_pred = svr_model.predict(X_test)
svr_mae = mean_absolute_error(y_test, svr_pred)
print(f"SVR MAE: {svr_mae:.4f}")


## Compare Model Performance

In [None]:
# Collect each candidate's test-set MAE under its display name
mae_scores = dict([
    ("Random Forest", rf_mae),
    ("Gradient Boosting", gb_mae),
    ("Linear Regression", lr_mae),
    ("SVR", svr_mae),
])

# Report every score, one line per model
for model_name in mae_scores:
    mae = mae_scores[model_name]
    print(f"{model_name}: MAE = {mae:.4f}")

# Lower MAE is better; on a tie the first entry in insertion order wins.
# NOTE(review): the winner is picked on the same test split its MAE is reported on,
# so the printed best score is an optimistic estimate.
best_model_name, best_mae = min(mae_scores.items(), key=lambda item: item[1])
print(f"\n✅ Best Model: {best_model_name} with MAE = {best_mae:.4f}")


## Select the Best Model

In [None]:
# Map the winning display name back to its fitted pipeline.
# Any name not listed here falls through to the SVR pipeline.
model_by_name = {
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
    "Linear Regression": lr_model,
}
best_model = model_by_name.get(best_model_name, svr_model)


## Save the Best Model for Future Use

In [None]:
import joblib

# Persist the selected pipeline (preprocessing + estimator) to disk.
# joblib files are pickle-based: only load them back from sources you trust.
model_path = 'best_fatigue_model.pkl'
joblib.dump(best_model, model_path)
print("✅ Best model saved as 'best_fatigue_model.pkl'")


## Make Predictions with New Data

# Stand-in for unseen data: the first five test rows (swap in real data here)
new_data = X_test.head(5)

# Run the selected pipeline on those rows and show the raw predictions
new_predictions = best_model.predict(new_data)
print("\nPredictions for New Data:")
print(new_predictions)


## Visualize Results 


import matplotlib.pyplot as plt

# Compare actual vs predicted fatigue levels for the first few test samples
n_plot = 20
plot_actual = y_test.values[:n_plot]
# Predict only the rows being drawn instead of running inference on the whole test set
plot_predicted = best_model.predict(X_test.iloc[:n_plot])

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(plot_actual, label='Actual Fatigue Level', marker='o')
ax.plot(plot_predicted, label='Predicted Fatigue Level', marker='x')
ax.set_title('Actual vs Predicted Fatigue Level')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Fatigue Level')
ax.legend()
ax.grid()
plt.show()
