# Baseline Model

For our baseline, we predict average sleep duration (`Avg_Sleep`, in hours) from age, gender, and how often the respondent reports feeling depressed (`Feeling_Depressed`).

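We decided to use an ordinary least-squares linear regression (scikit-learn's `LinearRegression`), trained on 80% of the rows and evaluated on the remaining 20% using MAE, MSE, and R².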

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [30]:
# 1. Load the Data
df = pd.read_csv('data/data.csv')

# 2. Prepare the Data
# The baseline regresses Avg_Sleep (target) on Age, Gender and
# Feeling_Depressed (features); every other column in the file is ignored.
feature_cols = ['Age', 'Gender', 'Feeling_Depressed']
target_col = 'Avg_Sleep'

# Keep only complete cases: a row missing any of these four values is dropped.
# dropna() returns a new frame, so the raw `df` is left untouched.
model_df = df[feature_cols + [target_col]].dropna()

# 3. Encode Categorical Variables
# Text-valued columns (e.g. "Male", "Not at all") are expanded into 0/1
# indicator columns. drop_first=True omits one level per categorical column,
# so each remaining indicator is measured against that omitted level.
X = pd.get_dummies(model_df[feature_cols], drop_first=True)
y = model_df[target_col]

# 4. Split the Data
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=36
)

# 5. Initialize and Train the Baseline Model
# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)

# 6. Make Predictions on the Test Set
y_pred = model.predict(X_test)

# 7. Evaluate Performance (all metrics are computed on the held-out rows)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# 8. Display Results
print("--- Baseline Model Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f} hours")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

print("\n--- Coefficients (Impact of each feature) ---")
# One row per encoded feature with its fitted coefficient, listed from the
# most positive to the most negative association with Avg_Sleep.
coef_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coeff': model.coef_,
    }
)
print(coef_df.sort_values(by='Coeff', ascending=False))

--- Baseline Model Results ---
Mean Absolute Error (MAE): 1.0673 hours
Mean Squared Error (MSE): 2.1493
R-squared (R2): 0.0043

--- Coefficients (Impact of each feature) ---
                              Feature     Coeff
3        Feeling_Depressed_Not at all  0.123855
4      Feeling_Depressed_Several days  0.034699
0                                 Age -0.003292
1                         Gender_Male -0.156898
2  Feeling_Depressed_Nearly every day -0.164056
