# Baseline Model

For our baseline, we decided to predict daily screen time from age, gender, and sleep duration.

We decided to use two simple models as baselines: linear regression and a k-nearest neighbors (K-NN) regressor.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor # The second simple model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [6]:
# --- 1. Setup and Data Loading ---

# Path to the dataset CSV, resolved relative to the notebook's working directory.
FILE_PATH = 'human_cognitive_performance.csv'

# Size and seed of the synthetic stand-in that is used ONLY when the CSV is missing.
# A fixed seed keeps the fallback reproducible under Restart & Run All.
PLACEHOLDER_ROWS = 80_000
PLACEHOLDER_SEED = 42

try:
    df = pd.read_csv(FILE_PATH)
    print(f"Successfully loaded data from {FILE_PATH}.")
except FileNotFoundError:
    print(f"⚠️ WARNING: File '{FILE_PATH}' not found. Using a placeholder DataFrame for demonstration.")
    rng = np.random.default_rng(PLACEHOLDER_SEED)
    # The placeholder must expose every column the preprocessing cell selects
    # (features and target); otherwise the fallback path fails with a KeyError
    # as soon as those columns are indexed.
    # Value ranges are arbitrary, and the columns are drawn independently, so no
    # model can be expected to find real signal in this data.
    data = {
        'Age': rng.integers(18, 60, PLACEHOLDER_ROWS),
        'Gender': rng.choice(['Male', 'Female', 'Other'], PLACEHOLDER_ROWS),
        'Sleep_Duration': rng.uniform(5.0, 9.0, PLACEHOLDER_ROWS),
        'Caffeine_Intake': rng.uniform(0.0, 500.0, PLACEHOLDER_ROWS),
        'Stress_Level': rng.integers(1, 11, PLACEHOLDER_ROWS),
        'Memory_Test_Score': rng.uniform(40.0, 100.0, PLACEHOLDER_ROWS),
        'Daily_Screen_Time': rng.uniform(2.0, 10.0, PLACEHOLDER_ROWS)
    }
    df = pd.DataFrame(data)

Successfully loaded data from human_cognitive_performance.csv.


In [None]:
# --- 2. Feature and Target Selection & Preprocessing ---

TARGET = 'Daily_Screen_Time'
FEATURES = ['Sleep_Duration', "Caffeine_Intake", "Stress_Level", "Memory_Test_Score"]
# Columns that need one-hot encoding *if* they are listed in FEATURES.
CATEGORICAL_FEATURES = ['Gender']

# Handle missing data (Simple drop for demonstration)
df = df.dropna(subset=FEATURES + [TARGET])

# Select features (X) and target (y)
X = df[FEATURES].copy()
y = df[TARGET]

# One-hot encode only the categorical columns that are actually part of X.
# Passing a column that is absent from X to pd.get_dummies(columns=...) raises
# a KeyError, which is what happened when 'Gender' was hard-coded here while
# not being listed in FEATURES.
categorical_in_use = [col for col in CATEGORICAL_FEATURES if col in X.columns]
if categorical_in_use:
    X = pd.get_dummies(X, columns=categorical_in_use, drop_first=True, dtype=int)

# --- 3. Train/Test Split and Scaling ---

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize every feature column. The scaler is fitted on the training split
# only and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert scaled arrays back to DataFrames (optional, for result inspection).
# The original row index is kept so the frames stay aligned with y_train / y_test
# when they are joined or compared by label.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("Data preparation complete. Starting baseline training...")

Data preparation complete. Starting baseline training...


In [25]:
# --------------------------------------------------------------------------
# Baseline 1: Simple Linear Regression
# --------------------------------------------------------------------------

print("\n--- Baseline 1: Simple Linear Regression ---")

# Fit on the scaled training split (`fit` returns the estimator itself).
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the held-out test split.
y_pred_lr = lr_model.predict(X_test_scaled)

# Error and fit metrics on the test split.
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
# RMSE is taken as the square root of MSE via numpy instead of relying on
# a `squared` argument of the metric function.
rmse_lr = np.sqrt(mse_lr)

print("Evaluation Metrics:")
print(f"  Root Mean Squared Error (RMSE): {rmse_lr:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_lr:.4f}")
print(f"  R-squared (R2) Score: {r2_lr:.4f}")


--- Baseline 1: Simple Linear Regression ---
Evaluation Metrics:
  Root Mean Squared Error (RMSE): 3.1847
  Mean Absolute Error (MAE): 2.7548
  R-squared (R2) Score: -0.0002


In [26]:
# --------------------------------------------------------------------------
# Baseline 2: K-Nearest Neighbors Regressor
# --------------------------------------------------------------------------

# Untuned neighbor count, used as a common starting point.
K = 5
print(f"\n--- Baseline 2: K-Nearest Neighbors Regressor (K={K}) ---")

# Fit on the scaled training split (`fit` returns the estimator itself).
knn_model = KNeighborsRegressor(n_neighbors=K).fit(X_train_scaled, y_train)

# Predict on the held-out test split.
y_pred_knn = knn_model.predict(X_test_scaled)

# Error and fit metrics on the test split.
r2_knn = r2_score(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
# RMSE is taken as the square root of MSE via numpy.
rmse_knn = np.sqrt(mse_knn)

print("Evaluation Metrics:")
print(f"  Root Mean Squared Error (RMSE): {rmse_knn:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_knn:.4f}")
print(f"  R-squared (R2) Score: {r2_knn:.4f}")


--- Baseline 2: K-Nearest Neighbors Regressor (K=5) ---
Evaluation Metrics:
  Root Mean Squared Error (RMSE): 3.4738
  Mean Absolute Error (MAE): 2.9327
  R-squared (R2) Score: -0.1901


In [27]:
# --- Final Comparison ---
# Uses r2_lr / rmse_lr from the linear-regression baseline cell and
# r2_knn / rmse_knn from the K-NN baseline cell; run both cells first.
summary_lines = [
    "\n--- Summary of Baseline Performance (on Test Set) ---",
    f"Linear Regression R2: {r2_lr:.4f} | RMSE: {rmse_lr:.4f}",
    f"K-NN Regressor R2:    {r2_knn:.4f} | RMSE: {rmse_knn:.4f}",
]
# One line per entry, same output as three consecutive print() calls.
print(*summary_lines, sep="\n")


--- Summary of Baseline Performance (on Test Set) ---
Linear Regression R2: -0.0002 | RMSE: 3.1847
K-NN Regressor R2:    -0.1901 | RMSE: 3.4738
