# Machine Learning Model

### 80% train, 20% test model for sleep quality

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv("../data/sleep_health_clean.csv") 

# Drop sleep quality --> target variable 
X = df.drop('quality_of_sleep', axis=1)

# Drop all non-numerical variables
X = pd.get_dummies(X, drop_first=True)

y = df['quality_of_sleep']

# Use 20% of the data for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest Regression

In [3]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))
results = pd.DataFrame({'Measured': y_test, 'Predicted': y_pred})
print(results.head())

MSE: 0.03286533333333334
R2: 0.9782150011784115
     Measured  Predicted
329         9        9.0
33          6        6.0
15          6        6.0
325         9        9.0
57          6        6.0


### Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred_lr))
print("R2:", r2_score(y_test, y_pred_lr))
results = pd.DataFrame({'Measured': y_test, 'Predicted': y_pred_lr})
print(results.head())

MSE: 0.09964700781968434
R2: 0.9339483362024836
     Measured  Predicted
329         9   9.039193
33          6   6.026811
15          6   5.901318
325         9   9.038202
57          6   6.001241


# Compare Models

In [5]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name, "MSE": mse, "R2": r2})

results_df = pd.DataFrame(results)
print(results_df)

               Model       MSE        R2
0      Random Forest  0.032545  0.978427
1  Gradient Boosting  0.015697  0.989595
2      Decision Tree  0.040000  0.973486
3  Linear Regression  0.099647  0.933948
