In [13]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor, StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

In [5]:
# Folder that holds the monthly CSV exports. This is the only place the
# machine-specific location appears; edit it to run the notebook elsewhere.
DATA_DIR = Path("/Users/Steven/Documents/Documents - MacBook Pro/Cal Poly/MSBA/Spring 2025/GSB-545/GSB-545 Repository/Lab-1/data")

# Month label -> CSV file for that month (file names kept exactly as exported).
files = {
    "November": DATA_DIR / "November Sleep Data - Sheet1.csv",
    "December": DATA_DIR / "December Sleep data - Sheet1.csv",
    "January": DATA_DIR / "January sleep data - Sheet1.csv",
    "February": DATA_DIR / "February sleep data - Sheet1 (1).csv",
    "March": DATA_DIR / "March sleep data - Sheet1.csv",
    "April": DATA_DIR / "April sleep data - Sheet1.csv",
}

# Columns stored as text such as "20%" that must become plain floats.
PERCENT_COLUMNS = ['rem_sleep', 'deep_sleep', 'heart_rate_below_resting']

# Load and tag each dataset with its month
monthly_dataframes = []
for month, path in files.items():
    month_df = pd.read_csv(path)
    month_df['month'] = month
    monthly_dataframes.append(month_df)

# Combine all months into one DataFrame
sleep_data = pd.concat(monthly_dataframes, ignore_index=True)

# Strip whitespace and standardize column names
sleep_data.columns = (
    sleep_data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# Remove the '%' symbol and convert to float
for percent_col in PERCENT_COLUMNS:
    # regex=False: '%' is a literal character, not a pattern.
    sleep_data[percent_col] = (
        sleep_data[percent_col].str.replace('%', '', regex=False).astype(float)
    )

# Discard the first row of the combined frame and renumber from 0.
# NOTE(review): presumably that row is a non-data line at the top of the
# November sheet -- confirm against the CSV.
sleep_data = sleep_data.iloc[1:].reset_index(drop=True)

sleep_data.head()


Unnamed: 0,november,date,sleep_score,hours_of_sleep,rem_sleep,deep_sleep,heart_rate_below_resting,sleep_time,month,december,january,febeuary,march,april
0,Monday,11/1/21,88.0,8:06:00,20.0,13.0,84.0,10:41pm - 7:54am,November,,,,,
1,Tuesday,11/2/21,83.0,7:57:00,12.0,18.0,90.0,10:40pm - 7:55am,November,,,,,
2,Wednesday,11/3/21,81.0,7:06:00,13.0,22.0,93.0,11:03pm - 7:16am,November,,,,,
3,Thursday,11/4/21,86.0,7:04:00,19.0,17.0,97.0,10:55pm - 6:56am,November,,,,,
4,Friday,11/5/21,81.0,9:24:00,17.0,15.0,66.0,10:14pm - 9:01am,November,,,,,


In [7]:
# Function to convert 'HH:MM:SS' to float hours
def convert_to_hours(time_str):
    """Convert an 'H:MM:SS' duration string to decimal hours.

    Parameters
    ----------
    time_str : str, number or None
        Duration text such as '8:06:00'. A real number is treated as an
        already-converted value and returned as a float, so running the
        conversion a second time does not turn the column into NaN.

    Returns
    -------
    float
        Duration in hours, or NaN when the value is missing or is not
        three colon-separated integers.
    """
    # Numeric input (including a NaN placeholder) is passed straight through.
    is_number = isinstance(time_str, (int, float, np.integer, np.floating))
    if is_number and not isinstance(time_str, bool):
        return float(time_str)

    try:
        h, m, s = map(int, time_str.split(':'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a text string (None, bool, bytes, ...).
        # ValueError: wrong number of fields, or a field that is not an integer.
        return np.nan  # Return NaN if format is invalid or missing
    return h + m/60 + s/3600

# Swap the duration text in 'hours_of_sleep' for the value convert_to_hours
# returns for each entry; the column keeps its position in the frame.
sleep_data = sleep_data.assign(
    hours_of_sleep=sleep_data['hours_of_sleep'].apply(convert_to_hours)
)


In [6]:
# Remove unnecessary variables
# These are the month-named columns visible in the combined frame's header
# ('febeuary' is spelled the way the column appears in the loaded data).
columns_to_drop = [
    'november', 'december', 'january',
    'febeuary', 'march', 'april',
]
# errors='ignore' lets the cell run even if some of the columns are already gone.
sleep_data = sleep_data.drop(columns_to_drop, axis=1, errors='ignore')

In [8]:
# Keep only the rows that have a value in every column, then display the frame.
sleep_data = sleep_data.dropna(axis=0, how='any')

sleep_data

Unnamed: 0,date,sleep_score,hours_of_sleep,rem_sleep,deep_sleep,heart_rate_below_resting,sleep_time,month
0,11/1/21,88.0,8.100000,20.0,13.0,84.0,10:41pm - 7:54am,November
1,11/2/21,83.0,7.950000,12.0,18.0,90.0,10:40pm - 7:55am,November
2,11/3/21,81.0,7.100000,13.0,22.0,93.0,11:03pm - 7:16am,November
3,11/4/21,86.0,7.066667,19.0,17.0,97.0,10:55pm - 6:56am,November
4,11/5/21,81.0,9.400000,17.0,15.0,66.0,10:14pm - 9:01am,November
...,...,...,...,...,...,...,...,...
178,04/26/2022,85.0,7.300000,22.0,14.0,100.0,9:32pm - 6:00am,April
179,04/27/2022,90.0,7.566667,24.0,19.0,98.0,9:19pm - 5:49am,April
180,04/28/2022,87.0,6.900000,21.0,22.0,90.0,10:02pm - 5:46am,April
181,04/29/2022,86.0,7.750000,19.0,17.0,95.0,10:15pm - 7:24am,April


In [9]:
# Sanity check after cleaning: row count, non-null counts and dtype per column.
sleep_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 0 to 182
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      179 non-null    object 
 1   sleep_score               179 non-null    float64
 2   hours_of_sleep            179 non-null    float64
 3   rem_sleep                 179 non-null    float64
 4   deep_sleep                179 non-null    float64
 5   heart_rate_below_resting  179 non-null    float64
 6   sleep_time                179 non-null    object 
 7   month                     179 non-null    object 
dtypes: float64(5), object(3)
memory usage: 12.6+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 0 to 182
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      179 non-null    object 
 1   sleep_score               179

In [37]:
# Bagging Model

# Predictors are the four numeric sleep measurements; the target is the sleep score.
X = sleep_data.drop(['date','sleep_score','sleep_time','month'], axis=1)
y = sleep_data['sleep_score']

# Hold out 20% of the nights for evaluation; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 100 bootstrap-sampled decision trees, averaged.
bagging_model = BaggingRegressor(estimator=DecisionTreeRegressor(),n_estimators=100,random_state=42,n_jobs=-1)

bagging_model.fit(X_train, y_train)

y_pred_bagging = bagging_model.predict(X_test)

r2_bag = r2_score(y_test, y_pred_bagging)
mae_bag = mean_absolute_error(y_test, y_pred_bagging)
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_bag = np.sqrt(mean_squared_error(y_test, y_pred_bagging))

print(f"R-squared Score: {r2_bag:.3f}")
print(f"MAE: {mae_bag:.2f}")
print(f"RMSE: {rmse_bag:.2f}")


R-squared Score: 0.621
MAE: 1.60
RMSE: 2.32
R-squared Score: 0.621
MAE: 1.60
RMSE: 2.32


In [39]:
# Stacking Model

# Predictors are the four numeric sleep measurements; the target is the sleep score.
X = sleep_data.drop(['date','sleep_score','sleep_time','month'], axis=1)
y = sleep_data['sleep_score']

# Same 80/20 split and seed as the other model cells so the metrics are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The tree ensembles are seeded so the reported metrics are the same on every run.
# NOTE(review): KNN is distance-based and the features are on different scales;
# consider make_pipeline(StandardScaler(), KNeighborsRegressor(...)) -- not
# changed here because it would alter the model being compared.
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('knn', KNeighborsRegressor(n_neighbors=5))
]

# Stacking Regressor with Ridge
# passthrough=True feeds the original features to the final estimator
# alongside the base-model predictions.
stacking_model = StackingRegressor(estimators=base_models, final_estimator=RidgeCV(), passthrough=True)

stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R-squared Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

R-squared Score: 0.541
MAE: 2.20
RMSE: 2.56
R-squared Score: 0.541
MAE: 2.20
RMSE: 2.56


In [38]:
# Linear Regression
# Predictors are the four numeric sleep measurements; the target is the sleep score.
X = sleep_data.drop(['date','sleep_score','sleep_time','month'], axis=1)
y = sleep_data['sleep_score']

# Same 80/20 split and seed as the other model cells so the metrics are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R-squared Score: {r2:.3f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")


R-squared Score: 0.581
MAE: 2.07
RMSE: 2.44
R-squared Score: 0.581
MAE: 2.07
RMSE: 2.44


The bagging model performs the best on this problem. It had the highest R-squared and the lowest MAE, which means it predicts the best out of these three models. I also created a linear regression to predict sleep score. It performed a little better than my stacking model, but not as well as the bagging model. 

In [44]:
# Bagging Model with Months

# One-hot encode the month label; drop_first=True leaves out one month as the
# baseline category.
df = pd.get_dummies(sleep_data, columns=['month'], drop_first=True)

# Predictors are the numeric sleep measurements plus the month indicators.
X = df.drop(['date','sleep_score','sleep_time'], axis=1)
y = df['sleep_score']

# Same 80/20 split and seed as the other model cells so the metrics are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

bagging_model = BaggingRegressor(estimator=DecisionTreeRegressor(),n_estimators=100,random_state=42,n_jobs=-1)

bagging_model.fit(X_train, y_train)

y_pred_bagging = bagging_model.predict(X_test)

r2_bag = r2_score(y_test, y_pred_bagging)
mae_bag = mean_absolute_error(y_test, y_pred_bagging)
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_bag = np.sqrt(mean_squared_error(y_test, y_pred_bagging))

print(f"R-squared Score: {r2_bag:.3f}")
print(f"MAE: {mae_bag:.2f}")
print(f"RMSE: {rmse_bag:.2f}")

R-squared Score: 0.556
MAE: 1.85
RMSE: 2.51
R-squared Score: 0.556
MAE: 1.85
RMSE: 2.51


Including the month variable as a predictor made the bagging model worse: R-squared fell from 0.621 to 0.556, and MAE rose from 1.60 to 1.85. 