In [1]:
# Import necessary libraries
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from lazypredict.Supervised import LazyClassifier, LazyRegressor
import pickle

In [2]:
# Load data from CSV file
# Replace 'Sleep_health_and_lifestyle_dataset.csv' with the actual file path or URL
sleep_data = pd.read_csv("c:\\Users\\Gabrellea\\OneDrive\\Documents\\UNC Chapel Hill\\Project 4\Sleep_health_and_lifestyle_dataset_cleaned.csv")

In [3]:
#Create a dataframe
df = pd.DataFrame(sleep_data)
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,BP Systolic,BP Diastolic,Daily Steps,Sleep Disorder,Sleep Disorder Measure
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126,83,4200,,1
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,10000,,1
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,10000,,1
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,3000,Sleep Apnea,2
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140,90,3000,Sleep Apnea,2


In [4]:
#Print basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   BP Systolic              374 non-null    int64  
 10  BP Diastolic             374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           374 non-null    object 
 13  Sleep Disorder Measure   374 non-null    int64  
dtypes: float64(1), int64(9), o

In [5]:
# Categorize columns into numeric and categorical
categoric = ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder']
numeric = ['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Daily Steps', 'BP Systolic', 'BP Diastolic']

In [6]:
# Display summary statistics for numeric columns
print(df[numeric].describe().round(2))

         Age  Sleep Duration  Quality of Sleep  Physical Activity Level  \
count 374.00          374.00            374.00                   374.00   
mean   42.18            7.13              7.31                    59.17   
std     8.67            0.80              1.20                    20.83   
min    27.00            5.80              4.00                    30.00   
25%    35.25            6.40              6.00                    45.00   
50%    43.00            7.20              7.00                    60.00   
75%    50.00            7.80              8.00                    75.00   
max    59.00            8.50              9.00                    90.00   

       Stress Level  Daily Steps  BP Systolic  BP Diastolic  
count        374.00       374.00       374.00        374.00  
mean           5.39      6816.84       128.55         84.65  
std            1.77      1617.92         7.75          6.16  
min            3.00      3000.00       115.00         75.00  
25%           

In [14]:
# Handle missing values
print(df.isnull().sum().sum())


0


In [15]:
# Explore value counts for categorical columns
print(df['Gender'].value_counts())
print(df['Occupation'].value_counts())
print(df['BMI Category'].value_counts())


Male      189
Female    185
Name: Gender, dtype: int64
Nurse                   73
Doctor                  71
Engineer                63
Lawyer                  47
Teacher                 40
Accountant              37
Salesperson             32
Software Engineer        4
Scientist                4
Sales Representative     2
Manager                  1
Name: Occupation, dtype: int64
Normal        216
Overweight    148
Obese          10
Name: BMI Category, dtype: int64


In [16]:
# Modify 'BMI Category' values
df.loc[df['BMI Category'] == 'Normal Weight', 'BMI Category'] = 'Normal'
print(df['BMI Category'].value_counts())


Normal        216
Overweight    148
Obese          10
Name: BMI Category, dtype: int64


In [17]:
# Explore value counts for 'Sleep Disorder'
print(df['Sleep Disorder'].value_counts())

None           219
Sleep Apnea     78
Insomnia        77
Name: Sleep Disorder, dtype: int64


In [18]:
# Create a new column 'Sleep Disorder Measure' based on 'Sleep Disorder', 'None'=0, 'Sleep Apnea'=1, 'Insomnia'=2
# df['Sleep Disorder Measure'] = np.where(df['Sleep Disorder'] == 'No Disorder', 0, np.where(df['Sleep Disorder'] == 'Sleep Apnea', 1, 2))
# df['Sleep Disorder Measure'] = df['Sleep Disorder Measure'].astype(int)
# print(df['Sleep Disorder Measure'].value_counts())

df['Sleep Disorder Measure'] = np.where(df['Sleep Disorder'] == 'None', 1, np.where(df['Sleep Disorder'] == 'Sleep Apnea', 2, 3))
df['Sleep Disorder Measure'] = df['Sleep Disorder Measure'].astype(int)
print(df[['Sleep Disorder', 'Sleep Disorder Measure']].head(10))
print(df['Sleep Disorder Measure'].value_counts())

  Sleep Disorder  Sleep Disorder Measure
0           None                       1
1           None                       1
2           None                       1
3    Sleep Apnea                       2
4    Sleep Apnea                       2
5       Insomnia                       3
6       Insomnia                       3
7           None                       1
8           None                       1
9           None                       1
1    219
2     78
3     77
Name: Sleep Disorder Measure, dtype: int64


In [19]:
# Drop unnecessary columns
X = X = df.drop(['Person ID'], axis=1)
y = df[['Stress Level']]

In [20]:
# Label encode categorical columns and scale numeric columns
le = LabelEncoder()
scaler = StandardScaler()


X[categoric] = X[categoric].apply(lambda col: le.fit_transform(col))

X[numeric] = scaler.fit_transform(X[numeric])



In [25]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [26]:
# Lazy Predict using regression models
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:02<00:00, 17.96it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 299, number of used features: 13
[LightGBM] [Info] Start training from score 5.311037





In [27]:
# Display model performance metrics
print(models.sort_values(by=['RMSE', 'Time Taken']))

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
ExtraTreeRegressor                           1.00       1.00  0.00        0.01
DecisionTreeRegressor                        1.00       1.00  0.00        0.02
AdaBoostRegressor                            1.00       1.00  0.00        0.03
RANSACRegressor                              1.00       1.00  0.00        0.03
OrthogonalMatchingPursuit                    1.00       1.00  0.00        0.01
OrthogonalMatchingPursuitCV                  1.00       1.00  0.00        0.01
LinearRegression                             1.00       1.00  0.00        0.01
TransformedTargetRegressor                   1.00       1.00  0.00        0.03
LassoLarsIC                                  1.00       1.00  0.00        0.01
Lars                                         1.00       1.00  0.00        0.01
LassoLarsCV                                  1.00   

In [28]:
# Train a Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

In [29]:
# Make predictions using the Linear Regression model
y_pred = lr.predict(X_test)
print ("Predictions: \n", y_pred)




Predictions: 
 [[3.]
 [8.]
 [8.]
 [3.]
 [8.]
 [7.]
 [8.]
 [4.]
 [3.]
 [5.]
 [7.]
 [6.]
 [5.]
 [8.]
 [5.]
 [8.]
 [6.]
 [7.]
 [3.]
 [4.]
 [6.]
 [5.]
 [8.]
 [4.]
 [4.]
 [5.]
 [7.]
 [7.]
 [7.]
 [3.]
 [6.]
 [4.]
 [6.]
 [3.]
 [8.]
 [4.]
 [8.]
 [6.]
 [5.]
 [5.]
 [7.]
 [8.]
 [7.]
 [7.]
 [5.]
 [4.]
 [4.]
 [6.]
 [8.]
 [6.]
 [3.]
 [5.]
 [8.]
 [5.]
 [6.]
 [3.]
 [5.]
 [8.]
 [3.]
 [8.]
 [5.]
 [7.]
 [8.]
 [6.]
 [4.]
 [8.]
 [7.]
 [3.]
 [5.]
 [5.]
 [4.]
 [7.]
 [3.]
 [8.]
 [3.]]


In [30]:
# Evaluate the Linear Regression model
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"""
MAPE: {mape:.2f}
RMSE: {rmse:.2f}
R2: {r2:.2f}
""")


MAPE: 0.00
RMSE: 0.00
R2: 1.00



In [31]:
# Save the trained Linear Regression model
pickle.dump(lr, open('c:\\Users\\Gabrellea\\OneDrive\\Documents\\UNC Chapel Hill\\Project 4\\model.pkl', 'wb'))
df.to_csv('c:\\Users\\Gabrellea\\OneDrive\\Documents\\UNC Chapel Hill\\Project 4\\Sleep_health_and_lifestyle_dataset_cleaned.csv', index=False)
