In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv('Sleep_health_and_lifestyle_dataset2.csv')



# Fill missing values in 'Sleep Disorder' with 'No Disorder'
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('No Disorder')


ode = OrdinalEncoder()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # Handle unknowns
scaler = StandardScaler()
df

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,No Disorder
1,2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder
2,3,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,No Disorder
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,68,7000,Sleep Apnea


In [2]:
#[Gender, Age, Occupation, Sleep Duration, BMI, Stress Level, BMI Category, Heart Rate, Daily Steps, Physical Activity Level, Sleep Disorder]
ct = make_column_transformer(
    (ohe, ['Gender', 'Sleep Disorder']),
    (ode, ['Occupation', 'BMI Category']),
    (scaler, ['Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps']),
    remainder="passthrough"
)
ct.set_output(transform="pandas")

pipeline = Pipeline(steps=[
    ('column_transformer', ct),
    ('linear_regression', LinearRegression())
])


In [3]:
X = df.drop(columns=['Quality of Sleep', 'Person ID'])
Y = df['Quality of Sleep']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42, stratify=Y)


In [4]:

pipeline

In [5]:



# Fit the pipeline with the training data
pipeline.fit(X_train, Y_train)

# Access the fitted ColumnTransformer and transform X_train
X_train_transformed =ct.transform(X_train)

# Now you can use X_train_transformed as needed
print(X_train_transformed)


Y_pred = pipeline.predict(X_test)
print(Y_pred)
print(pipeline.score(X_train,Y_train))
print(pipeline.score(X_test,Y_test))


     onehotencoder__Gender_Female  onehotencoder__Gender_Male  \
194                           0.0                         1.0   
160                           0.0                         1.0   
20                            0.0                         1.0   
348                           1.0                         0.0   
292                           1.0                         0.0   
..                            ...                         ...   
222                           0.0                         1.0   
152                           0.0                         1.0   
313                           1.0                         0.0   
245                           1.0                         0.0   
331                           1.0                         0.0   

     onehotencoder__Sleep Disorder_Insomnia  \
194                                     1.0   
160                                     0.0   
20                                      0.0   
348                            

In [6]:
r2 = r2_score(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)

print("R^2 score:", r2)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)


R^2 score: 0.9632510694144412
Mean Squared Error (MSE): 0.05111531251580661
Mean Absolute Error (MAE): 0.16192526190825868


In [10]:
import numpy as np
import pandas as pd

# Define the input data
input_data = np.array([[
    'Male',        # Gender
    19,            # Age
    'Doctor',     # Occupation
    6.5,           # Sleep Duration
    80,            # Physical Activity Level
    3,             # Stress Level
    'Normal',      # BMI Category
    70,            # Heart Rate
    15000,          # Daily Steps
    'None'         # Sleep Disorder
]])

# Convert the input data to a DataFrame with appropriate column names
input_df = pd.DataFrame(input_data, columns=[
    'Gender', 'Age', 'Occupation', 'Sleep Duration',
    'Physical Activity Level', 'Stress Level', 'BMI Category',
    'Heart Rate', 'Daily Steps', 'Sleep Disorder'
])

# Print the input DataFrame to ensure it looks correct
print("Input DataFrame:")
print(input_df)

# Apply the column transformer and make predictions using the pipeline
predicted_quality_of_sleep = pipeline.predict(input_df)

# Print the predicted quality of sleep
print("Predicted Quality of Sleep:", predicted_quality_of_sleep)


Input DataFrame:
  Gender Age Occupation Sleep Duration Physical Activity Level Stress Level  \
0   Male  19     Doctor            6.5                      80            3   

  BMI Category Heart Rate Daily Steps Sleep Disorder  
0       Normal         70       15000           None  
Predicted Quality of Sleep: [8.30573109]
