In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import klib

from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV


from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from joblib import dump

from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

df=pd.read_csv(r"C:\Users\Saidabrorkhon\Downloads\Telegram Desktop\sleep_cycle_productivity.csv")

df.drop(columns=['Date', 'Person_ID'], inplace=True)


# %%
def categorize_sleep_quality(score):
    if score <= 2:
        return 'Poor'
    elif score <= 5:
        return 'Okay'
    elif score <= 7:
        return 'Good'
    else:
        return 'Excellent'


df['Sleep Quality Category'] = df['Sleep Quality'].apply(categorize_sleep_quality)
df['Sleep Quality'].value_counts()

# Encoding
label=LabelEncoder()
df['Gender']=label.fit_transform(df['Gender'])

encoder=OrdinalEncoder(categories=[['Poor','Okay','Good','Excellent']])
df[['Sleep Quality Category']] = encoder.fit_transform(df[['Sleep Quality Category']])


df['Sleep Quality Category'] = df['Sleep Quality Category'].astype(int)
df['Gender'] = df['Gender'].astype(int)

# Feature Engineering

# Nap Duration
df['Nap Duration'] = (8 - df['Total Sleep Hours']) * 10 + df['Stress Level'] * 5 - df['Mood Score'] * 3 - df['Work Hours (hrs/day)'] * 2
df['Nap Duration'] = np.maximum(df['Nap Duration'], 0)  

# Room Environment
df['Room Environment'] = df['Sleep Quality'] * 10 - df['Screen Time Before Bed (mins)'] * 0.3 - df['Caffeine Intake (mg)'] * 0.05
df['Room Environment'] = df['Room Environment'].clip(0, 100)  


x = df.drop(['Sleep Quality', 'Sleep Quality Category'], axis=1)
y = df['Sleep Quality Category']

x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

# DecisionTreeClassifier
dt_model=DecisionTreeClassifier(random_state=42)
dt_model.fit(x_train,y_train)
y_pred=dt_model.predict(x_test)
score1=accuracy_score(y_test,y_pred)
print(f'Decision Tree Classifier accuracy: {score1}')


lgb = LGBMClassifier()
lgb.fit(x_train, y_train)
y_pred = lgb.predict(x_test)
accuracy_score_lgb = lgb.score(x_test, y_test)
print("Accuracy Score: ", accuracy_score_lgb)
cm_lgb = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))


model = lgb
dump(model, 'sleep_quality_model.joblib')

Decision Tree Classifier accuracy: 0.844
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2098
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 14
[LightGBM] [Info] Start training from score -1.611941
[LightGBM] [Info] Start training from score -1.184988
[LightGBM] [Info] Start training from score -1.606941
[LightGBM] [Info] Start training from score -1.223326
Accuracy Score:  0.892
              precision    recall  f1-score   support

           0       0.74      0.76      0.75       102
           1       0.82      0.80      0.81       139
           2       0.98      0.98      0.98       103
           3       1.00      1.00      1.00       156

    accuracy                           0.89       500
   macro avg       0.88      0.89      0.89       500
weighted avg       0.89      0.89      0.89       50

['sleep_quality_model.joblib']

In [3]:
x_test.to_csv('x_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)