In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the raw dataset. read_csv already returns a DataFrame, so `csv` is kept
# as the untouched raw frame and `data` is an explicit, independent working
# copy: the column edits made to `data` in later cells cannot reach `csv`.
csv = pd.read_csv('../Final_Data/Resources/Sleep_health_and_lifestyle_dataset.csv')
data = csv.copy()
data

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [5]:
# Show the dtype of every column in the loaded frame; the object-typed columns
# are the ones that are not yet numeric.
data.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
dtype: object

In [6]:
# Break the 'Blood Pressure' text on '/' into two new columns (still strings at
# this point), then discard the original text column and the 'Person ID' column.
bp_parts = data['Blood Pressure'].str.split('/', expand=True)
data[['systolic', 'diastolic']] = bp_parts
data = data.drop(['Person ID', 'Blood Pressure'], axis=1)


In [7]:
# Replace missing values with the placeholder label 'none', but only in the
# non-numeric columns. Writing a string into a numeric column would turn it into
# an object column, which the later one-hot encoding step would then expand
# into dummy columns instead of keeping it as a numeric feature.
# NOTE(review): the recorded classification report lists this class as 'None'
# (capitalised) -- confirm which spelling downstream consumers expect.
non_numeric_cols = data.select_dtypes(exclude='number').columns
data = data.fillna({col: 'none' for col in non_numeric_cols})

In [8]:
# Separate the target label ('Sleep Disorder') from the predictor columns.
y = data['Sleep Disorder']
X = data.drop('Sleep Disorder', axis=1)

In [9]:
# Show the predictor dtypes; columns still listed as object need converting or
# encoding before they can be fed to the model.
X.dtypes

Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Heart Rate                   int64
Daily Steps                  int64
systolic                    object
diastolic                   object
dtype: object

In [10]:
# The two blood-pressure columns hold digit strings produced by a string split;
# convert both to integers so they are treated as numeric features.
X = X.astype({'systolic': int, 'diastolic': int})

In [11]:
# One-hot encode the remaining non-numeric predictor columns. The indicator
# dtype is pinned to int so the encoded frame holds 0/1 on every pandas version
# (pandas 2.0 changed the default indicator dtype to bool).
# NOTE(review): the encoded frame contains both 'BMI Category_Normal' and
# 'BMI Category_Normal Weight'. These look like two spellings of the same
# category -- confirm, and merge them before encoding if so.
X = pd.get_dummies(X, dtype=int)
X

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,systolic,diastolic,Gender_Female,...,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal,BMI Category_Normal Weight,BMI Category_Obese,BMI Category_Overweight
0,27,6.1,6,42,6,77,4200,126,83,0,...,0,0,0,0,1,0,0,0,0,1
1,28,6.2,6,60,8,75,10000,125,80,0,...,0,0,0,0,0,0,1,0,0,0
2,28,6.2,6,60,8,75,10000,125,80,0,...,0,0,0,0,0,0,1,0,0,0
3,28,5.9,4,30,8,85,3000,140,90,0,...,0,1,0,0,0,0,0,0,1,0
4,28,5.9,4,30,8,85,3000,140,90,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,59,8.1,9,75,3,68,7000,140,95,1,...,1,0,0,0,0,0,0,0,0,1
370,59,8.0,9,75,3,68,7000,140,95,1,...,1,0,0,0,0,0,0,0,0,1
371,59,8.1,9,75,3,68,7000,140,95,1,...,1,0,0,0,0,0,0,0,0,1
372,59,8.1,9,75,3,68,7000,140,95,1,...,1,0,0,0,0,0,0,0,0,1


In [12]:
# Hold out a test partition with a fixed seed so the split is repeatable.
# NOTE(review): the split is not stratified on y; consider stratifying if the
# class proportions should match between the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default when no size is given, made explicit
    random_state=1,
)

In [13]:
# Standardize the features using statistics learned from the training split
# only, then apply that same fitted transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_scaler = scaler  # fit returns the estimator itself, so both names are the fitted scaler
X_test_scaled = scaler.transform(X_test)

In [14]:
# Configure the (still unfitted) logistic-regression model: fixed seed, an
# iteration cap of 200, and the lbfgs solver.
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
)
classifier

In [15]:
# Train the model on the standardized training features and their labels.
classifier.fit(X=X_train_scaled, y=y_train)

In [16]:
# Mean accuracy on the training split, then on the held-out test split.
train_accuracy = classifier.score(X_train_scaled, y_train)
test_accuracy = classifier.score(X_test_scaled, y_test)
print(train_accuracy)
print(test_accuracy)

0.9285714285714286
0.9042553191489362


In [17]:
# Predicted class label for every row of the standardized test features.
predictions = classifier.predict(X=X_test_scaled)

In [18]:
# Fraction of test rows whose predicted label matches the true label.
# accuracy_score is imported with the other dependencies in the notebook's
# first cell, so this cell carries no import of its own.
accuracy_score(y_test, predictions)

0.9042553191489362

In [19]:
# Counts of true class (rows) against predicted class (columns) on the test split.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[16,  2,  1],
       [ 3, 49,  1],
       [ 0,  2, 20]], dtype=int64)

In [20]:
# Per-class precision, recall, F1 and support for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

    Insomnia       0.84      0.84      0.84        19
        None       0.92      0.92      0.92        53
 Sleep Apnea       0.91      0.91      0.91        22

    accuracy                           0.90        94
   macro avg       0.89      0.89      0.89        94
weighted avg       0.90      0.90      0.90        94



In [21]:
import joblib

In [22]:
# The classifier was fitted on standardized, one-hot-encoded features, so the
# pickled model cannot be used for inference on its own: the fitted scaler and
# the exact training column layout are saved next to it.
# The model dump stays last so the cell still displays the model's path.
joblib.dump(X_scaler, '../Final_Data/Resources/log_regression_scaler.pkl')
joblib.dump(list(X_train.columns), '../Final_Data/Resources/log_regression_features.pkl')
joblib.dump(classifier, '../Final_Data/Resources/log_regression_model.pkl')

['../Final_Data/Resources/log_regression_model.pkl']