In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# red in the csv file
csv = pd.read_csv('../Final_Data/Resources/Sleep_health_and_lifestyle_dataset.csv')
data = pd.DataFrame(csv)
data

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [3]:
data.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level      int64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
dtype: object

In [4]:
#clean up the data 
data[['systolic', 'diastolic']] = data['Blood Pressure'].str.split('/', expand=True)
data[['systolic','diastolic']] = data[['systolic','diastolic']].astype(int)
data = data.drop(columns=['Person ID','Blood Pressure'])
data['BMI Category'] = data['BMI Category'].replace('Normal Weight','Normal')
# change the null values into a string none
data = data.where(pd.notna(data), 'none')

In [5]:
#split the data
X = data.drop(columns='Sleep Disorder')
y = data['Sleep Disorder']

In [6]:
# get dummies for the features
X = pd.get_dummies(X)
X

Unnamed: 0,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,systolic,diastolic,Gender_Female,...,Occupation_Manager,Occupation_Nurse,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal,BMI Category_Obese,BMI Category_Overweight
0,27,6.1,6,42,6,77,4200,126,83,False,...,False,False,False,False,False,True,False,False,False,True
1,28,6.2,6,60,8,75,10000,125,80,False,...,False,False,False,False,False,False,False,True,False,False
2,28,6.2,6,60,8,75,10000,125,80,False,...,False,False,False,False,False,False,False,True,False,False
3,28,5.9,4,30,8,85,3000,140,90,False,...,False,False,True,False,False,False,False,False,True,False
4,28,5.9,4,30,8,85,3000,140,90,False,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,59,8.1,9,75,3,68,7000,140,95,True,...,False,True,False,False,False,False,False,False,False,True
370,59,8.0,9,75,3,68,7000,140,95,True,...,False,True,False,False,False,False,False,False,False,True
371,59,8.1,9,75,3,68,7000,140,95,True,...,False,True,False,False,False,False,False,False,False,True
372,59,8.1,9,75,3,68,7000,140,95,True,...,False,True,False,False,False,False,False,False,False,True


In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Linear Regression Model

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# create the model
lr_model = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [10]:
#fit the model
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# find the final score
print(lr_model.score(X_train, y_train))
print(lr_model.score(X_test, y_test))

0.8785714285714286
0.9042553191489362


In [12]:
# get the predictions
lr_predictions = lr_model.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#complete an accuracy score
accuracy_score(y_test, lr_predictions)

0.9042553191489362

In [14]:
#create a confusion matrix
confusion_matrix(y_test,lr_predictions)

array([[17,  0,  2],
       [ 2, 18,  2],
       [ 2,  1, 50]], dtype=int64)

In [15]:
#create a classification report 
print(classification_report(y_test,lr_predictions))

              precision    recall  f1-score   support

    Insomnia       0.81      0.89      0.85        19
 Sleep Apnea       0.95      0.82      0.88        22
        none       0.93      0.94      0.93        53

    accuracy                           0.90        94
   macro avg       0.89      0.89      0.89        94
weighted avg       0.91      0.90      0.90        94



In [16]:
import joblib
joblib.dump(lr_model, '../Final_Data/Resources/log_regression_model.pkl')

['../Final_Data/Resources/log_regression_model.pkl']

# K-Neighbors Model

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# create fit and predict using the kn model
kn_model = KNeighborsClassifier(n_neighbors=3)
kn_model.fit(X_train,y_train)
kn_predictions = kn_model.predict(X_test)

In [19]:
#find the accuracy score
accuracy_score(y_test, kn_predictions)

0.8723404255319149

In [20]:
#create a confusion matrix
confusion_matrix(y_test,kn_predictions)

array([[16,  1,  2],
       [ 2, 17,  3],
       [ 3,  1, 49]], dtype=int64)

In [21]:
# create a classification report
print(classification_report(y_test,kn_predictions))

              precision    recall  f1-score   support

    Insomnia       0.76      0.84      0.80        19
 Sleep Apnea       0.89      0.77      0.83        22
        none       0.91      0.92      0.92        53

    accuracy                           0.87        94
   macro avg       0.85      0.85      0.85        94
weighted avg       0.88      0.87      0.87        94



# Random Forest Model

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# create a random forest model
rf_model = RandomForestClassifier(n_estimators=300, random_state=1)

In [24]:
# fit the model
rf_model = rf_model.fit(X_train, y_train)

In [25]:
#make predictions
rf_predictions = rf_model.predict(X_test)

In [26]:
#find the accuracy score
accuracy_score(y_test, rf_predictions)

0.9042553191489362

In [27]:
#create a confusion matrix
confusion_matrix(y_test,rf_predictions)

array([[14,  3,  2],
       [ 0, 20,  2],
       [ 2,  0, 51]], dtype=int64)

In [28]:
# get the classification report
print(classification_report(y_test,rf_predictions))

              precision    recall  f1-score   support

    Insomnia       0.88      0.74      0.80        19
 Sleep Apnea       0.87      0.91      0.89        22
        none       0.93      0.96      0.94        53

    accuracy                           0.90        94
   macro avg       0.89      0.87      0.88        94
weighted avg       0.90      0.90      0.90        94

