# Objective:

- To predict sleeping disorder with the help of life style they live to overcome the overcomr the problem.

# Step 1: Import Libraries and Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [2]:
df = pd.read_csv(r"./Sleep_health_and_lifestyle_dataset.csv")
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [3]:
df.shape

(374, 13)

# Step 2: Data Pre-Processing

### Removing Irrelavent Variable

In [5]:
df = df.drop(columns=["Person ID"])

In [6]:
def summary(data:pd.DataFrame):
    col_name = []
    unique_val = []
    null_per = []
    col_type = []
    
    for col in data.columns:
        col_name.append(col)
        unique_val.append(data[col].nunique())
        null_per.append(data[col].isnull().sum()*100/data.shape[0])
        col_type.append(data[col].dtype)
        
    summary_df = pd.DataFrame()
    summary_df["Columns"] = col_name
    summary_df["Unique Values"] = unique_val
    summary_df["Null Percentage"] = null_per
    summary_df["Data Type"] = col_type
    print("Dataset Summary") 
    return summary_df

### Checking Missing Values and dataset information

In [7]:
summary(df)

Dataset Summary


Unnamed: 0,Columns,Unique Values,Null Percentage,Data Type
0,Gender,2,0.0,object
1,Age,31,0.0,int64
2,Occupation,11,0.0,object
3,Sleep Duration,27,0.0,float64
4,Quality of Sleep,6,0.0,int64
5,Physical Activity Level,16,0.0,int64
6,Stress Level,6,0.0,int64
7,BMI Category,4,0.0,object
8,Blood Pressure,25,0.0,object
9,Heart Rate,19,0.0,int64


### Age Analysis

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
px.histogram(df, x="Age", nbins=4, color='Sleep Disorder')

**Data is of age of people between 25 to 60 and most of the people between age 40 to 50 tend to suffer from insomnia and people between age 50 to 60 tend to suffer from Sleep Apnea** 

#### Sleep Disorder Analysis

In [9]:
px.pie(data_frame=df, names="Sleep Disorder", hole=.3)

**All together 41.5 percent people are suffring from sleep disorder from which 20.6 percent belong to Insomnia and 20.9 percent belong to Sleep Apnea.**

#### Gender Analysis

In [10]:
male_female = pd.crosstab(df.Gender, df["Sleep Disorder"])

In [11]:
px.bar(male_female, barmode="group")

**According to the chart we can see that Sleep disorder is more to be found in female then male and insomnia is also to be found more in female then male**

In [12]:
px.histogram(df, x="Sleep Duration", nbins=6, color="Sleep Disorder")

**People with sleep less then 6.7 hours tend to suffer from Insomnia or sleep Apnea.**

In [13]:
corr = df.corr()





In [14]:
px.imshow(corr, text_auto=True, aspect='auto')

**Corelation between Quality of sleep and Sleep duration seems to be high. So we will be droping one of them while building model. due to multicolinearity.**

### Converting Quality varivbales to Numbers

In [15]:
df[df.select_dtypes("object").columns]

Unnamed: 0,Gender,Occupation,BMI Category,Blood Pressure,Sleep Disorder
0,Male,Software Engineer,Overweight,126/83,
1,Male,Doctor,Normal,125/80,
2,Male,Doctor,Normal,125/80,
3,Male,Sales Representative,Obese,140/90,Sleep Apnea
4,Male,Sales Representative,Obese,140/90,Sleep Apnea
...,...,...,...,...,...
369,Female,Nurse,Overweight,140/95,Sleep Apnea
370,Female,Nurse,Overweight,140/95,Sleep Apnea
371,Female,Nurse,Overweight,140/95,Sleep Apnea
372,Female,Nurse,Overweight,140/95,Sleep Apnea


In [16]:
df["Gender"].replace({"Male":1, "Female":0}, inplace=True)
df["BMI Category"].replace({'Normal':0, 'Overweight':2, 'Normal Weight':1, 'Obese':3}, inplace=True)
df["Occupation"].replace({'Nurse':0, 'Doctor':1, 'Engineer':2, 'Lawyer':3, 'Teacher':4, 'Accountant':5,
       'Salesperson':6, 'Software Engineer':7, 'Scientist':8, 'Sales Representative':9,
       'Manager':10}, inplace=True)
df["Blood Pressure"].replace({'130/85':0, '140/95':1, '125/80':2, '120/80':3, '115/75':4, '135/90':5, '140/90':6,
       '125/82':7, '132/87':8, '128/85':9, '126/83':10, '115/78':11, '139/91':12, '142/92':13,
       '119/77':14, '135/88':15, '129/84':16, '128/84':17, '131/86':18, '117/76':19, '130/86':20,
       '118/75':21, '121/79':22, '122/80':23, '118/76':24}, inplace=True)
df["Sleep Disorder"].replace({'None':0, 'Sleep Apnea':1, 'Insomnia':3}, inplace=True)

In [17]:
df.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,27,7,6.1,6,42,6,1,10,77,4200,0
1,1,28,1,6.2,6,60,8,0,2,75,10000,0
2,1,28,1,6.2,6,60,8,0,2,75,10000,0
3,1,28,9,5.9,4,30,8,3,6,85,3000,1
4,1,28,9,5.9,4,30,8,3,6,85,3000,1


# Step 3: Data Preparation

In [18]:
df_tr, df_ts = train_test_split(df, test_size=.2)

tr_x = df_tr.iloc[:, :-1]
tr_y = df_tr.iloc[:, -1]

ts_x = df_ts.iloc[:, :-1]
ts_y = df_ts.iloc[:, -1]

tr_x.shape, tr_y.shape, ts_x.shape, ts_y.shape

((299, 11), (299,), (75, 11), (75,))

# Step 4: Model building

In [27]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver="lbfgs", max_iter=10000)
logreg.fit(tr_x, tr_y)
pred = logreg.predict(ts_x, )
print(f"Confution matrix:\n{confusion_matrix(ts_y, pred)}\n")
print(f"Accuracy: \n{accuracy_score(ts_y, pred, normalize=True)}\n")
print(f"Precision Score: \n{precision_score(ts_y, pred, average=None)}\n")
print(f"Recall Score: \n{recall_score(ts_y, pred, average=None)}\n")

Confution matrix:
[[39  1  2]
 [ 1 12  0]
 [ 4  2 14]]

Accuracy: 
0.8666666666666667

Precision Score: 
[0.88636364 0.8        0.875     ]

Recall Score: 
[0.92857143 0.92307692 0.7       ]



In [20]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(tr_x, tr_y)
pred = dt.predict(ts_x, )
print(f"Confution matrix:\n{confusion_matrix(ts_y, pred)}\n")
print(f"Accuracy: \n{accuracy_score(ts_y, pred, normalize=True)}\n")
print(f"Precision Score: \n{precision_score(ts_y, pred, average=None)}\n")
print(f"Recall Score: \n{recall_score(ts_y, pred, average=None)}\n")

Confution matrix:
[[39  1  2]
 [ 0 12  1]
 [ 4  2 14]]

Accuracy: 
0.8666666666666667

Precision Score: 
[0.90697674 0.8        0.82352941]

Recall Score: 
[0.92857143 0.92307692 0.7       ]



In [21]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion="entropy")
rfc.fit(tr_x, tr_y)
pred = rfc.predict(ts_x)
print(f"Confution matrix:\n{confusion_matrix(ts_y, pred)}\n")
print(f"Accuracy: \n{accuracy_score(ts_y, pred, normalize=True)}\n")
print(f"Precision Score: \n{precision_score(ts_y, pred, average=None)}\n")
print(f"Recall Score: \n{recall_score(ts_y, pred, average=None)}\n")

Confution matrix:
[[39  1  2]
 [ 0 12  1]
 [ 4  2 14]]

Accuracy: 
0.8666666666666667

Precision Score: 
[0.90697674 0.8        0.82352941]

Recall Score: 
[0.92857143 0.92307692 0.7       ]



In [22]:
import pickle
pickle.dump(rfc, open(r"./model/model.pkl", "wb"))