In [29]:
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat
import plotly.graph_objects as go

import sklearn as sk
from sklearn.linear_model  import  LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.linear_model import SGDRegressor

import pickle


In [2]:
df = pd.read_csv("sleep_data/SleepStudyData.csv")
print(f'Number of rows: {df.shape[0]}')
print(f'Number of columns: {df.shape[1]}')
df.head(5)

Number of rows: 104
Number of columns: 6


Unnamed: 0,Enough,Hours,PhoneReach,PhoneTime,Tired,Breakfast
0,Yes,8.0,Yes,Yes,3,Yes
1,No,6.0,Yes,Yes,3,No
2,Yes,6.0,Yes,Yes,2,Yes
3,No,7.0,Yes,Yes,4,No
4,No,7.0,Yes,Yes,2,Yes


In [3]:
columns = df.columns

for i in columns:
    if type(df.iloc[0][i]) is str:
        df[i] = df[i].apply(lambda x: 1 if x == 'Yes' else 0)

    df[i] = df[i].astype(float)
df = df.transform(pd.to_numeric, errors='coerce')

In [4]:
df

Unnamed: 0,Enough,Hours,PhoneReach,PhoneTime,Tired,Breakfast
0,1.0,8.0,1.0,1.0,3.0,1.0
1,0.0,6.0,1.0,1.0,3.0,0.0
2,1.0,6.0,1.0,1.0,2.0,1.0
3,0.0,7.0,1.0,1.0,4.0,0.0
4,0.0,7.0,1.0,1.0,2.0,1.0
...,...,...,...,...,...,...
99,0.0,7.0,1.0,1.0,2.0,1.0
100,0.0,7.0,0.0,1.0,3.0,1.0
101,1.0,8.0,1.0,1.0,3.0,1.0
102,1.0,7.0,1.0,1.0,2.0,1.0


In [12]:
px.histogram(df, x = 'Hours', color = 'Enough', title = "Comparison of Hours Slept and Tiredness")


In [13]:
px.density_heatmap(df, x = 'Tired', y = 'Breakfast', title = "Does Breakfast Affect Tiredness")

In [19]:
df = df.fillna(0)

In [20]:
X = df[['Enough','PhoneReach','PhoneTime','Tired','Breakfast']]
y = df['Hours'] 
X_train, X_test, y_train, y_test = train_test_split(X,y)


In [25]:
def run_experiment(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("R^2 : ", r2_score(y_test, y_pred))
    print("MAE :", mean_absolute_error(y_test,y_pred))
    print("RMSE:",np.sqrt(mean_squared_error(y_test, y_pred)))

In [26]:
rf = RandomForestRegressor()
run_experiment(rf, X_train, X_test, y_train, y_test)

R^2 :  -0.07220634924556846
MAE : 0.9659153709538326
RMSE: 1.4209540728691985


In [28]:
sgd = SGDRegressor()
run_experiment(sgd, X_train, X_test, y_train, y_test)

R^2 :  -0.6646940410856443
MAE : 1.3276605663746717
RMSE: 1.770549472722529


### Model of Choice: Random Forest
Due to the limited data, there is going to be a very low association between the predicted and real values. Hopefully with more data, it will improve. 

In [30]:
filename = 'sleep_model.sav'
pickle.dump(rf, open(filename, 'wb'))

In [31]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

-0.07220634924556846


In [34]:
X_train['PhoneTime'].value_counts()

1.0    63
0.0    15
Name: PhoneTime, dtype: int64

In [39]:
rf.predict([X.iloc[0]])[0]


X does not have valid feature names, but RandomForestRegressor was fitted with feature names



7.3418253968254