In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report

# Read in the raw data (path is relative to the notebook's working directory)
df = pd.read_csv('sleep_health_and_lifestyle_dataset.csv')

# Drop the columns we're not using. DataFrame.drop already returns a new frame,
# so `df` is left untouched and no explicit copy is needed.
data = df.drop(columns=['Person ID', 'Occupation', 'Sleep Disorder', 'Blood Pressure'])

# Shorten column names (assign the result instead of mutating in place so the
# cell can be re-run safely)
data = data.rename(columns={
    'BMI Category': 'BMI',
    'Quality of Sleep': 'Sleep Quality',
    'Physical Activity Level': 'Physical Activity',
})

# Convert BMI strings to numbers. 'Normal Weight' is treated as 'Normal'.
# Any label not listed here is passed through unchanged.
# NOTE: the result is assigned back to the column. Calling
# `data['BMI'].replace(..., inplace=True)` mutates a temporary selection, which
# pandas warns about and which does not update `data` under Copy-on-Write.
bmi_codes = {'Normal Weight': 0, 'Normal': 0, 'Overweight': 1, 'Obese': 2}
data['BMI'] = data['BMI'].replace(bmi_codes)

# Convert categorical data to numerical data
# Convert Gender strings to numbers: 'Male' -> 1, every other value -> 0
data['Gender'] = np.where(data['Gender'] == 'Male', 1, 0)

print(data.columns)


Index(['Gender', 'Age', 'Sleep Duration', 'Sleep Quality', 'Physical Activity',
       'Stress Level', 'BMI', 'Heart Rate', 'Daily Steps'],
      dtype='object')


In [56]:
# linear regression model

# Fixed seed so the train/test split (and therefore every metric below) is the
# same on each run of the notebook
RANDOM_STATE = 42

# Columns used by the model: two predictors plus the target (Stress Level)
sleepDuration = data['Sleep Duration']
sleepQuality = data['Sleep Quality']
stressLevel = data['Stress Level']

# Create a dataframe holding the predictors and the target together
pred = pd.DataFrame(data={'Sleep Duration': sleepDuration, 'Sleep Quality': sleepQuality, 'Stress Level': stressLevel})

# Separate the target we want to predict (Stress Level) from the predictors
X = pred.drop(columns=['Stress Level'])
y = pred['Stress Level']

# Split the data into training/testing sets with ratio of 80:20
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Create and fit the linear regression model on the training split only
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions for both the training and the testing set
y_trainPred = lr.predict(X_train)
y_testPred = lr.predict(X_test)

# Round the predictions to the nearest integer
# (presumably because Stress Level is recorded as whole numbers -- confirm).
# Note that the metrics below are therefore computed on the rounded values.
y_trainPred = np.round(y_trainPred)
y_testPred = np.round(y_testPred)

# Calculate MSE & R^2
mseTrain = mean_squared_error(y_train, y_trainPred)
r2Train = r2_score(y_train, y_trainPred)
mseTest = mean_squared_error(y_test, y_testPred)
r2Test = r2_score(y_test, y_testPred)

# Print results
print(f'\nTrain MSE: {mseTrain}')
print(f'Train R^2: {r2Train}\n')
print(f'Test MSE: {mseTest}')
print(f'Test R^2: {r2Test}')

[ 7.  3.  6.  3.  4.  7.  4.  4.  3.  3.  7.  3.  3.  7.  7.  4.  6.  4.
  6.  4.  7.  4.  4.  6.  4.  7.  4.  3.  4.  3.  6.  7. 10.  7.  3.  4.
  3.  4.  7.  6.  7.  6.  4.  4.  7. 10.  7.  7.  6.  6.  6.  7.  4.  7.
  4.  4.  6.  7.  7.  4.  6.  7.  6.  7.  8.  3.  3.  4.  7.  3.  4.  3.
  6.  4.  4.  6.  7. 10.  6.  7.  4.  3.  6.  7.  8.  3.  6.  4.  7.  4.
  7.  7.  6.  4.  7.  3.  4.  7.  7.  6.  7.  7.  7.  4.  3.  3.  3.  7.
  6.  4.  7.  7.  7.  7.  6.  4.  3.  4.  8.  6.  4.  7.  6.  4.  7.  4.
  6.  8.  6.  6.  6.  6.  4.  6.  4.  4.  3.  4.  3.  7.  6.  4.  3.  4.
  4.  4.  6.  6.  3.  6.  4.  3.  4.  6.  4.  3.  7.  6.  6.  4.  7.  7.
  7.  3.  6.  7.  3.  3. 10.  4.  3.  7.  7.  3.  3.  4.  3.  3.  7.  6.
  4.  6.  6.  7.  4.  7.  6.  3.  4.  6.  6.  7.  4.  7.  4.  4.  3.  4.
  4.  4.  4.  3.  4.  4.  7.  4.  3.  6.  4.  4.  8.  7. 10.  6.  3.  6.
  3.  6.  4.  4.  6.  4.  6.  6.  6.  3.  3.  6.  3.  7.  7.  7.  7.  7.
  3.  3.  4.  4.  7.  4.  4.  7.  3.  7.  4.  6.  6

In [None]:
# logistic regression model
