# Stress Level Prediction Model V2

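This notebook predicts a person's stress level, treated as a regression target, from sleep health and lifestyle data. It compares seven scikit-learn regressors on a held-out test set, inspects Random Forest feature importances, and wraps the Random Forest in a helper function for predicting stress on new inputs.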

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

## 2. Load Data

In [2]:
# Read the raw CSV; preprocessing happens in a later cell.
DATA_PATH = 'sleep_heath_lifecycle_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


## 3. Data Preprocessing Function

In [3]:
def preprocess_data(df):
    """Turn the raw sleep-health frame into model features and a target.

    The input frame is copied first, so the caller's `df` is not modified.

    Returns:
        A 3-tuple ``(X, y, encoders)``:
        X        -- frame with the 12 numeric / label-encoded feature columns
        y        -- the 'Stress Level' column
        encoders -- dict mapping 'gender_encoder', 'occupation_encoder',
                    'bmi_encoder' and 'sleep_disorder_encoder' to the
                    LabelEncoder fitted on the corresponding column
    """
    data = df.copy()

    # A missing sleep-disorder entry becomes its own 'None' category.
    data['Sleep Disorder'] = data['Sleep Disorder'].fillna('None')

    # "126/83" -> Systolic_BP = 126, Diastolic_BP = 83
    bp_tokens = data['Blood Pressure'].str.split('/')
    for position, bp_column in enumerate(('Systolic_BP', 'Diastolic_BP')):
        data[bp_column] = bp_tokens.str[position].astype(int)

    # (source column, encoded column, key in the returned encoder dict)
    categorical_specs = (
        ('Gender', 'Gender_Encoded', 'gender_encoder'),
        ('Occupation', 'Occupation_Encoded', 'occupation_encoder'),
        ('BMI Category', 'BMI_Category_Encoded', 'bmi_encoder'),
        ('Sleep Disorder', 'Sleep_Disorder_Encoded', 'sleep_disorder_encoder'),
    )
    fitted_encoders = {}
    for source_column, encoded_column, encoder_key in categorical_specs:
        label_encoder = LabelEncoder()
        data[encoded_column] = label_encoder.fit_transform(data[source_column])
        fitted_encoders[encoder_key] = label_encoder

    # Column order here defines the feature order every model is trained on.
    numeric_features = [
        'Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
        'Heart Rate', 'Daily Steps', 'Systolic_BP', 'Diastolic_BP',
    ]
    encoded_features = [spec[1] for spec in categorical_specs]
    selected_columns = numeric_features + encoded_features

    return data[selected_columns], data['Stress Level'], fitted_encoders

## 4. Preprocess Data and Split into Train/Test Sets

In [4]:
# Build the feature matrix, target, and fitted label encoders.
X, y, encoders = preprocess_data(df)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for split_label, split_frame in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set size: {split_frame.shape}")

Training set size: (299, 12)
Test set size: (75, 12)


## 5. Feature Scaling

In [5]:
# Standardize features. The scaler is fitted on the training split only and
# then applied to both splits, so test-set statistics never influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6. Model Evaluation Function

In [6]:
# Model evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit `model` on the training split and score it on the test split.

    `model` is fitted in place, so the caller's object is trained afterwards.

    Returns:
        dict with keys 'Model' (the given name), 'RMSE', 'MAE', 'R²' and
        'Predictions' (the raw predictions for `X_test`).
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    scores = {'Model': model_name}
    # RMSE is the square root of the mean squared error.
    scores['RMSE'] = np.sqrt(mean_squared_error(y_test, predictions))
    scores['MAE'] = mean_absolute_error(y_test, predictions)
    scores['R²'] = r2_score(y_test, predictions)
    scores['Predictions'] = predictions
    return scores

## 7. Initialize Models

In [7]:
# Candidate regressors, keyed by display name. Insertion order is the order
# in which they are evaluated and reported.
model_specs = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0)),
    ('Lasso Regression', Lasso(alpha=0.1)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ('SVR', SVR(kernel='rbf', C=1.0)),
    ('Neural Network', MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)),
]
models = dict(model_specs)

## 8. Evaluate All Models

In [8]:
# Fit and score every candidate, collecting metrics and test-set predictions.
results = []
model_predictions = {}

print("Model Performance Comparison:")
print("=" * 60)

for name, model in models.items():
    # Only SVR and the neural network are given the standardized features;
    # every other model is trained on the raw feature values.
    use_scaled = name in ('SVR', 'Neural Network')
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test

    result = evaluate_model(model, train_features, test_features, y_train, y_test, name)

    results.append(result)
    model_predictions[name] = result['Predictions']

    print(f"{name:18} | RMSE: {result['RMSE']:.3f} | MAE: {result['MAE']:.3f} | R²: {result['R²']:.3f}")

Model Performance Comparison:
Linear Regression  | RMSE: 0.360 | MAE: 0.235 | R²: 0.959
Ridge Regression   | RMSE: 0.363 | MAE: 0.237 | R²: 0.958
Lasso Regression   | RMSE: 0.554 | MAE: 0.413 | R²: 0.902
Random Forest      | RMSE: 0.171 | MAE: 0.051 | R²: 0.991
Gradient Boosting  | RMSE: 0.076 | MAE: 0.032 | R²: 0.998
SVR                | RMSE: 0.296 | MAE: 0.146 | R²: 0.972
Neural Network     | RMSE: 0.310 | MAE: 0.142 | R²: 0.969


## 9. Results Summary

In [9]:
# Tabulate the metrics (the per-model prediction arrays are left out).
metric_columns = ['Model', 'RMSE', 'MAE', 'R²']
results_df = pd.DataFrame(results)[metric_columns]
print(results_df)

# The best model is the one with the highest R² on the test set.
best_row = results_df['R²'].idxmax()
best_model_name = results_df.loc[best_row, 'Model']
print(f"\nBest Model: {best_model_name}")

               Model      RMSE       MAE        R²
0  Linear Regression  0.359976  0.235230  0.958524
1   Ridge Regression  0.362710  0.237440  0.957891
2   Lasso Regression  0.554083  0.413048  0.901734
3      Random Forest  0.170638  0.051200  0.990680
4  Gradient Boosting  0.075667  0.031784  0.998167
5                SVR  0.295771  0.145668  0.972000
6     Neural Network  0.309665  0.142479  0.969307

Best Model: Gradient Boosting


## 10. Feature Importance Analysis

In [10]:
# Train a Random Forest on the unscaled training split and rank the features
# by its impurity-based importances. This `rf_model` is also the model used
# by the prediction helper further down.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

importance_pairs = list(zip(X.columns, rf_model.feature_importances_))
feature_importance = pd.DataFrame(
    importance_pairs, columns=['Feature', 'Importance']
).sort_values('Importance', ascending=False)

print("Feature Importance (Random Forest):")
print("=" * 40)
print(feature_importance)

Feature Importance (Random Forest):
                    Feature  Importance
2          Quality of Sleep    0.721522
4                Heart Rate    0.123025
1            Sleep Duration    0.087840
5               Daily Steps    0.016222
9        Occupation_Encoded    0.014461
3   Physical Activity Level    0.012520
0                       Age    0.010323
8            Gender_Encoded    0.006953
6               Systolic_BP    0.003268
7              Diastolic_BP    0.001935
10     BMI_Category_Encoded    0.001387
11   Sleep_Disorder_Encoded    0.000544


## 11. Prediction Function

In [11]:
# Prediction function
def _encode_category(encoder_key, value):
    """Encode `value` with the fitted LabelEncoder stored under `encoder_key`.

    Raises:
        ValueError: if `value` was not among the categories the encoder was
            fitted on. The message lists the accepted values.
    """
    encoder = encoders[encoder_key]
    known_values = list(encoder.classes_)
    if value not in known_values:
        raise ValueError(
            f"Unknown value {value!r} for '{encoder_key}'; "
            f"expected one of: {', '.join(map(str, known_values))}"
        )
    return encoder.transform([value])[0]


def predict_stress_level(sleep_duration, quality_sleep, age, heart_rate, 
                        physical_activity, daily_steps, gender='Male', 
                        occupation='Engineer', bmi_category='Normal', 
                        systolic_bp=120, diastolic_bp=80, sleep_disorder='None'):
    """Predict the stress level for one person using the fitted Random Forest.

    Relies on two notebook globals: `encoders` (the fitted label encoders
    returned by `preprocess_data`) and `rf_model` (the fitted Random Forest).

    Args:
        sleep_duration, quality_sleep, age, heart_rate, physical_activity,
        daily_steps, systolic_bp, diastolic_bp: numeric feature values.
        gender, occupation, bmi_category, sleep_disorder: category labels;
            each must be a value seen in the training data.

    Returns:
        The predicted stress level rounded to one decimal place.

    Raises:
        ValueError: if any categorical argument is not a known category.
            Gender is now validated too; it used to be silently encoded as 0
            whenever it was anything other than 'Male'.
    """
    # Every categorical goes through the encoder fitted during preprocessing,
    # so the codes always match the ones the model was trained on.
    input_data = {
        'Age': age,
        'Sleep Duration': sleep_duration,
        'Quality of Sleep': quality_sleep,
        'Physical Activity Level': physical_activity,
        'Heart Rate': heart_rate,
        'Daily Steps': daily_steps,
        'Systolic_BP': systolic_bp,
        'Diastolic_BP': diastolic_bp,
        'Gender_Encoded': _encode_category('gender_encoder', gender),
        'Occupation_Encoded': _encode_category('occupation_encoder', occupation),
        'BMI_Category_Encoded': _encode_category('bmi_encoder', bmi_category),
        'Sleep_Disorder_Encoded': _encode_category('sleep_disorder_encoder', sleep_disorder)
    }

    input_df = pd.DataFrame([input_data])

    # Align the columns with the order the model was fitted on, when the
    # model recorded it, rather than depending on the dict literal above.
    trained_columns = getattr(rf_model, 'feature_names_in_', None)
    if trained_columns is not None:
        input_df = input_df[list(trained_columns)]

    prediction = rf_model.predict(input_df)[0]

    return round(prediction, 1)

## 12. Example Prediction

In [12]:
# Example: predict for one hypothetical person, leaving the categorical and
# blood-pressure arguments at the function's defaults.
sample_inputs = dict(
    sleep_duration=7.0,
    quality_sleep=7,
    age=30,
    heart_rate=70,
    physical_activity=60,
    daily_steps=8000,
)
sample_prediction = predict_stress_level(**sample_inputs)

print("Sample Prediction:")
print(f"Predicted Stress Level: {sample_prediction}")

Sample Prediction:
Predicted Stress Level: 5.4
