#### *E-mail: shahat769674@gmail.com*

## **Dataset:**
*Developer Stress Simulation Dataset*

*Link: https://www.kaggle.com/datasets/mabubakrsiddiq/developer-stress-simulation-dataset*


## Import Essential Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import (  RandomForestRegressor,
                                GradientBoostingRegressor,
                                VotingRegressor,
                                StackingRegressor
)

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

## Task 1: Data Loading

In [3]:
# Absolute path to the dataset CSV.
# NOTE(review): the /content prefix looks Colab-specific - confirm before running elsewhere.
data_path = "/content/sample_data/developer_stress.csv"

# Read the CSV into a DataFrame, then preview the first ten rows.
data = pd.read_csv(filepath_or_buffer=data_path)
data.head(n=10)

Unnamed: 0,Hours_Worked,Sleep_Hours,Bugs,Deadline_Days,Coffee_Cups,Meetings,Interruptions,Experience_Years,Code_Complexity,Remote_Work,Stress_Level
0,10,8,25,53,4,9,2,Senior,Medium,Yes,58.521033
1,7,8,33,33,2,6,9,Junior,Medium,Yes,47.461651
2,14,8,44,54,10,12,2,Junior,Low,No,59.21158
3,11,6,5,46,0,13,9,Mid,Low,Yes,100.0
4,8,7,36,23,9,3,2,Junior,Medium,Yes,28.784957
5,10,6,32,53,5,7,4,Senior,Medium,No,68.798863
6,13,4,21,21,1,4,4,Senior,High,No,100.0
7,6,4,20,4,2,0,3,Mid,Medium,Yes,100.0
8,10,7,5,55,0,3,0,Junior,Medium,Yes,42.817044
9,14,6,5,32,8,2,4,Junior,Medium,No,46.18385


### Quick EDA

In [4]:
# (rows, columns) of the loaded frame.
data.shape

(500, 11)

In [5]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Hours_Worked      500 non-null    int64  
 1   Sleep_Hours       500 non-null    int64  
 2   Bugs              500 non-null    int64  
 3   Deadline_Days     500 non-null    int64  
 4   Coffee_Cups       500 non-null    int64  
 5   Meetings          500 non-null    int64  
 6   Interruptions     500 non-null    int64  
 7   Experience_Years  500 non-null    object 
 8   Code_Complexity   500 non-null    object 
 9   Remote_Work       500 non-null    object 
 10  Stress_Level      500 non-null    float64
dtypes: float64(1), int64(7), object(3)
memory usage: 43.1+ KB


### YData Profiling

In [6]:
!pip install -U ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.18.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Downloading minify_html-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting phik<0.13,>=0.12.5 (from ydata-profiling)
  Downloading phik-0.12.5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.2 (from ydata-profiling)
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting dacite<2,>=1.9 (from ydata-profiling)
  Downloading

In [7]:
from ydata_profiling import ProfileReport

# Build a profile of the full frame with the explorative option switched on,
# then write the rendered report to an HTML file next to the notebook.
profile = ProfileReport(
    data,
    title="Developer Stress Simulation - EDA",
    explorative=True,
)
profile.to_file("ydata.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:00<00:00, 89.58it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Task 2: Data Preprocessing

In [8]:
# Target is the Stress_Level column; the feature matrix is every other column.
y = data['Stress_Level']
X = data.drop(columns=['Stress_Level'])

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity-check the resulting split sizes.
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (400, 10)
Test set shape: (100, 10)


In [10]:
# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing branch.
# 'number' matches every numeric width (int32, float32, ...) rather than only
# the 64-bit defaults, and 'category' is accepted alongside plain object
# columns. A column that matches neither group is not listed in the
# ColumnTransformer and is therefore dropped, so the selectors are kept broad.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

## Task 3: Pipeline Creation

In [11]:
# Numeric branch of the preprocessing: impute first, then scale.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values with the column median
    ('scaler', StandardScaler()),                   # standardise each column
])

In [12]:
# Categorical branch of the preprocessing: impute first, then one-hot encode.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing values with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore')),   # categories unseen during fit do not raise at transform time
])

In [13]:
# Route each column group through its matching branch and concatenate the
# results into a single feature matrix.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer, categorical_features),
])

# Display the assembled transformer.
preprocessor

## Task 4: Primary Model Selection

In [14]:
# Baseline estimator: a 100-tree random forest with a fixed seed so repeated
# fits produce the same model.
primary_model = RandomForestRegressor(n_estimators=100, random_state=42)

## Task 5: Model Training

In [15]:
# End-to-end Random Forest pipeline: preprocessing is fitted together with the
# model, so it is re-learned on whatever data the pipeline is fitted on.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', primary_model),
    ]
)

# Display the assembled pipeline.
rf_pipeline

## Task 6: Cross-Validation

In [16]:
# Estimate generalisation error with 5-fold cross-validation, using only the
# training split so the test set stays untouched.
cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# The scorer reports negated MSE (higher is better); flip the sign to get
# ordinary, non-negative MSE values.
mse_scores = -cv_scores

print("Cross-Validation MSE scores:", mse_scores)
print("Average CV MSE:", mse_scores.mean())
print("Standard deviation:", mse_scores.std())

Cross-Validation MSE scores: [ 94.7732312   83.92069699 106.39967617  60.36345547 117.59458612]
Average CV MSE: 92.6103291903744
Standard deviation: 19.669305538591015


## Task 7: Hyperparameter Tuning

In [18]:
# Search space for the forest. The 'model__' prefix addresses the 'model'
# step of the pipeline. 3 * 4 * 3 * 3 = 108 parameter combinations.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search scored by negated MSE with 5-fold CV, run on all available
# cores (n_jobs=-1) with progress logging enabled.
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit every candidate on the training split only.
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [19]:
# Pull the winning configuration and its mean CV score out of the search.
best_params = grid_search.best_params_
best_cv_mse = -grid_search.best_score_  # the score is negated MSE, so flip the sign

print("Best Hyperparameters: ", best_params)
print("Best CV Score: ", best_cv_mse)

Best Hyperparameters:  {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Best CV Score:  89.57288014672697


## Task 8: Best Model Selection

In [21]:
# Best model from GridSearchCV: the pipeline carrying the winning
# hyperparameters (best_estimator_ is available because the search above was
# built without overriding refit).
best_model = grid_search.best_estimator_

# Label the output, then display the pipeline itself as the cell result.
print("Best Model Pipeline:")
best_model

Best Model Pipeline:


## Task 9: Model Performance Evaluation

In [22]:
# Score the tuned pipeline on the held-out test rows.
y_pred = best_model.predict(X_test)

# Error metrics; RMSE is the square root of MSE and so shares the target's units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Emit the report in one call, one metric per line, rounded to four decimals.
print(
    "Test Set Performance Metrics:",
    f"R² Score: {r2:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    sep="\n",
)

Test Set Performance Metrics:
R² Score: 0.8862
Mean Squared Error (MSE): 70.1743
Root Mean Squared Error (RMSE): 8.3770
Mean Absolute Error (MAE): 5.2260


## Save the Final Model

In [23]:
import pickle

# Output file for the serialised pipeline (relative to the working directory).
filename = 'model.pkl'

# Pickle the whole tuned pipeline object; the context manager closes the
# handle even if serialisation fails.
with open(filename, mode='wb') as file:
    pickle.dump(obj=best_model, file=file)

print(f"Model saved as {filename}")

Model saved as model.pkl


In [24]:
# End