# Mental Health in the Tech Industry  
## **Supervised Learning - Regression**
---
### Importing Libraries
(well it's a ritual now)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
import joblib, json
#removes the unwanted warnings for better presentation
import warnings
warnings.filterwarnings('ignore')
print("All Libraries successfully imported and ready to use 💪")

All Libraries successfully imported and ready to use 💪


### Data Loading and Description

In [2]:
df = pd.read_csv("cleaned_survey.csv")
df.sample(10)

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,benefits,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
497,27,Male,Singapore,No,No,No,Sometimes,1-5,Yes,No,...,Yes,Medium,Yes,No,Some of them,Yes,No,No,No,No
738,30,Male,United States,No,Yes,No,Never,100-500,No,Yes,...,Don't know,Difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,No
730,37,Male,United States,No,No,No,Never,500-1000,Yes,Yes,...,Don't know,Medium,Maybe,No,Some of them,Yes,No,No,Yes,Yes
467,30,Male,Germany,No,No,No,Often,26-100,No,No,...,Don't know,Medium,Maybe,No,Some of them,Some of them,Maybe,Yes,Don't know,No
884,40,Male,United States,No,Yes,Yes,Sometimes,More than 1000,No,Yes,...,Yes,Medium,Yes,Yes,No,No,No,Maybe,No,Yes
543,49,Male,Japan,Yes,Yes,Yes,Sometimes,1-5,Yes,No,...,Don't know,Medium,Yes,Yes,No,No,No,No,No,Yes
36,24,Male,United Kingdom,No,No,Yes,Sometimes,6-25,No,No,...,Don't know,Medium,Maybe,Maybe,Some of them,No,No,Yes,No,Yes
87,29,Male,United States,No,No,No,Never,26-100,No,Don't know,...,Don't know,Medium,No,No,Yes,Yes,Maybe,Maybe,No,No
547,37,Male,Ireland,Yes,No,No,Never,6-25,No,No,...,Don't know,Medium,Yes,Maybe,No,No,No,Maybe,Don't know,No
1002,39,Male,Greece,No,No,No,Sometimes,6-25,Yes,No,...,Yes,Medium,Yes,No,No,No,No,No,Don't know,No


### Splitting our dataset

In [3]:
print(df.info())

X = df.drop(columns=['Age'])
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        1024 non-null   int64 
 1   Gender                     1024 non-null   object
 2   Country                    1024 non-null   object
 3   self_employed              1024 non-null   object
 4   family_history             1024 non-null   object
 5   treatment                  1024 non-null   object
 6   work_interfere             1024 non-null   object
 7   no_employees               1024 non-null   object
 8   remote_work                1024 non-null   object
 9   benefits                   1024 non-null   object
 10  care_options               1024 non-null   object
 11  wellness_program           1024 non-null   object
 12  seek_help                  1024 non-null   object
 13  anonymity                  1024 non-null   object
 14  leave   

### Dynamic preprocessor builder function
Builds a pipeline to clean, encode, and scale numeric and categorical data for modeling.

In [4]:
def build_preprocessor(X):
    num_cols = X.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X.select_dtypes(exclude=["int64", "float64"]).columns

    # Remove duplicates just in case
    num_cols = list(set(num_cols))
    cat_cols = list(set(cat_cols))

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ]
    )

    return preprocessor

"One Function to run them all (again)"   

This function is used to evaluate the different models of classification

In [5]:
def evaluate_model(name, model):
    preprocessor = build_preprocessor(X)
    pipe = Pipeline([
        ("pre", preprocessor),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred_test = pipe.predict(X_test)

    r2 = r2_score(y_test, y_pred_test)
    rmse = root_mean_squared_error(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)

    results.append({
        "Model": name,
        "R² score": r2,
        "RMSE": rmse,
        "MAE": mae,
    })

    print(f"Model: {name}\n")
    if "Ridge" in name:
        print("Best HyperParameters:", ridge_gs.best_params_)
    elif "Lasso" in name:
        print("Best HyperParameters:", lasso_gs.best_params_)
    elif "XGBoost" in name:
        print("Best HyperParameters:", xgb_gs.best_params_)
    elif "Support" in name:
        print("Best Params:", svr_gs.best_params_)
    elif "Gradient" in name:
        print("Best Params:", gb_gs.best_params_)
    elif "Random" in name:
        print("Best Params:",rf_gs.best_params_)
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mae:.3f}")
    print(f"R² score: {r2:.3f}")
    print("-" * 42)

## Training of different models starts!
Using **Grid Search CV** to tune the best Hyperparameters for all models

---

### Linear Regression

In [6]:
results = []
preprocessor = build_preprocessor(X_train)

# Linear Regression (no tuning)
evaluate_model("Linear Regression", LinearRegression())

Model: Linear Regression

RMSE: 7.366
MAE: 5.891
R² score: -0.036
------------------------------------------


## Regularization in action
###  Ridge (L2)

In [7]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Ridge Regression
ridge_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', Ridge(max_iter=2000,random_state=42))
])
ridge_param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10, 100],
    'model__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga']
}
ridge_gs = GridSearchCV(ridge_pipe, ridge_param_grid, scoring='r2', cv=cv, n_jobs=-1)
ridge_gs.fit(X_train, y_train)
evaluate_model("Ridge Regression", ridge_gs.best_estimator_.named_steps["model"])


Model: Ridge Regression

Best HyperParameters: {'model__alpha': 100, 'model__solver': 'lsqr'}
RMSE: 7.178
MAE: 5.690
R² score: 0.016
------------------------------------------


### Lasso (L1)

In [8]:
# Lasso Regression
lasso_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', Lasso(max_iter=2000, random_state=42))
])
lasso_param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'model__selection': ['cyclic', 'random']
}
lasso_gs = GridSearchCV(lasso_pipe, lasso_param_grid, scoring='r2', cv=cv, n_jobs=-1)
lasso_gs.fit(X_train, y_train)
evaluate_model("Lasso Regression", lasso_gs.best_estimator_.named_steps["model"])

Model: Lasso Regression

Best HyperParameters: {'model__alpha': 0.1, 'model__selection': 'cyclic'}
RMSE: 7.141
MAE: 5.655
R² score: 0.026
------------------------------------------


### Random Forest is back!

In [9]:
# Random Forest Regressor
rf_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])
rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__max_features': ['sqrt', 'log2', None]
}
rf_gs = GridSearchCV(rf_pipe, rf_param_grid, scoring='r2', cv=cv, n_jobs=-1)
rf_gs.fit(X_train, y_train)
evaluate_model("Random Forest Regressor", rf_gs.best_estimator_.named_steps["model"])

Model: Random Forest Regressor

Best Params: {'model__max_depth': 10, 'model__max_features': 'log2', 'model__n_estimators': 100}
RMSE: 6.980
MAE: 5.515
R² score: 0.069
------------------------------------------


### Random Forest (log-transformation)

In [10]:
y_train_rf_log = np.log1p(y_train)
y_test_rf_log = np.log1p(y_test)

# Train your model with log-transformed target
rf_pipe_log = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

rf_log = GridSearchCV(rf_pipe_log, rf_param_grid, cv=cv, n_jobs=-1)
rf_log.fit(X_train, y_train_rf_log)

# Predict and inverse transform
y_pred_rf_log = rf_log.predict(X_test)
y_pred_rf = np.expm1(y_pred_rf_log)

results.append({
        "Model": "Random Forest (log-transformed)",
        "R² score": r2_score(y_test, y_pred_rf),
        "RMSE": root_mean_squared_error(y_test, y_pred_rf),
        "MAE": mean_absolute_error(y_test, y_pred_rf)
})

print("Model: Random Forest (log-transformed)\n")
print("Best Params:",rf_log.best_params_)
print("MAE:", f"{mean_absolute_error(y_test, y_pred_rf):.3f}")
print("RMSE:", f"{root_mean_squared_error(y_test, y_pred_rf):.3f}")
print("R² Score:", f"{r2_score(y_test, y_pred_rf):.3f}")

Model: Random Forest (log-transformed)

Best Params: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__n_estimators': 200}
MAE: 5.451
RMSE: 7.036
R² Score: 0.054


### Gradient Boosting

In [11]:
# Gradient Boosting Regressor
gb_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])
gb_param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__max_depth': [3, 5],
    'model__subsample': [0.8, 1.0],
    'model__max_features': ['sqrt', 'log2', None]
}
gb_gs = GridSearchCV(gb_pipe, gb_param_grid, scoring='r2', cv=cv, n_jobs=-1)
gb_gs.fit(X_train, y_train)
evaluate_model("Gradient Boosting Regressor", gb_gs.best_estimator_.named_steps["model"])

Model: Gradient Boosting Regressor

Best Params: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__max_features': 'log2', 'model__n_estimators': 100, 'model__subsample': 1.0}
RMSE: 7.055
MAE: 5.587
R² score: 0.049
------------------------------------------


### SVM is also back!

In [12]:
# Support Vector Regressor
svr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR())
])
svr_param_grid = {
    'model__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': ['scale', 'auto'],
    'model__epsilon': [0.1, 0.2, 0.3]
}
svr_gs = GridSearchCV(svr_pipe, svr_param_grid, scoring='r2', cv=cv, n_jobs=-1)
svr_gs.fit(X_train, y_train)
evaluate_model('Support Vector Regressor', svr_gs.best_estimator_.named_steps['model'])

Model: Support Vector Regressor

Best Params: {'model__C': 10, 'model__epsilon': 0.3, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
RMSE: 7.260
MAE: 5.683
R² score: -0.007
------------------------------------------


### XGBoost is also back!

In [13]:
# XGBoost Regressor
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])
xgb_param_grid = {
    'model__n_estimators': [100, 200, 500],
    'model__max_depth': [3, 5, 7, 9],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__gamma': [0, 0.1, 0.2]
}
xgb_gs = GridSearchCV(xgb_pipe, xgb_param_grid, scoring='r2', cv=cv, n_jobs=-1)
xgb_gs.fit(X_train, y_train)
evaluate_model('XGBRegressor', xgb_gs.best_estimator_.named_steps['model'])

Model: XGBRegressor

RMSE: 7.051
MAE: 5.560
R² score: 0.050
------------------------------------------


### XGBRegressor (log-transformation)

In [14]:
y_train_xgb_log = np.log1p(y_train)
y_test_xgb_log = np.log1p(y_test)

# Train your model with log-transformed target
xgb_pipe_log = Pipeline([
    ('pre', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])

xgb_log = GridSearchCV(xgb_pipe_log, xgb_param_grid, cv=cv, n_jobs=-1)
xgb_log.fit(X_train, y_train_xgb_log)

# Predict and inverse transform
y_pred_xgb_log = xgb_log.predict(X_test)
y_pred_xgb = np.expm1(y_pred_xgb_log)

results.append({
        "Model": "XGBoost (log-transformed)",
        "R² score": r2_score(y_test, y_pred_xgb),
        "RMSE": root_mean_squared_error(y_test, y_pred_xgb),
        "MAE": mean_absolute_error(y_test, y_pred_xgb)
})

print("Model: XGBoost (log-transformed)\n")
print("Best Params:",xgb_log.best_params_)
print("MAE:", f"{mean_absolute_error(y_test, y_pred_xgb):.3f}")
print("RMSE:", f"{root_mean_squared_error(y_test, y_pred_xgb):.3f}")
print("R² Score:", f"{r2_score(y_test, y_pred_xgb):.3f}")

Model: XGBoost (log-transformed)

Best Params: {'model__gamma': 0, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 100}
MAE: 5.474
RMSE: 7.046
R² Score: 0.052


### Comparision Table

In [15]:
results_df = pd.DataFrame(results).sort_values('R² score', ascending=False).reset_index(drop=True)
print("📊 Regression Model Comparison:")
display(results_df)

📊 Regression Model Comparison:


Unnamed: 0,Model,R² score,RMSE,MAE
0,Random Forest Regressor,0.069362,6.980238,5.515165
1,Random Forest (log-transformed),0.054473,7.035854,5.450534
2,XGBoost (log-transformed),0.051649,7.04635,5.473678
3,XGBRegressor,0.050332,7.051243,5.560343
4,Gradient Boosting Regressor,0.049264,7.055208,5.587464
5,Lasso Regression,0.025909,7.141338,5.655086
6,Ridge Regression,0.015816,7.178238,5.690324
7,Support Vector Regressor,-0.006819,7.260316,5.683155
8,Linear Regression,-0.036312,7.365888,5.890834


### Saving our Best Regression Model ✨🥂

In [16]:
best_model_name = 'Random Forest'
print(f'Best model by R² score: {best_model_name}')

final_model = rf_gs.best_estimator_.named_steps['model']

final_pipeline = Pipeline([
    ('pre', preprocessor),
    ('model', final_model)
])

final_pipeline.fit(X_train, y_train)

with open("regression_columns.json", "w") as f:
    json.dump(list(X_train.columns), f)

# Save to file
joblib.dump(final_pipeline, 'regression_model.pkl')
print('Final model pipeline saved as regression_model.pkl')

Best model by R² score: Random Forest
Final model pipeline saved as regression_model.pkl
