In [13]:
# Step 1: Import libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression

# Step 2: Load data
df = pd.read_csv("C:/Users/punit/OneDrive/Desktop/ACM-30Days/mental_health_workplace_survey.csv")

# Step 3: Define target and drop missing values in target
target = "StressLevel"
df = df.dropna(subset=[target])

# Step 4: Select features
features = [
    "Gender", "RemoteWork", "SleepHours", "WorkHoursPerWeek",
    "BurnoutLevel", "JobSatisfaction", "ManagerSupportScore",
    "CareerGrowthScore"
]

data = df[features + [target]].copy()

# Step 5: Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=["int64", "float64"]).columns.drop(target).tolist()
categorical_cols = data.select_dtypes(include=["object"]).columns.tolist()

# Step 6: Create interaction features
data["StressBurnoutInteraction"] = data["StressLevel"] * data["BurnoutLevel"]
data["SleepPerWorkHour"] = data["SleepHours"] / (data["WorkHoursPerWeek"] + 1e-5)

# Add new features to numerical columns list
numerical_cols += ["StressBurnoutInteraction", "SleepPerWorkHour"]

# Step 7: Train-test split
X = data[numerical_cols + categorical_cols]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

# Step 8: Feature selection using mutual information
mi_scores = mutual_info_regression(X_train[numerical_cols], y_train)
mi_series = pd.Series(mi_scores, index=numerical_cols)

top_numerical = mi_series.sort_values(ascending=False).head(8).index.tolist()

# Combine top numerical and categorical features
selected_features = top_numerical + categorical_cols
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Step 9: Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), top_numerical),
    ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical_cols)
])

# Step 10: Define and train models
models = {
    "LinearRegression": LinearRegression(),
    "RidgeRegression": Ridge(alpha=1.0),
    "LassoRegression": Lasso(alpha=0.1)
}

results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("regressor", model)
    ])

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results[name] = {"MSE": mse, "R² Score": r2}

# Step 11: Display Results
results_df = pd.DataFrame(results).T
print("Regression Results (sorted by R² Score):")
print(results_df.sort_values(by="R² Score", ascending=False).round(5))

Regression Results (sorted by R² Score):
                      MSE  R² Score
LassoRegression   1.17508   0.82256
RidgeRegression   1.17554   0.82249
LinearRegression  1.17618   0.82239
