In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error,r2_score,mean_absolute_error

In [4]:
# Load the pre-cleaned survey data and preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was written out with the CSV — confirm, and consider index_col=0.
df = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv")
df.head(n=5)

Unnamed: 0,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,37.0,Female,United States,IL,Unknown,No,Yes,Often,6-25,No,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,44.0,Male,United States,IN,Unknown,No,No,Rarely,More than 1000,No,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,32.0,Male,Canada,CA,Unknown,No,No,Rarely,6-25,No,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,31.0,Male,United Kingdom,CA,Unknown,Yes,Yes,Often,26-100,No,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,31.0,Male,United States,TX,Unknown,No,No,Never,100-500,Yes,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [5]:
# Survey columns used as model inputs. Keep exactly one string literal per
# line: adjacent literals are concatenated implicitly by Python, which can
# silently merge or rename entries.
features = [
    'self_employed',
    'no_employees',
    'Gender',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'leave',
    'mental_health_consequence',
    'coworkers',
    'supervisor',
    'mental_health_interview',
]

# Column the regressors are trained to predict.
target = 'Age'

In [6]:
# Work on an independent copy restricted to the model inputs plus the target,
# so later changes cannot write back into the loaded frame.
df_copy = df[[*features, target]].copy()

In [7]:
# Separate the modelling frame into the input columns (X) and the target (y).
X, y = df_copy[features], df_copy[target]

In [8]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [9]:
# Two-stage transform for the input columns:
#   1. 'ohe'    - one-hot encode, dropping the first category of each column,
#                 not raising on categories unseen during fit, and returning a
#                 dense array.
#   2. 'scaler' - standardise the resulting indicator columns.
encoder_n_scale = Pipeline(
    steps=[
        (
            'ohe',
            OneHotEncoder(
                drop='first',
                handle_unknown='ignore',
                sparse_output=False,
            ),
        ),
        ('scaler', StandardScaler()),
    ]
)

In [42]:
# Route every column listed in `features` through the encode-and-scale
# pipeline; any column not listed is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('encode_n_scaler', encoder_n_scale, features),
    ],
    remainder='passthrough',
)
preprocessor


In [43]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the held-out split.
# NOTE(review): no later cell shown here reads these two arrays — the model
# pipelines below receive the raw X_train / X_test and refit `preprocessor`
# themselves.
X_train_preprocessed = preprocessor.fit_transform(X=X_train)
X_test_preprocessed = preprocessor.transform(X=X_test)

In [44]:
# Linear-regression model: preprocessing followed by ordinary least squares.
lr_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ('lr', LinearRegression()),
    ]
)
lr_pipeline

In [None]:
# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the pipeline itself, so the calls can be chained.
y_pred = lr_pipeline.fit(X_train, y_train).predict(X_test)

In [53]:
# Held-out error metrics for whatever `y_pred` currently holds (the linear
# regression predictions when the cells are run in order).
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f"root mean squared error:{rmse}",
    f"mean absolute error:{mae}",
    f"r2 score:{r2}",
    sep="\n",
)

root mean squared error:7.044480697201581
mean absolute error:5.517173055860105
r2 score:0.020306959067610375


In [71]:
# Random-forest model: preprocessing followed by a seeded forest regressor.
rf_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ('rf', RandomForestRegressor(random_state=42)),
    ]
)
rf_pipeline

In [72]:
# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the pipeline itself, so the calls can be chained.
y_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

In [73]:
# Held-out error metrics for whatever `y_pred` currently holds (the random
# forest predictions when the cells are run in order).
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f"root mean squared error:{rmse}",
    f"mean absolute error:{mae}",
    f"r2 score:{r2}",
    sep="\n",
)

root mean squared error:7.7822794772060435
mean absolute error:6.035588737717308
r2 score:-0.19565449943289703


In [74]:
# XGBoost model: preprocessing followed by a seeded XGBRegressor.
xgb_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ('xgb', XGBRegressor(random_state=42)),
    ]
)
xgb_pipeline

In [75]:
# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the pipeline itself, so the calls can be chained.
y_pred = xgb_pipeline.fit(X_train, y_train).predict(X_test)

In [76]:
# Held-out error metrics for whatever `y_pred` currently holds (the XGBoost
# predictions when the cells are run in order).
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f"root mean squared error:{rmse}",
    f"mean absolute error:{mae}",
    f"r2 score:{r2}",
    sep="\n",
)

root mean squared error:8.954742282167443
mean absolute error:6.9134076254708425
r2 score:-0.583063147426583


In [None]:
# Gradient-boosting model: preprocessing followed by a seeded
# GradientBoostingRegressor. The step is named 'gb' so its parameters are
# addressed as `gb__<param>` and are not confused with the XGBoost pipeline.
gb_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ('gb', GradientBoostingRegressor(random_state=42)),
])
gb_pipeline

In [78]:
# Train on the training split, then predict the held-out split.
# Pipeline.fit returns the pipeline itself, so the calls can be chained.
y_pred = gb_pipeline.fit(X_train, y_train).predict(X_test)

In [79]:
# Held-out error metrics for whatever `y_pred` currently holds (the gradient
# boosting predictions when the cells are run in order).
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f"root mean squared error:{rmse}",
    f"mean absolute error:{mae}",
    f"r2 score:{r2}",
    sep="\n",
)

root mean squared error:7.183227231512913
mean absolute error:5.554225239423484
r2 score:-0.0186647221458085


In [81]:
# Of the four pipelines evaluated above, linear regression recorded the lowest
# RMSE and MAE and the highest R^2, so keep the *fitted* pipeline as the chosen
# model rather than constructing a new, untrained estimator.
best_model = lr_pipeline
# best_model[-1] is the pipeline's final step, i.e. the regressor itself.
print(f"best results are with {best_model[-1]}")

best results are with LinearRegression()


In [82]:
# Persist the linear-regression pipeline (preprocessing + model) to disk.
# NOTE(review): consider moving this import up to the notebook's import cell.
import joblib

joblib.dump(value=lr_pipeline, filename="regressor.pkl")

['regressor.pkl']