In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



In [2]:
# Read the saved training / evaluation feature matrices and target vectors
# from their .npy files in one go (same load order as the file list below).
X_train, X_eval, y_train, y_eval = (
    np.load("../data/X_train.npy"),
    np.load("../data/X_eval.npy"),
    np.load("../data/y_train.npy"),
    np.load("../data/y_eval.npy"),
)

# Display the shapes of the two feature matrices as a quick sanity check.
(X_train.shape, X_eval.shape)


((700000, 32), (200000, 32))

In [3]:
# Ridge baseline: build and fit in one expression (fit() hands back the
# estimator), then predict on the evaluation features.
ridge = Ridge(alpha=1.0, random_state=42).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_eval)

# Print R2, MAE and RMSE of the Ridge predictions against y_eval.
print("Ridge Regression Performance")
for metric_label, metric_value in (
    ("R2:", r2_score(y_eval, y_pred_ridge)),
    ("MAE:", mean_absolute_error(y_eval, y_pred_ridge)),
    ("RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_ridge))),
):
    print(metric_label, metric_value)


Ridge Regression Performance
R2: 0.9940612888330873
MAE: 275.67698588252745
RMSE: 340.46869660035384


  ret = a @ b
  ret = a @ b
  ret = a @ b
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [4]:
# Count infinite and NaN entries in the training features (both should be 0).
np.sum(np.isinf(X_train)), np.sum(np.isnan(X_train))


(np.int64(0), np.int64(0))

In [5]:
# Configure (but do not yet fit) a linear regressor trained with
# stochastic gradient descent.
sgd = SGDRegressor(
    random_state=42,             # fixed seed for repeatable runs
    max_iter=2000,               # upper bound on passes over the data
    tol=1e-3,                    # stopping tolerance
    learning_rate="invscaling",  # step-size schedule
    eta0=0.01,                   # initial step size
    loss="squared_error",        # least-squares objective
    penalty="l2",                # ridge-like weight penalty
    alpha=0.0001,                # multiplier on the penalty term
)


In [6]:
# Train the SGD regressor on the training split.
sgd.fit(X=X_train, y=y_train)


In [7]:
# Predict targets for the evaluation features with the fitted SGD model.
y_pred_sgd = sgd.predict(X=X_eval)


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [8]:
# Report R2, MAE and RMSE of the SGD predictions against y_eval.
# RMSE is derived from sklearn's mean_squared_error, exactly as in the Ridge
# cell, instead of a hand-written (y_eval - y_pred) ** 2 expression: the
# metric function checks that both arrays line up, whereas the manual form
# would silently broadcast if one of them were a column vector.
print("SGD Regressor Performance")
print("R2:", r2_score(y_eval, y_pred_sgd))
print("MAE:", mean_absolute_error(y_eval, y_pred_sgd))
print("RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_sgd)))


SGD Regressor Performance
R2: 0.9940586802333544
MAE: 275.7189976669356
RMSE: 340.5434644259881


In [9]:
# Count NaN and infinite values among the SGD predictions (both should be 0).
np.sum(np.isnan(y_pred_sgd)), np.sum(np.isinf(y_pred_sgd))


(np.int64(0), np.int64(0))

In [10]:
# Configure (but do not yet fit) the random forest regressor.
rf = RandomForestRegressor(
    random_state=42,         # fixed seed for repeatable runs
    n_jobs=-1,               # use every available core
    n_estimators=200,        # size of the ensemble (tree count)
    max_depth=18,            # cap tree depth to limit overfitting
    min_samples_leaf=5,      # smallest leaf allowed
    min_samples_split=10,    # smallest node that may still be split
)


In [11]:
# Train the random forest on the training split.
rf.fit(X=X_train, y=y_train)


In [12]:
# Predict targets for the evaluation features with the fitted random forest.
y_pred_rf = rf.predict(X=X_eval)


In [13]:
# Report R2, MAE and RMSE of the random forest predictions against y_eval.
# RMSE is derived from sklearn's mean_squared_error, exactly as in the Ridge
# cell, instead of a hand-written (y_eval - y_pred) ** 2 expression: the
# metric function checks that both arrays line up, whereas the manual form
# would silently broadcast if one of them were a column vector.
print("Random Forest Performance")
print("R2:", r2_score(y_eval, y_pred_rf))
print("MAE:", mean_absolute_error(y_eval, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_eval, y_pred_rf)))


Random Forest Performance
R2: 0.9905841045847578
MAE: 338.30010766689577
RMSE: 428.7083075290198


In [14]:
# Count NaN and infinite values among the forest predictions (both should be 0).
np.sum(np.isnan(y_pred_rf)), np.sum(np.isinf(y_pred_rf))


(np.int64(0), np.int64(0))

In [15]:
# Rank the fitted random forest's features by importance.
# (pandas is imported with the other dependencies in the first cell.)
importances = rf.feature_importances_

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# preprocessor artifacts that this project produced itself.
preprocessor = joblib.load("../models/preprocessor.pkl")

# Raw input columns handed to each branch of the preprocessor.
num_features = ['age', 'bmi', 'children']
cat_features = [
    'gender', 'smoker', 'region', 'medical_history',
    'family_medical_history', 'exercise_frequency',
    'occupation', 'coverage_level'
]

# Rebuild the transformed column names: the 'num' transformer's names followed
# by the one-hot encoder's expanded names.
# Assumes the preprocessor emits the 'num' block before the 'cat' block;
# verify against how the ColumnTransformer was built.
num_feature_names = (
    preprocessor.named_transformers_['num']
    .get_feature_names_out(num_features)
    .tolist()
)
cat_feature_names = (
    preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(cat_features)
    .tolist()
)
feature_names = num_feature_names + cat_feature_names

# Fail with an explicit message if the hand-assembled names do not cover the
# columns the model was trained on (e.g. the preprocessor artifact is stale).
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of model importances ({len(importances)}); the preprocessor and the "
        "trained model are out of sync."
    )

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Show the ten most important features.
feature_importance_df.head(10)


Unnamed: 0,feature,importance
6,smoker_no,0.201755
30,coverage_level_Premium,0.183819
13,medical_history_Heart disease,0.155696
17,family_medical_history_Heart disease,0.15514
7,smoker_yes,0.120969
20,exercise_frequency_Frequently,0.021649
31,coverage_level_Standard,0.017559
29,coverage_level_Basic,0.017256
16,family_medical_history_Diabetes,0.013439
1,bmi,0.012138


In [16]:
# Persist the fitted random forest as the project's insurance model artifact.
# NOTE(review): the Ridge and SGD cells above report better R2 / MAE / RMSE on
# the evaluation split than the random forest; confirm that saving `rf` is
# the intended choice.
joblib.dump(value=rf, filename="../models/insurance_model.pkl")


['../models/insurance_model.pkl']