In [None]:
import os, math, numpy as np, pandas as pd, matplotlib.pyplot as plt
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="seaborn.axisgrid")
# ML models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
from xgboost import XGBRegressor
# Deep Learning
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Normalization, Dropout, BatchNormalization
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Load the Kaggle medical-insurance table; every later cell derives from `df`.
df = pd.read_csv("/kaggle/input/medical-insurance-price-prediction/Medical_insurance.csv")

# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.head())

# Quick exploration of target distribution
df["charges"].plot(kind="hist", bins=40, title="Distribution of Medical Charges")
plt.show()

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
dataset = df.copy()

# Only the final expression of a cell is echoed by the notebook, so the tail
# has to be printed explicitly; otherwise its result is silently discarded.
print(dataset.tail())

# Missing-value count per column (displayed as the cell output).
dataset.isna().sum()

In [None]:
# Reproducible 80/20 split: draw the training rows at random with a fixed seed,
# then keep every row whose index label was not drawn as the hold-out set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.loc[~dataset.index.isin(train_dataset.index)]

In [None]:
# Explore pairwise relationships on the training split only, so that nothing
# about the held-out rows informs modelling decisions (the summary below is
# computed on the same rows).
sns.pairplot(
    train_dataset[['age', 'bmi', 'children', 'charges']],
    diag_kind='kde'
)

# Summary statistics of the training rows (shown as the cell output).
train_dataset.describe().transpose()

In [None]:
# Separate the regression target ('charges') from the predictors of each split.
train_labels = train_dataset['charges'].copy()
test_labels = test_dataset['charges'].copy()

train_features = train_dataset.drop(columns='charges')
test_features = test_dataset.drop(columns='charges')

In [None]:
# Mean and standard deviation of each numeric training column (one row per column).
train_dataset.describe().loc[['mean', 'std']].transpose()

In [None]:
# One-hot encode categorical columns.
# The category levels are taken from the training split and applied to both
# splits. Encoding each split independently with drop_first=True can yield
# different columns (or a different dropped reference level) whenever a level
# is absent from one split, which silently corrupts the model input.
categorical_levels = {
    column: pd.Categorical(train_features[column]).categories
    for column in train_features.select_dtypes(exclude=["number", "bool"]).columns
}


def _one_hot_encode(frame):
    """Return `frame` one-hot encoded with the training levels, as float32.

    Values that are not among the training levels become missing in the
    categorical column and therefore encode as all zeros, i.e. like the dropped
    reference level.
    """
    typed = frame.copy()
    for column, levels in categorical_levels.items():
        typed[column] = pd.Categorical(typed[column], categories=levels)
    # Convert everything to float32
    return pd.get_dummies(typed, drop_first=True).astype(np.float32)


train_features = _one_hot_encode(train_features)
test_features = _one_hot_encode(test_features)

# Feature-wise normalization layer, adapted on the training rows only.
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(train_features.to_numpy())

print("Mean values learned by normalizer:")
print(normalizer.mean.numpy())

In [None]:
# Show one training row before and after it passes through the normalizer.
first = train_features.iloc[:1].to_numpy()
normalized_first = normalizer(first).numpy()

with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print()
    print('Normalized:', normalized_first)

In [None]:
# x/y aliases for the prepared splits (plain name bindings, no copies are made).
x_train = train_features
x_test = test_features
y_train = train_labels
y_test = test_labels

In [None]:
# Baseline: linear regression on the single feature "bmi".
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run.
keras.utils.set_random_seed(0)

bmi = np.array(train_features['bmi'])
bmi_test = np.array(test_features['bmi'])

bmi_normalizer = layers.Normalization(input_shape=[1,], axis=None)
bmi_normalizer.adapt(bmi)

# Adam changes each weight by roughly `learning_rate` per step, so a bare
# Dense(1) that starts near zero cannot reach targets far from zero within
# 100 epochs (charges are assumed to be in the thousands -- confirm against the
# describe() output). The Rescaling layer has no trainable weights: it maps a
# standardised prediction back to charge units, so Dense(1) learns at unit scale
# while loss and metrics are still reported in charges.
bmi_model = keras.Sequential([
    bmi_normalizer,
    layers.Dense(units=1),
    layers.Rescaling(scale=float(train_labels.std()),
                     offset=float(train_labels.mean()))
])

bmi_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.1),
                 loss="mean_absolute_error",
                 metrics=['mae'])

# Train on the prepared array (same values as train_features['bmi']).
history_bmi = bmi_model.fit(bmi,
                           train_labels,
                           epochs = 100,
                           validation_split = 0.2,
                           verbose = 0)

# Evaluate on the held-out rows; evaluate() returns [loss, mae].
bmi_eval = bmi_model.evaluate(bmi_test, test_labels, verbose = 1)
print("Linear Single Feature (BMI) MAE: ", bmi_eval[1])

In [None]:
def plot_bmi_loss(history):
    """Plot training and validation loss per epoch for the BMI-only linear model.

    `history` is the object returned by `fit`; its `history` mapping must hold
    the 'loss' and 'val_loss' series.
    """
    curves = history.history
    for key, legend_text in (('loss', 'Training Loss'),
                             ('val_loss', 'Validation Loss')):
        plt.plot(curves[key], label=legend_text)
    plt.title("Linear Regression (Single Feature: BMI) Training History")
    plt.xlabel('Epoch')
    plt.ylabel('Error [Charges]')
    plt.grid(True)
    plt.legend()
    plt.show()

# Render the learning curves of the BMI-only linear model.
plot_bmi_loss(history=history_bmi)

In [None]:
# ======================
# 1. Prepare Data
# ======================
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run.
keras.utils.set_random_seed(0)

numeric_train = np.array(train_features, dtype=np.float32)
numeric_test  = np.array(test_features, dtype=np.float32)

# Normalizer for all numeric features
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(numeric_train)

# ======================
# 2. Define Model
# ======================
# Adam changes each weight by roughly `learning_rate` per step, so a bare
# Dense(1) that starts near zero cannot reach targets far from zero within
# 100 epochs (charges are assumed to be in the thousands -- confirm against the
# describe() output). The Rescaling layer has no trainable weights: it maps a
# standardised prediction back to charge units, so Dense(1) learns at unit scale
# while loss and metrics are still reported in charges.
multi_model = keras.Sequential([
    normalizer,
    layers.Dense(units=1),  # linear regression output
    layers.Rescaling(scale=float(train_labels.std()),
                     offset=float(train_labels.mean()))
])

multi_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.1),
    loss="mean_absolute_error",
    metrics=['mae']
)

# ======================
# 3. Train
# ======================
history_multi = multi_model.fit(
    numeric_train,
    train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=0
)

# ======================
# 4. Evaluate
# ======================
# evaluate() returns [loss, mae]; both are MAE in charge units here.
multi_eval = multi_model.evaluate(numeric_test, test_labels, verbose=1)
print("Multiple Linear Regression MAE:", multi_eval[1])

# ======================
# 5. Plot Training Loss
# ======================
def plot_loss(history, title):
    """Plot training and validation loss per epoch and show the figure.

    Args:
        history: object returned by `fit`; `history.history` must contain the
            'loss' and 'val_loss' series.
        title: text used as the figure title.
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('MAE')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    # Finish the figure here, as the other plot helpers in this notebook do, so
    # that later pyplot calls do not keep drawing onto these axes.
    plt.show()

# Learning curves of the multi-feature linear model.
plot_loss(history=history_multi, title="Multiple Linear Regression Training")


In [None]:
# DNN model (single feature: BMI)
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run; otherwise the reported MAE changes on
# every execution.
keras.utils.set_random_seed(0)

# Two hidden ReLU layers on top of the BMI normalizer adapted in an earlier cell.
bmi_dnn_model = keras.Sequential([
    bmi_normalizer,
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])
# Compile model
bmi_dnn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mae",
    metrics=['mae']
)

# Train model
history_bmi_dnn = bmi_dnn_model.fit(
    bmi,
    train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=0
)
# Evaluate on the held-out rows; evaluate() returns [loss, mae].
bmi_dnn_eval = bmi_dnn_model.evaluate(np.array(test_features['bmi']), test_labels, verbose=1)
print("DNN (Single Feature: BMI) MAE:", bmi_dnn_eval[1])

def plot_bmi_dnn_loss(history):
    """Plot training and validation loss per epoch for the BMI-only DNN.

    `history` is the object returned by `fit`; its `history` mapping must hold
    the 'loss' and 'val_loss' series.
    """
    curves = history.history
    for key, legend_text in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
        plt.plot(curves[key], label=legend_text)
    plt.title("DNN (Single Feature: BMI)")
    plt.xlabel('Epoch')
    plt.ylabel('MAE [Charges]')
    plt.grid(True)
    plt.legend()
    plt.show()

# Learning curves of the BMI-only DNN.
plot_bmi_dnn_loss(history=history_bmi_dnn)

In [None]:
# DNN - Multiple Features
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run; otherwise the reported MAE changes on
# every execution.
keras.utils.set_random_seed(0)

# Define model: all-feature normalizer followed by two hidden ReLU layers.
dnn_model = keras.Sequential([
    normalizer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])
# Compile
dnn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mae",
    metrics=['mae']
)

# Train
history_dnn = dnn_model.fit(
    train_features, train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=0
)

# Evaluate on the held-out rows; evaluate() returns [loss, mae].
dnn_eval = dnn_model.evaluate(test_features, test_labels, verbose=1)
print("DNN (Multiple Features) MAE:", dnn_eval[1])

# Learning-curve helper for the multi-feature DNN
def plot_dnn_loss(history):
    """Plot training and validation loss per epoch for the multi-feature DNN.

    `history` is the object returned by `fit`; its `history` mapping must hold
    the 'loss' and 'val_loss' series.
    """
    curves = history.history
    for key, legend_text in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
        plt.plot(curves[key], label=legend_text)
    plt.title("DNN (Multiple Features)")
    plt.xlabel('Epoch')
    plt.ylabel('MAE [Charges]')
    plt.grid(True)
    plt.legend()
    plt.show()

# Learning curves of the multi-feature DNN.
plot_dnn_loss(history=history_dnn)

In [None]:
# Visualization of predictions: actual vs. predicted charges on the test split
y_pred = dnn_model.predict(test_features)

# The reference diagonal runs from 0 to the largest actual or predicted value.
upper_bound = max(test_labels.max(), y_pred.max())
lims = [0, upper_bound]

plt.figure(figsize=(8, 6))
plt.scatter(test_labels, y_pred, alpha=0.6)
plt.plot(lims, lims, 'r--', label="Perfect Prediction")

plt.title("DNN (Multiple Features) - Actual vs Predicted Charges")
plt.xlabel("Actual Charges")
plt.ylabel("Predicted Charges")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Random-forest baseline: fit on the training split, score by MAE on the test split.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42).fit(
    train_features, train_labels
)

rf_pred = rf_model.predict(test_features)
rf_mae = mean_absolute_error(y_true=test_labels, y_pred=rf_pred)

print("Random Forest MAE:", rf_mae)

In [None]:
# Gradient-boosted trees baseline: fit on the training split, score by MAE on the test split.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(train_features, train_labels)

xgb_pred = xgb_model.predict(test_features)
xgb_mae = mean_absolute_error(y_true=test_labels, y_pred=xgb_pred)

print("XGBoost MAE:", xgb_mae)

In [None]:
# Performance comparison: test-set MAE of every model trained above.
# Each *_eval list is [loss, mae], so index 1 picks the MAE metric.
results = {}
results["Linear (BMI only)"] = bmi_eval[1]
results["Linear (Multiple Features)"] = multi_eval[1]
results["DNN (BMI only)"] = bmi_dnn_eval[1]
results["DNN (Multiple Features)"] = dnn_eval[1]
results["Random Forest"] = rf_mae
results["XGBoost"] = xgb_mae

# One row per model, single 'MAE' column.
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['MAE'])
print(results_df)

# Bar chart of the same numbers, styled through the Axes returned by pandas.
ax = results_df.plot(kind='bar', figsize=(9,6), legend=False)
ax.set_ylabel("MAE [Charges]")
ax.set_title("Model Performance Comparison")
ax.grid(axis='y')
plt.show()