<a href="https://colab.research.google.com/github/salahAlawieh/Machine-Learning-with-Python/blob/main/fcc_predict_health_costs_with_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 - Import Libraries

# Install the TensorFlow Docs helper package (source of the `tensorflow_docs`
# imports below) directly from GitHub.
# NOTE(review): the install is unpinned, so a fresh runtime may pull a different
# revision each time - consider pinning a commit for reproducibility.
!pip install -q git+https://github.com/tensorflow/docs

# Common data handling and visualization libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Request TensorFlow 2.x via the Colab line magic. Any exception raised by the
# magic is deliberately ignored (best-effort) - presumably so the notebook can
# still run where the magic is unavailable; TODO confirm.
try:
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow and Keras for building and training models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# TensorFlow Docs utilities (plotting / modeling helpers); the submodules are
# imported explicitly so they are reachable as tfdocs.plots / tfdocs.modeling.
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling


In [None]:
# Cell 2 - Load Dataset

# Download dataset
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv

# Load dataset into pandas DataFrame
dataset = pd.read_csv('insurance.csv')

# Preview last 5 rows
dataset.tail()

In [None]:
# Cell 3 - Inspect Dataset

# First rows of the table, rendered with the rich DataFrame display
head_rows = dataset.head()
print("Sample records:")
display(head_rows)

# Column dtypes and non-null counts (info() prints directly)
print("\nDataset Info:")
dataset.info()

# Descriptive statistics for the numeric columns
numeric_summary = dataset.describe()
print("\nSummary statistics:")
display(numeric_summary)

# Number of missing entries in each column
missing_per_column = dataset.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

In [None]:
# Cell 4 - Encode Categorical Data

# One-hot encode the categorical columns (sex, smoker, region), dropping the
# first level of each so the dummy columns are not redundant.
#
# Only columns still present are encoded: after the first run the originals
# have been replaced by their dummy columns, and asking for them again would
# raise a KeyError. The guard makes this cell safe to re-run.
categorical_columns = [
    column for column in ['sex', 'smoker', 'region'] if column in dataset.columns
]
if categorical_columns:
    # dtype=float is explicit because pandas >= 2.0 produces bool dummy columns
    # by default. A frame mixing bool and float columns converts to an
    # object-dtype NumPy array, which some TensorFlow/Keras versions cannot
    # turn into a tensor when the model is trained.
    dataset = pd.get_dummies(
        dataset,
        columns=categorical_columns,
        drop_first=True,
        dtype=float,
    )

# Display dataset info and sample
print("Updated dataset info after encoding:")
dataset.info()
print("\nSample of encoded dataset:")
display(dataset.head())

In [None]:
# Cell 5 - Split Data and Separate Labels

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
train_dataset, test_dataset = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

# Remove the target column ('expenses') from each feature frame and keep it
# as the corresponding label series.
train_labels, test_labels = train_dataset.pop('expenses'), test_dataset.pop('expenses')

# Report the shape of every piece
shape_report = [
    ("Training set shape:", train_dataset),
    ("Testing set shape:", test_dataset),
    ("\nTraining labels shape:", train_labels),
    ("Testing labels shape:", test_labels),
]
for caption, piece in shape_report:
    print(caption, piece.shape)

In [None]:
# Cell 6 - Normalize Numeric Features

from sklearn.preprocessing import StandardScaler

# Columns that get standardized
numeric_features = ['age', 'bmi', 'children']

# Fit the scaler on the training split only, so no statistics from the test
# split leak into preprocessing.
scaler = StandardScaler().fit(train_dataset[numeric_features])

# Apply the fitted transform to both splits
for split_frame in (train_dataset, test_dataset):
    split_frame[numeric_features] = scaler.transform(split_frame[numeric_features])

# Preview normalized dataset
print("Sample of normalized training dataset:")
display(train_dataset.head())

In [None]:
# Cell 7 - Build Improved Neural Network Model

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation (and the shuffling done during training) is repeatable across
# "Restart & Run All", up to any hardware-level nondeterminism.
keras.utils.set_random_seed(42)

# Fully connected regression network: two ReLU hidden layers to capture
# nonlinear relationships, and one linear output unit for the predicted expense.
# The input width is declared with an explicit keras.Input, replacing the
# `input_shape=` argument on the first Dense layer, which current Keras
# releases deprecate for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(train_dataset.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),  # Output layer for regression
])

# Compile: optimise mean squared error, report mean absolute error
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

# Display summary
model.summary()

In [None]:
# Cell 8 - Train Improved Neural Network Model

# Stop once validation loss has not improved for 20 epochs and roll the
# weights back to the best epoch seen.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
)

# Fit on the training split, holding out 20% of it for validation
fit_options = dict(
    validation_split=0.2,
    epochs=500,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(train_dataset, train_labels, **fit_options)

# Learning curves: training vs. validation MAE per epoch
fig, ax = plt.subplots(figsize=(8, 5))
for metric_key, curve_label in (('mae', 'Train MAE'), ('val_mae', 'Validation MAE')):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('MAE')
ax.legend()
ax.set_title('Training and Validation MAE')
plt.show()

In [None]:
# Cell 9 - Evaluate Improved Model on Test Set

# Evaluate model on test set (returns the compiled loss and the MAE metric)
test_loss, test_mae = model.evaluate(test_dataset, test_labels, verbose=2)
print(f"\nTest Mean Absolute Error (MAE): {test_mae:.2f}")

# Make predictions; flatten the (n, 1) model output to a 1-D array
test_predictions = model.predict(test_dataset).flatten()

# Derive the axis limits from the data instead of a fixed [0, 50000] window,
# which would silently hide any actual or predicted value outside that range.
# The lower bound stays at 0 unless the model predicts negative expenses; the
# upper bound gets 5% headroom so the largest point is not drawn on the frame.
lower_limit = min(0.0, float(test_predictions.min()))
upper_limit = 1.05 * max(float(test_labels.max()), float(test_predictions.max()))
lims = [lower_limit, upper_limit]

# Plot predicted vs actual; the dashed diagonal marks perfect predictions
plt.figure(figsize=(8,6))
plt.scatter(test_labels, test_predictions, alpha=0.6)
plt.xlabel('Actual Expenses')
plt.ylabel('Predicted Expenses')
plt.title('Actual vs Predicted Healthcare Expenses')
plt.xlim(lims)
plt.ylim(lims)
plt.plot(lims, lims, color='red', linestyle='--')
plt.show()

In [None]:
# Cell 10 - Test Challenge Submission (Fixed)

# Score the model on the held-out test split
loss, mae = model.evaluate(test_dataset, test_labels, verbose=2)
print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# The challenge is passed when the test MAE is below 3500
challenge_passed = mae < 3500
print(
    "You passed the challenge. Great job!"
    if challenge_passed
    else "The Mean Abs Error must be less than 3500. Keep trying."
)

# Predictions for the scatter plot (model output flattened to 1-D)
test_predictions = model.predict(test_dataset).flatten()

# Predicted-vs-true scatter on equal-aspect axes; the dashed diagonal marks
# perfect predictions.
lims = [0, 50000]
plt.figure(figsize=(8,6))
a = plt.axes(aspect='equal')
a.scatter(test_labels, test_predictions, alpha=0.6)
a.set_xlabel('True values (expenses)')
a.set_ylabel('Predictions (expenses)')
a.set_title('Predicted vs Actual Expenses')
a.set_xlim(lims)
a.set_ylim(lims)
a.plot(lims, lims, color='red', linestyle='--')
plt.show()