<a href="https://colab.research.google.com/github/ramoneas/FCC-ML-Challenge/blob/main/fcc_predict_health_costs_with_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
def apply_categorical_encoder(feature):
  """One-hot encode a categorical column with a Keras ``StringLookup`` layer.

  The lookup vocabulary is learned (``adapt``) from the very same values that
  are then encoded, so every value passed in is part of the vocabulary.

  Args:
    feature: Object exposing a ``.values`` array of category strings
      (called below with pandas Series such as ``dataset.sex``).

  Returns:
    A ``tf.float32`` tensor holding the one-hot encoding of each entry.
    NOTE(review): with the layer's default settings the encoding presumably
    also reserves one out-of-vocabulary slot -- confirm against the Keras docs.

  Note:
    One-hot is meant for low-cardinality features; consider switching to an
    embedding layer only when a feature has 100+ categories.
  """
  raw_values = tf.constant(feature.values)

  encoder = tf.keras.layers.StringLookup(output_mode="one_hot")
  encoder.adapt(raw_values)  # build the vocabulary from the column itself

  one_hot = encoder(raw_values)
  return tf.cast(one_hot, tf.float32)

Make sure to convert categorical data to numbers.

In [None]:
# One-hot encode each categorical column, in the order sex, smoker, region.
sex_encoded, smoker_encoded, region_encoded = (
    apply_categorical_encoder(dataset[column_name])
    for column_name in ("sex", "smoker", "region")
)

In [None]:
# Numeric feature columns as float32 tensors.
age = tf.constant(dataset["age"].values, dtype=tf.float32)
bmi = tf.constant(dataset["bmi"].values, dtype=tf.float32)
children = tf.constant(dataset["children"].values, dtype=tf.float32)

# Target column: built with its inferred dtype first, then cast to float32
# (per the original note, this column loads as float64).
expenses = tf.cast(tf.constant(dataset["expenses"].values), tf.float32)

# Stack the numeric columns side by side into a single matrix
# (one row per record, one column per feature).
numerical_data = tf.stack((age, bmi, children), axis=1)

In [None]:
# Concatenate everything column-wise: numeric features first, then the one-hot
# blocks for sex, smoker and region.
features = tf.concat(
    [numerical_data, sex_encoded, smoker_encoded, region_encoded],
    axis=1,
)
# Append the target as the last column so rows can be shuffled together with
# their label.
encoded_dataset = tf.concat([features, expenses[:, tf.newaxis]], axis=1)

Use 80% of the data as the `train_dataset` and 20% of the data as the `test_dataset`.

In [None]:
# Shuffle the rows before splitting so both partitions are drawn from the whole
# table. An op-level `seed` on its own is not enough for a repeatable split:
# TensorFlow combines it with the global seed and an internal counter, so
# running this cell a second time in the same kernel would yield a different
# permutation. Resetting the global seed right before the shuffle makes every
# run of this cell produce the same train/test split.
tf.random.set_seed(42)
df = tf.random.shuffle(encoded_dataset, seed=42)

# 80/20 split: the first 80% of the shuffled rows train, the rest test.
train_size = int(0.8 * len(df))
train_dataset = df[:train_size]
test_dataset = df[train_size:]


`pop` off the "expenses" column from these datasets to create new datasets called `train_labels` and `test_labels`. Use these labels when training your model.

In [None]:
# The last column of the shuffled table `df` is the target ("expenses").
# Labels take that column; the feature matrices must NOT keep it, otherwise
# the model is trained and evaluated on inputs that already contain the answer
# (label leakage). Everything is sliced from `df` with the same `train_size`
# boundary used for the 80/20 split, so re-running this cell is idempotent
# (it never strips a second column).
train_labels = df[:train_size, -1]
test_labels = df[train_size:, -1]
train_dataset = df[:train_size, :-1]
test_dataset = df[train_size:, :-1]

Create a model and train it with the `train_dataset`. Run the final cell in this notebook to check your model. The final cell will use the unseen `test_dataset` to check how well the model generalizes.

In [None]:
input_shape = train_dataset[0].shape[0] # Number of input columns.

# The input columns live on very different scales (ages, BMI values, 0/1
# one-hot flags). Standardise them inside the model so each column contributes
# comparably; the statistics are computed from the training rows only.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(train_dataset)

# A single linear unit cannot represent interactions between features, and with
# Adam's bounded per-step updates it cannot reach the weight magnitudes needed
# for targets in the tens of thousands within 100 epochs. Two small ReLU hidden
# layers address both.
# NOTE(review): layer sizes and learning rate are a reasonable starting point,
# not a tuned result -- confirm the test MAE after training.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_shape,)),
    normalizer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Metrics stay exactly ['mae', 'mse'] in this order: the evaluation cell
# unpacks `model.evaluate(...)` into (loss, mae, mse).
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.01),
    loss='mse',
    metrics=['mae', 'mse']
)

In [None]:
# Train on the training split for 100 passes over the data.
model.fit(x=train_dataset, y=train_labels, epochs=100)

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# The three-way unpacking below requires `evaluate` to return exactly three
# values: the loss followed by the two compiled metrics (mae, mse, in that order).
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass/fail threshold of the challenge: mean absolute error below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# Predictions are flattened to one dimension so they pair up with test_labels.
test_predictions = model.predict(test_dataset).flatten()

# Equal aspect ratio plus identical x/y limits make the diagonal drawn below
# the "prediction == true value" line; points near it are accurate predictions.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Assigning to `_` keeps the list of Line2D objects out of the cell output.
_ = plt.plot(lims,lims)
