<a href="https://colab.research.google.com/github/sajosam/deep_learning/blob/main/ANN_Regression_Insurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [2]:
# Read in the insurance dataset
insurance = pd.read_csv("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv")

In [3]:

# Check out the insurance dataset
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
insurance.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
# to display the unique values in sex,smoker,region columns
print(insurance['sex'].unique())
print(insurance['smoker'].unique())
print(insurance['region'].unique())

['female' 'male']
['yes' 'no']
['southwest' 'southeast' 'northwest' 'northeast']


In [6]:

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Create column transformer (this will help us normalize/preprocess our data)
ct = make_column_transformer(
    (MinMaxScaler(), ["age", "bmi", "children"]), # get all values between 0 and 1
    (OneHotEncoder(handle_unknown="ignore"), ["sex", "smoker", "region"])
)

# Create X & y
X = insurance.drop("charges", axis=1)
y = insurance["charges"]

# Build our train and test sets (use random state to ensure same split as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit column transformer on the training data only (doing so on test data would result in data leakage)
ct.fit(X_train)

# Transform training and test data with normalization (MinMaxScalar) and one hot encoding (OneHotEncoder)
X_train_normal = ct.transform(X_train)
X_test_normal = ct.transform(X_test)

In [7]:
# Non-normalized and non-one-hot encoded data example
X_train.loc[0]

age                19
sex            female
bmi              27.9
children            0
smoker            yes
region      southwest
Name: 0, dtype: object

In [8]:
# Normalized and one-hot encoded example
X_train_normal[0]

array([0.60869565, 0.10734463, 0.4       , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        ])

In [9]:
# Notice the normalized/one-hot encoded shape is larger because of the extra columns
X_train_normal.shape, X_train.shape

((1070, 11), (1070, 6))

In [10]:
from tensorflow.keras.layers import Dense , InputLayer , Dropout
from tensorflow import keras

In [11]:
# Build the model
insurance_model= tf.keras.Sequential()
insurance_model.add(Dense(100, input_shape=(11,),activation='relu'))
insurance_model.add(Dense(70,activation='relu'))
insurance_model.add(Dense(40,activation='relu'))
insurance_model.add(Dense(30,activation='relu'))
insurance_model.add(Dense(1,activation='linear'))


# Compile the model
# insurance_model.compile(loss=tf.keras.losses.MeanSquaredError(reduction="auto", name="mean_squared_error"),
#                           optimizer=tf.keras.optimizers.Adam,
#                           metrics=['mae'])
insurance_model.compile(optimizer='adam', loss=tf.keras.losses.MeanAbsoluteError(),metrics='mae')



# Fit the model for 200 epochs (same as insurance_model)
insurance_model.fit(X_train_normal, y_train,batch_size=10, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7fc0c2f932d0>

In [12]:
y_pred=insurance_model.predict(X_test_normal)

In [13]:
y_test=np.array(y_test)

In [14]:
y_test.reshape(-1,1)

array([[ 9095.06825 ],
       [ 5272.1758  ],
       [29330.98315 ],
       [ 9301.89355 ],
       [33750.2918  ],
       [ 4536.259   ],
       [ 2117.33885 ],
       [14210.53595 ],
       [ 3732.6251  ],
       [10264.4421  ],
       [18259.216   ],
       [ 7256.7231  ],
       [ 3947.4131  ],
       [46151.1245  ],
       [48673.5588  ],
       [44202.6536  ],
       [ 9800.8882  ],
       [42969.8527  ],
       [ 8233.0975  ],
       [21774.32215 ],
       [ 5080.096   ],
       [ 7441.501   ],
       [ 1256.299   ],
       [ 2755.02095 ],
       [11085.5868  ],
       [10923.9332  ],
       [12644.589   ],
       [18804.7524  ],
       [ 9715.841   ],
       [ 1131.5066  ],
       [15828.82173 ],
       [11842.62375 ],
       [ 2020.5523  ],
       [ 5693.4305  ],
       [ 2904.088   ],
       [ 7448.40395 ],
       [ 2597.779   ],
       [ 7337.748   ],
       [23887.6627  ],
       [38709.176   ],
       [ 4687.797   ],
       [ 2643.2685  ],
       [11674.13    ],
       [121

In [15]:

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 9214.24414062  9095.06825   ]
 [ 5287.19921875  5272.1758    ]
 [29053.57421875 29330.98315   ]
 [ 9279.47558594  9301.89355   ]
 [30756.08984375 33750.2918    ]
 [ 4540.89355469  4536.259     ]
 [ 2130.3034668   2117.33885   ]
 [14174.7421875  14210.53595   ]
 [ 3732.67773438  3732.6251    ]
 [10307.96386719 10264.4421    ]
 [16594.11523438 18259.216     ]
 [ 7248.61376953  7256.7231    ]
 [ 3953.76489258  3947.4131    ]
 [46304.30859375 46151.1245    ]
 [48582.46875    48673.5588    ]
 [44400.3984375  44202.6536    ]
 [ 9796.81152344  9800.8882    ]
 [43142.65234375 42969.8527    ]
 [ 8271.85644531  8233.0975    ]
 [21963.66601562 21774.32215   ]
 [ 5197.87353516  5080.096     ]
 [ 7438.49951172  7441.501     ]
 [ 1268.51086426  1256.299     ]
 [ 2813.83007812  2755.02095   ]
 [11119.96484375 11085.5868    ]
 [11002.15722656 10923.9332    ]
 [12680.7734375  12644.589     ]
 [ 5179.42724609 18804.7524    ]
 [ 9807.5078125   9715.841     ]
 [ 1141.37561035  1131.5066    ]
 [ 8375.34

In [16]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8771352577825059