# Model Training

In [1]:
import sys
import warnings
import pandas as pd
import numpy as np
import joblib
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import clone

sys.path.append("../../")

warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)



## Load Data

In [2]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Create Training and Test Sets

In [3]:
mask = np.random.rand(len(df)) < 0.8

training_set = df[mask]

testing_set = df[~mask]

print(training_set.shape)
print(testing_set.shape)

(1079, 7)
(259, 7)


In [5]:
# save training and test sets to be used later
training_set.to_csv("training_set.csv")
testing_set.to_csv("testing_set.csv")

In [6]:
# separating the feature columns from the target column
feature_columns = ["age", "sex", "bmi", "children", "smoker", "region"]
target_column = "charges"

X_train = training_set[feature_columns]
y_train = training_set[target_column]

X_test = testing_set[feature_columns]
y_test = testing_set[target_column]

## Apply the Preprocessing


In [7]:
# loading the preprocessing pipeline we built in the previous notebook
transformer = joblib.load("transformer.joblib")

In [8]:
# applying the column transformer
features = transformer.fit_transform(X_train)

features

array([[19.  , 27.9 ,  0.  , ...,  1.  ,  0.  ,  3.  ],
       [18.  , 33.77,  1.  , ...,  0.  ,  1.  ,  2.  ],
       [28.  , 33.  ,  3.  , ...,  0.  ,  1.  ,  2.  ],
       ...,
       [18.  , 36.85,  0.  , ...,  0.  ,  0.  ,  2.  ],
       [21.  , 25.8 ,  0.  , ...,  0.  ,  0.  ,  3.  ],
       [61.  , 29.07,  0.  , ...,  1.  ,  0.  ,  1.  ]])

## Find an Optimal Pipeline

In [9]:
tpot_regressor = TPOTRegressor(generations=50,
                               population_size=50,
                               random_state=42,
                               cv=5,
                               n_jobs=8,
                               verbosity=2,
                               early_stop=10)

In [10]:
tpot_regressor = tpot_regressor.fit(features, y_train)

Optimization Progress:   0%|          | 0/2550 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -20607315.220001213

Generation 2 - Current best internal CV score: -20607315.220001213

Generation 3 - Current best internal CV score: -20607315.220001213

Generation 4 - Current best internal CV score: -20607315.220001213

Generation 5 - Current best internal CV score: -20378707.33926659

Generation 6 - Current best internal CV score: -20378707.33926659

Generation 7 - Current best internal CV score: -20378707.33926659

Generation 8 - Current best internal CV score: -20378707.33926659

Generation 9 - Current best internal CV score: -20362350.652015977

Generation 10 - Current best internal CV score: -20362350.652015977

Generation 11 - Current best internal CV score: -20362350.652015977

Generation 12 - Current best internal CV score: -20362350.652015977

Generation 13 - Current best internal CV score: -20362350.652015977

Generation 14 - Current best internal CV score: -20362350.652015977

Generation 15 - Current best internal CV score

## Create Pipeline

Now that we have an optimal pipeline created by TPOT we will be adding our own preprocessors to it. To do this we'll need to have an unfitted pipeline object.

To create an unfitted pipeline from the fitted pipeline that we already have, we'll clone the pipeline object:

In [11]:
unfitted_tpot_regressor = clone(tpot_regressor.fitted_pipeline_)

unfitted_tpot_regressor

Now that we can build the same pipeline that was found by the TPOT package, we'll add our own preprocessors to the pipeline. This will ensure that the final pipeline will accept the features in the original dataset and will process the features correctly.

We'll compose the preprocessing pipeline and the tpot pipeline into one pipeline:

In [12]:
model = Pipeline([
    ("transformer", transformer),
    ("tpot_pipeline", unfitted_tpot_regressor)
])

## Train Model

In [13]:
model.fit(X_train, y_train)

## Test Model With Single Sample

In [14]:
# testing the ColumnTransformer
test_df = pd.DataFrame([[65, "male", 12.5, 0, "yes", "southwest"]],
                       columns=["age", "sex", "bmi", "children", "smoker", "region"])


result = model.predict(test_df)

result

array([21992.03266738])

## Save Model

In [15]:
joblib.dump(model, "model.joblib")

['model.joblib']