# Model Training


In [73]:
import sys
import os
import pandas as pd
import numpy as np 
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn.metrics import r2_score

pd.set_option("display.max_columns", None)

sys.path.append("../../")

## Load Data

In [74]:
df = pd.read_csv("../../data/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Apply the Preprocessing

In [75]:
import joblib

# loading the preprocessing pipeline we built in the previous notebook
column_transformer = joblib.load("../model_files/transformer.joblib")

In [76]:
# applying the column transformer

features = column_transformer.fit_transform(df)

features



array([[1.90000000e+01, 2.79000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 3.00000000e+00, 1.68849240e+04],
       [1.80000000e+01, 3.37700000e+01, 1.00000000e+00, ...,
        1.00000000e+00, 2.00000000e+00, 1.72555230e+03],
       [2.80000000e+01, 3.30000000e+01, 3.00000000e+00, ...,
        1.00000000e+00, 2.00000000e+00, 4.44946200e+03],
       ...,
       [1.80000000e+01, 3.68500000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 2.00000000e+00, 1.62983350e+03],
       [2.10000000e+01, 2.58000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 3.00000000e+00, 2.00794500e+03],
       [6.10000000e+01, 2.90700000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 2.91413603e+04]])

## Create Training and Test Sets

In [77]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    df["charges"], 
                                                    train_size=0.80, 
                                                    test_size=0.20, 
                                                    random_state=42)

print("Feature Dataframe Shapes: ", X_train.shape, X_test.shape)
print("Target Dataframe Shapes: ", y_train.shape, y_test.shape)

Feature Dataframe Shapes:  (1070, 34) (268, 34)
Target Dataframe Shapes:  (1070,) (268,)


In [78]:
# save test set to be used later
X_test.to_csv("../../data/testing_predictors.csv")
y_test.to_csv("../../data/testing_targets.csv")

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

## Train a Model

In [79]:
tpot_regressor = TPOTRegressor(generations=10,
                               population_size=10,
                               random_state=42,
                               cv=5,
                               n_jobs=4,
                               verbosity=2)

In [80]:
tpot_regressor = tpot_regressor.fit(X_train, y_train)

Version 0.11.6.post2 of tpot is outdated. Version 0.11.6.post3 was released 1 day ago.


HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=110.0), HTML(value='')))


Generation 1 - Current best internal CV score: -0.001151138157379548

Generation 2 - Current best internal CV score: -0.001151138157379548

Generation 3 - Current best internal CV score: -0.001151138157379548

Generation 4 - Current best internal CV score: -0.001151138157379548

Generation 5 - Current best internal CV score: -3.193635632661969e-22

Generation 6 - Current best internal CV score: -3.193635632661969e-22

Generation 7 - Current best internal CV score: -3.193635632661969e-22

Generation 8 - Current best internal CV score: -1.2103502888371635e-22

Generation 9 - Current best internal CV score: -1.2103502888371635e-22

Generation 10 - Current best internal CV score: -1.0918663294138555e-22

Best pipeline: LassoLarsCV(OneHotEncoder(RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.1, min_samples_leaf=19, min_samples_split=19, n_estimators=100), minimum_fraction=0.15, sparse=False, threshold=10), normalize=False)


## Create Pipeline

Now that we have an optimal pipeline created by TPOT we will be adding our own preprocessors to it.

First, we'll export the pipeline as a python file:

In [24]:
original_data = pd.read_csv("../../data/insurance.csv")

In [25]:
tpot_regressor.export('tpot_pipeline.py')

This is the code that will create the pipeline that was found by the TPOT package, we'll execute it here to make sure that we can instantiate the Pipeline object.

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

tpot_pipeline = make_pipeline(
    Normalizer(norm="l2"),
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    RandomForestRegressor(bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=17, min_samples_split=9, n_estimators=100)
)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

Now that we can build the same pipeline that was found by the TPOT package, we'll add our own preprocessors to the pipeline. This will ensure that the final pipeline will accept the features in the original dataset and will process the features correctly.

Lastly, we'll compose all of the pipelines we created above into one ColumnTransformer:

In [51]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical_transformers", categorical_transformers, ['sex', 'smoker', 'region'])
    ]
)

Now we can add the ColumnTransformer to the tpot pipeline:

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('tpot_pipeline', tpot_pipeline)
])

In [None]:
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

## Test Pipeline

To make sure that the pipeline works as expected we'll fit it and make a prediction using data from the original dataset.

In [7]:
predictions = tpot_regressor.predict(X_test)
print(r2_score(y_test, predictions))

0.8828053733667738


## Save Unfitted Pipeline

In [9]:
pickle.dump(tpot_regressor.fitted_pipeline_, open("../model_files/model.pkl", "wb"))

In [10]:
tpot_regressor.export('tpot_exported_pipeline.py')


In [11]:
tpot_regressor._optimized_pipeline

[<deap.gp.Primitive at 0x1262deb30>,
 <deap.gp.Primitive at 0x1262f8900>,
 <deap.gp.Primitive at 0x1262fba40>,
 <deap.gp.Terminal at 0x1262dd180>,
 <deap.gp.Terminal at 0x1262ffac0>,
 <deap.gp.Terminal at 0x1262f9380>,
 <deap.gp.Terminal at 0x1262e7900>,
 <deap.gp.Terminal at 0x1262e7d80>,
 <deap.gp.Terminal at 0x1262ea380>,
 <deap.gp.Terminal at 0x1262ea6c0>,
 <deap.gp.Terminal at 0x1262ea9c0>]