# Model Training

In [1]:
import sys
import pandas as pd
import numpy as np
import joblib
from tpot import TPOTRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

sys.path.append("../../")

pd.set_option("display.max_columns", None)



## Load Data

In [2]:
df = pd.read_csv("../../data/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Create Training and Test Sets

In [3]:
mask = np.random.rand(len(df)) < 0.8

training_set = df[mask]

testing_set = df[~mask]

print(training_set.shape)
print(testing_set.shape)

(1054, 7)
(284, 7)


In [4]:
# save training and test sets to be used later
training_set.to_csv("../../data/training_set.csv")
testing_set.to_csv("../../data/testing_set.csv")

In [5]:
# separating the feature columns from the target column
feature_columns = ["age", "sex", "bmi", "children", "smoker", "region"]
target_column = "charges"

X_train = training_set[feature_columns]
y_train = training_set[target_column]

X_test = testing_set[feature_columns]
y_test = testing_set[target_column]

## Apply the Preprocessing

In [6]:
# loading the preprocessing pipeline we built in the previous notebook
transformer = joblib.load("../model_files/transformer.joblib")

In [7]:
# applying the column transformer
features = transformer.fit_transform(X_train)

features



array([[19.   , 27.9  ,  0.   , ...,  1.   ,  0.   ,  3.   ],
       [33.   , 22.705,  0.   , ...,  0.   ,  1.   ,  1.   ],
       [32.   , 28.88 ,  0.   , ...,  0.   ,  1.   ,  1.   ],
       ...,
       [57.   , 25.74 ,  2.   , ...,  0.   ,  0.   ,  2.   ],
       [50.   , 30.97 ,  3.   , ...,  0.   ,  1.   ,  1.   ],
       [21.   , 25.8  ,  0.   , ...,  0.   ,  0.   ,  3.   ]])

## Find an Optimal Pipeline

In [8]:
tpot_regressor = TPOTRegressor(generations=50,
                               population_size=100,
                               random_state=42,
                               cv=5,
                               n_jobs=8,
                               verbosity=2)

In [9]:
tpot_regressor = tpot_regressor.fit(features, y_train)

Optimization Progress:   0%|          | 0/5100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -20181615.829195686

Generation 2 - Current best internal CV score: -20181615.829195686

Generation 3 - Current best internal CV score: -19848332.164544817

Generation 4 - Current best internal CV score: -19848332.164544817

Generation 5 - Current best internal CV score: -19848332.164544817

Generation 6 - Current best internal CV score: -19848332.164544817

Generation 7 - Current best internal CV score: -19816504.27026009

Generation 8 - Current best internal CV score: -19727874.78911667

Generation 9 - Current best internal CV score: -19727874.78911667

Generation 10 - Current best internal CV score: -19727874.78911667

Generation 11 - Current best internal CV score: -19727874.78911667

Generation 12 - Current best internal CV score: -19727874.78911667

Generation 13 - Current best internal CV score: -19701596.956555426

Generation 14 - Current best internal CV score: -19701596.956555426

Generation 15 - Current best internal CV score: 

## Create Pipeline

Now that we have an optimal pipeline created by TPOT we will be adding our own preprocessors to it. To do this we'll need to have an unfitted pipeline object, we don't have that right now because the TPOTRegressor pipeline has been fitted. 

To get an unfitted pipeline we'll ask TPOT to export the pipeline as a python file:

In [10]:
tpot_regressor.export("../model_files/tpot_pipeline.py")

This is the code that will create the pipeline that was found by the TPOT package, we'll execute it here to make sure that we can instantiate the Pipeline object.

In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# Average CV score on the training set was: -19575007.77756657
tpot_pipeline = make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.037),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.9500000000000001, tol=1e-05)),
    RandomForestRegressor(bootstrap=True, max_features=0.8, min_samples_leaf=18, min_samples_split=14, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(tpot_pipeline.steps, 'random_state', 42)

Now that we can build the same pipeline that was found by the TPOT package, we'll add our own preprocessors to the pipeline. This will ensure that the final pipeline will accept the features in the original dataset and will process the features correctly.

We'll compose all of the pipelines we created above into one Pipeline:

In [13]:
model = Pipeline([
    ("transformer", transformer),
    ("tpot_pipeline", tpot_pipeline)
])

## Train Model

In [14]:
model.fit(X_train, y_train)

  corr /= X_norms
  corr /= X_norms
  F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom


Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('dfs_pipeline',
                                                  Pipeline(steps=[('dfs_transformer',
                                                                   DFSTransformer(ignore_variables={'Transactions': ['sex',
                                                                                                                     'smoker',
                                                                                                                     'region']},
                                                                                  target_entity='Transactions',
                                                                                  trans_primitives=['add_numeric',
                                                                                                    'subtract_numeric',
                             

In [15]:
predictions = model.predict(X_test)

In [16]:
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("r2 score: ", r2)
print("mean squared error: ", mse)
print("mean absolute error: ", mae)

r2 score:  0.8408772880472506
mean squared error:  22226429.889209867
mean absolute error:  2539.0184656719302


## Test Model With Single Sample

In [17]:
# testing the ColumnTransformer
test_df = pd.DataFrame([[65, "male", 12.5, 0, "yes", "southwest"]],
                       columns=["age", "sex", "bmi", "children", "smoker", "region"])


result = model.predict(test_df)

result

array([23656.57334195])

## Save Model

In [18]:
joblib.dump(model, "../model_files/model.joblib")

['../model_files/model.joblib']