# Data Preparation

In [2]:
import sys
from copy import deepcopy
import warnings
import numpy as np
from numpy import inf, nan
import pandas as pd
import joblib
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
import featuretools as ft

# Add the directory two levels above the working directory to the import path so
# that the insurance_charges_model package imported below can be resolved
# (presumably the repository root — confirm against the project layout).
sys.path.append("../../")

from insurance_charges_model.prediction.transformers import DFSTransformer
from insurance_charges_model.prediction.transformers import InfinityToNaNTransformer
from insurance_charges_model.prediction.transformers import IntToFloatTransformer
from insurance_charges_model.prediction.transformers import BooleanTransformer

# Silence every warning for the remainder of the notebook session.
warnings.simplefilter("ignore")
# Never truncate wide frames when they are displayed.
pd.options.display.max_columns = None

## Loading the Data

In [3]:
# Read the raw insurance records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
# Preview the first five rows.
df.head(n=5)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Deep Feature Synthesis
We'll be using the featuretools package to do feature engineering.

An EntitySet is an object that we will give to the featuretools package in order to do feature engineering. An EntitySet describes the features of a specific type of "entity" in the real world. In this case, we will work with only one type of entity: "Transactions".

In [4]:
# Create the EntitySet and register the raw dataframe as its only entity in one
# chained expression. make_index=True asks featuretools to generate the "index"
# column instead of expecting it in the data.
entityset = ft.EntitySet(id="Transactions").entity_from_dataframe(
    entity_id="Transactions",
    dataframe=df,
    index="index",
    make_index=True,
)

entityset

Entityset: Transactions
  Entities:
    Transactions [Rows: 1338, Columns: 8]
  Relationships:
    No relationships

In [5]:
# List the variables registered on the "Transactions" entity of the EntitySet
# created above, together with the type featuretools assigned to each of them.
entityset["Transactions"].variables

[<Variable: index (dtype = index)>,
 <Variable: age (dtype = numeric)>,
 <Variable: sex (dtype = categorical)>,
 <Variable: bmi (dtype = numeric)>,
 <Variable: children (dtype = numeric)>,
 <Variable: smoker (dtype = categorical)>,
 <Variable: region (dtype = categorical)>,
 <Variable: charges (dtype = numeric)>]

Now that we have defined an EntitySet for our data, we'll ask the featuretools package to create some features for us. The package defines a set of "primitives" that are able to create new features by processing the features that already exist in the EntitySet.

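We are also going to ignore the categorical and boolean features in the dataset ('sex', 'smoker', and 'region') because the numeric primitives used here cannot be applied to them. The target variable 'charges' is ignored as well, so that none of the generated features is derived from the value we want to predict.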

In [6]:
# Run deep feature synthesis on the "Transactions" entity using arithmetic and
# comparison primitives. The non-numeric columns and the "charges" column are
# excluded from synthesis.
feature_dataframe, features = ft.dfs(
    entityset=entityset,
    target_entity="Transactions",
    trans_primitives=[
        "add_numeric",
        "subtract_numeric",
        "multiply_numeric",
        "divide_numeric",
        "greater_than",
        "less_than",
    ],
    ignore_variables={"Transactions": ["sex", "smoker", "region", "charges"]},
)

features

[<Feature: age>,
 <Feature: bmi>,
 <Feature: children>,
 <Feature: age + bmi>,
 <Feature: age + children>,
 <Feature: bmi + children>,
 <Feature: age / bmi>,
 <Feature: age / children>,
 <Feature: bmi / age>,
 <Feature: bmi / children>,
 <Feature: children / age>,
 <Feature: children / bmi>,
 <Feature: age > bmi>,
 <Feature: age > children>,
 <Feature: bmi > age>,
 <Feature: bmi > children>,
 <Feature: children > age>,
 <Feature: children > bmi>,
 <Feature: age < bmi>,
 <Feature: age < children>,
 <Feature: bmi < age>,
 <Feature: bmi < children>,
 <Feature: children < age>,
 <Feature: children < bmi>,
 <Feature: age * bmi>,
 <Feature: age * children>,
 <Feature: bmi * children>,
 <Feature: age - bmi>,
 <Feature: age - children>,
 <Feature: bmi - children>]

In [7]:
# NOTE(review): the "- 2" presumably discounts the generated "index" column and
# the "charges" target from the column count — confirm.
summary = "We have created {} new features from the original {} features."
print(summary.format(len(features), len(df.columns) - 2))

We have created 30 new features from the original 6 features.


The feature_dataframe variable now contains the new features:

In [8]:
# Preview the first rows of the feature matrix returned by ft.dfs.
feature_dataframe.head()

Unnamed: 0_level_0,age,bmi,children,age + bmi,age + children,bmi + children,age / bmi,age / children,bmi / age,bmi / children,children / age,children / bmi,age > bmi,age > children,bmi > age,bmi > children,children > age,children > bmi,age < bmi,age < children,bmi < age,bmi < children,children < age,children < bmi,age * bmi,age * children,bmi * children,age - bmi,age - children,bmi - children
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,19,27.9,0,46.9,19,27.9,0.681004,inf,1.468421,inf,0.0,0.0,False,True,True,True,False,False,True,False,False,False,True,True,530.1,0,0.0,-8.9,19,27.9
1,18,33.77,1,51.77,19,34.77,0.533017,18.0,1.876111,33.77,0.055556,0.029612,False,True,True,True,False,False,True,False,False,False,True,True,607.86,18,33.77,-15.77,17,32.77
2,28,33.0,3,61.0,31,36.0,0.848485,9.333333,1.178571,11.0,0.107143,0.090909,False,True,True,True,False,False,True,False,False,False,True,True,924.0,84,99.0,-5.0,25,30.0
3,33,22.705,0,55.705,33,22.705,1.453424,inf,0.68803,inf,0.0,0.0,True,True,False,True,False,False,False,False,True,False,True,True,749.265,0,0.0,10.295,33,22.705
4,32,28.88,0,60.88,32,28.88,1.108033,inf,0.9025,inf,0.0,0.0,True,True,False,True,False,False,False,False,True,False,True,True,924.16,0,0.0,3.12,32,28.88


## Encode Features using Deep Feature Synthesis

Now we can create a Transformer that we can use later to create the features, given samples of the dataset.

In [9]:
# Wrap the same primitive set in the project's DFSTransformer so that feature
# synthesis can be used as a pipeline step. "charges" is not listed in
# ignore_variables here, unlike in the exploratory ft.dfs call.
dfs_transformer = DFSTransformer(
    "Transactions",
    trans_primitives=[
        "add_numeric",
        "subtract_numeric",
        "multiply_numeric",
        "divide_numeric",
        "greater_than",
        "less_than",
    ],
    ignore_variables={"Transactions": ["sex", "smoker", "region"]},
)

In [10]:
# Smoke test: two hand-made records that carry every input column except "charges".
test_df = pd.DataFrame({
    "age": [65, 75],
    "sex": ["male", "female"],
    "bmi": [12.5, 78.770],
    "children": [0, 1],
    "smoker": ["yes", "no"],
    "region": ["southwest", "southeast"],
})

# Fit and run a deep copy so that `dfs_transformer` itself is left untouched.
dfs_transformer_copy = deepcopy(dfs_transformer)
dfs_transformer_copy.fit(test_df)
new_df = dfs_transformer_copy.transform(test_df)

# 30 is the number of features listed by the exploratory ft.dfs call.
if new_df.shape[1] != 30:
    raise ValueError("Unexpected number of columns found in the dataframe.")

new_df.head()

Unnamed: 0_level_0,age,bmi,children,age + bmi,age + children,bmi + children,age / bmi,age / children,bmi / age,bmi / children,children / age,children / bmi,age > bmi,age > children,bmi > age,bmi > children,children > age,children > bmi,age < bmi,age < children,bmi < age,bmi < children,children < age,children < bmi,age * bmi,age * children,bmi * children,age - bmi,age - children,bmi - children
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,65,12.5,0,77.5,65,12.5,5.2,inf,0.192308,inf,0.0,0.0,True,True,False,True,False,False,False,False,True,False,True,True,812.5,0,0.0,52.5,65,12.5
1,75,78.77,1,153.77,76,79.77,0.952139,75.0,1.050267,78.77,0.013333,0.012695,False,True,True,True,False,False,True,False,False,False,True,True,5907.75,75,78.77,-3.77,74,77.77


## Create Transformer for inf Values

Some of the features created by the featuretools package have a value of 'inf' (for example, 'age / children' when 'children' is 0). We'll create a transformer that maps these values to NaN, so that they can be imputed in a later step and the models can be trained.

In [11]:
# Instantiate the project's transformer for infinite values; the following cell
# shows it turning an inf entry into nan.
infinity_transformer = InfinityToNaNTransformer()

In [12]:
# Smoke test: one column with an infinite entry between two finite ones.
inpt = [[1.0], [np.inf], [1.0]]

# Fit and run a deep copy so that `infinity_transformer` itself is left untouched.
infinity_transformer_copy = deepcopy(infinity_transformer)
infinity_transformer_copy.fit(inpt)

# `result` is kept around: later cells reuse it as input for other transformers.
result = infinity_transformer_copy.transform(inpt)
print(result)

[[ 1.]
 [nan]
 [ 1.]]


In order to handle the NaN values, we'll use a SimpleImputer that will fill in each missing value with the mean of its column:

In [13]:
# Replace NaN entries with the mean of their column.
simple_imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

In [14]:
# testing the transformer

# Fit and run a deep copy so that `simple_imputer` itself is left untouched.
simple_imputer_copy = deepcopy(simple_imputer)

# `result` is the output of the infinity-transformer test: a single column whose
# middle entry is NaN. The imputer is fitted on it and then fills that entry with
# the mean of the remaining values in the column.
simple_imputer_copy.fit(result)

print(simple_imputer_copy.transform(result))

[[1.]
 [1.]
 [1.]]


The SimpleImputer transformer has problems with imputing values that are not floats when using the 'mean' strategy. To fix this, we'll create a transformer that will convert all integer columns into floating point columns:

In [15]:
# Instantiate the project's integer-to-float transformer; the following cell
# shows it converting a mixed int/float sample to floats.
int_to_float_transformer = IntToFloatTransformer()

In [16]:
# Smoke test for the integer-to-float conversion.

# Fit and run a deep copy so that `int_to_float_transformer` itself is left untouched.
int_to_float_transformer_copy = deepcopy(int_to_float_transformer)

# Sample with an integer first column and a float second column.
test_df = [[2, 3.0], [4, 6.0], [10, 9.0]]

# NOTE(review): the copy is fitted on `result` (output of an earlier cell), not on
# `test_df` — confirm that this is intentional.
int_to_float_transformer_copy.fit(result)
print(int_to_float_transformer_copy.transform(test_df))


[[ 2.  3.]
 [ 4.  6.]
 [10.  9.]]


Lastly, we'll put the DFSTransformer, IntToFloatTransformer, InfinityToNaNTransformer, and SimpleImputer transformers, in that order, into a Pipeline so they'll all work together as a unit:

In [17]:
# Chain the four steps: synthesize features, cast integers to floats, turn
# infinities into NaN, and finally impute the NaN values.
dfs_pipeline = Pipeline(
    steps=[
        ("dfs_transformer", dfs_transformer),
        ("int_to_float_transformer", int_to_float_transformer),
        ("infinity_transformer", infinity_transformer),
        ("simple_imputer", simple_imputer),
    ]
)

In [18]:
# Smoke test: two records that carry only the numeric input columns.
test_df = pd.DataFrame({
    "age": [65, 75],
    "bmi": [12.5, 78.770],
    "children": [0, 1],
})

# Fit and run a deep copy so that `dfs_pipeline` itself is left untouched.
dfs_pipeline_copy = deepcopy(dfs_pipeline)
dfs_pipeline_copy.fit(test_df)
new_df = dfs_pipeline_copy.transform(test_df)

# The first output row must hold the 30 synthesized features.
if not len(new_df[0]) == 30:
    raise ValueError("Unexpected number of columns found in the dataframe.")

## Encode Boolean Features

We'll create a transformer that is able to convert the string in the 'smoker' feature to a boolean value.

In [19]:
# Configure the project's BooleanTransformer with the two string values that
# occur in the 'smoker' column.
boolean_transformer = BooleanTransformer(
    true_value="yes",
    false_value="no",
)

In [20]:
# testing the transformer
test_df = pd.DataFrame([["yes"], ["no"], ["yes"]],
                       columns=["smoker"])

# copying the transformer object in order to fit and test it
boolean_transformer_copy = deepcopy(boolean_transformer)

boolean_transformer_copy.fit(test_df)
result = boolean_transformer_copy.transform(test_df)

# Fail when the output differs from the expectation in ANY position (or in shape).
# The previous check, `(result != expected).all()`, only fired when every single
# element was wrong, so a partially incorrect result passed silently.
if not np.array_equal(result, np.array([[True], [False], [True]])):
    raise ValueError("Unexpected values found in array.")

## Encode Categorical Features

Next, we'll create an encoder that will encode the categorical features. The categorical features that we will encode will be 'sex' and 'region'.

In [21]:
# Default scikit-learn OrdinalEncoder; it is applied to the 'sex' and 'region'
# columns in the ColumnTransformer defined further down.
ordinal_encoder = OrdinalEncoder()

In [22]:
# testing the transformer
test_df = pd.DataFrame([["southwest"], ["northeast"], ["southwest"]],
                       columns=["region"])

# copying the transformer object in order to fit and test it
ordinal_encoder_copy = deepcopy(ordinal_encoder)

ordinal_encoder_copy.fit(test_df)
result = ordinal_encoder_copy.transform(test_df)

# Fail when the output differs from the expectation in ANY position (or in shape).
# The previous check, `(result != expected).all()`, only fired when every single
# element was wrong, so a partially incorrect encoding passed silently.
if not np.array_equal(result, np.array([[1.0], [0.0], [1.0]])):
    raise ValueError("Unexpected values found in array.")

## Create ColumnTransformer

Combining all of the preprocessors into one ColumnTransformer that can be used to preprocess the data.

In [23]:
# Route columns to the three preprocessors. The DFS pipeline receives every input
# column, 'smoker' goes to the boolean transformer, and 'sex' and 'region' go to
# the ordinal encoder. Columns not claimed by any transformer are passed through.
column_transformer = ColumnTransformer(
    transformers=[
        (
            "dfs_pipeline",
            dfs_pipeline,
            ["age", "sex", "bmi", "children", "smoker", "region"],
        ),
        ("boolean_transformer", boolean_transformer, ["smoker"]),
        ("ordinal_encoder", ordinal_encoder, ["sex", "region"]),
    ],
    remainder="passthrough",
)

In [24]:
# Smoke test for the ColumnTransformer: two records with all six input columns.
test_df = pd.DataFrame({
    "age": [65, 75],
    "sex": ["male", "female"],
    "bmi": [12.5, 78.770],
    "children": [0, 1],
    "smoker": ["yes", "no"],
    "region": ["southwest", "southeast"],
})

# Fit and run a deep copy so that `column_transformer` itself is left untouched.
column_transformer_copy = deepcopy(column_transformer)
column_transformer_copy.fit(test_df)
result = column_transformer_copy.transform(test_df)

# Expecting 33 features to come out of the ColumnTransformer.
if not len(result[0]) == 33:
    raise ValueError("Unexpected number of columns found in the dataframe.")

## Saving ColumnTransformer

NOTE: the ColumnTransformer object is saved in an UNFITTED state; it will be fitted to the data set later.

In [25]:
# Persist the ColumnTransformer defined above (not one of the fitted test copies)
# to a file in the working directory.
joblib.dump(value=column_transformer, filename="transformer.joblib")

['transformer.joblib']