Created by following the lecturer-provided [link](https://nabeelvalley.co.za/docs/data-science-with-python/xgboost-and-pipelines/)

In [38]:
import pandas as pd

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectFromModel
import numpy as np

In [39]:
# Read the freMTPL2freq CSV into a DataFrame.
path = r'data/freMTPL2freq.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region
0,1.0,1,0.10000,D,5,0,55,50,B12,Regular,1217,R82
1,3.0,1,0.77000,D,5,0,55,50,B12,Regular,1217,R82
2,5.0,1,0.75000,B,6,2,52,50,B12,Diesel,54,R22
3,10.0,1,0.09000,B,7,0,46,50,B12,Diesel,76,R72
4,11.0,1,0.84000,B,7,0,46,50,B12,Diesel,76,R72
...,...,...,...,...,...,...,...,...,...,...,...,...
678008,6114326.0,0,0.00274,E,4,0,54,50,B12,Regular,3317,R93
678009,6114327.0,0,0.00274,E,4,0,41,95,B12,Regular,9850,R11
678010,6114328.0,0,0.00274,D,6,2,45,50,B12,Diesel,1323,R82
678011,6114329.0,0,0.00274,B,4,0,60,50,B12,Regular,95,R26


In [40]:
# Target is the ClaimNb column; every other column is a candidate feature.
y = df['ClaimNb']
X = df.drop(columns='ClaimNb')

# Fixed 80/20 hold-out split; random_state pins the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Object-typed columns with fewer than 10 distinct values get one-hot encoded.
categorical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype == "object" and X_train[cname].nunique() < 10
]

# int64 / float64 columns are passed through as numeric features.
numerical_cols = [
    cname for cname, dtype in X_train.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Keep only the columns the preprocessor knows how to handle
# (categorical first, then numeric).
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()

# Numeric columns: constant-value imputation (SimpleImputer's default fill value).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

Train The Model

In [46]:
# Final estimator, bound to its own name so it can be inspected after fitting.
model = XGBRegressor()

# Stages run in order: preprocess -> degree-3 interaction terms ->
# keep the 10 features a helper XGBRegressor ranks highest -> final model.
# threshold=-np.inf disables the importance cut-off so max_features alone
# decides how many features survive.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(interaction_only=True, degree=3)),
    ('selector', SelectFromModel(XGBRegressor(), threshold=-np.inf, max_features=10)),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

pipeline.fit(X_train, y_train)

Evaluation

Note that we need to pre-format our eval_set data so that it has the preprocessing steps applied, so that the data structures are aligned.
We also need to prefix any inputs that we want passed on to our model with `model__` so that the pipeline passes them to the correct object.

In [47]:
# Push the validation split through the pre-model steps that were already
# fitted on the training data. Do NOT call preprocessor.fit(X_valid): the
# pipeline holds this very `preprocessor` object, so refitting it here would
# overwrite the training-time fit with validation statistics (data leakage).
# pipeline[:-1] is every step except the final model, so the result has the
# same columns the model step receives and is usable as eval_set data.
X_valid_transformed = pipeline[:-1].transform(X_valid)

In [48]:
# Disabled early-stopping fit, kept for reference only.
# As written it passes the raw X_valid as eval_set, but the 'model' step is fed
# the output of the preceding pipeline steps (preprocessor -> poly -> selector),
# so the eval data would first need those same fitted transforms applied.
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds
# on the estimator constructor rather than in fit() - confirm against the
# installed version before re-enabling.
# pipeline.fit(X_train, y_train,
#                 model__early_stopping_rounds=20,
#                 model__eval_set=[(X_valid, y_valid)],
#                 model__verbose=False)

In [49]:
# Score the fitted pipeline on the hold-out split.
predictions = pipeline.predict(X_valid)
# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the printed
# value is unchanged, but the conventional order keeps this line safe to copy
# for metrics where the order matters.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 0.0730531393032033


In [50]:
# Without selection - Mean Absolute Error: 0.07311316296635668
# 8 Features - Mean Absolute Error: 0.07317318337106148
# 6 Features - Mean Absolute Error: 0.07291717183486429
# 5 Features - Mean Absolute Error: 0.07289091276547582
# 20(with poly) Features - Mean Absolute Error: 0.07289091276547582

In [None]:
# Search over polynomial degree (d) and number of selected features (k).
#
# For every combination:
#   1. the pre-model steps are fitted on the training split only,
#   2. the validation split is pushed through those same fitted steps, so the
#      eval_set handed to XGBoost has exactly the columns the model sees,
#   3. the model is fitted once (the original fitted the whole pipeline twice
#      and refitted the shared preprocessor on validation data in between).
for d in [3, 4]:
    for k in range(13, 40, 2):
        model = XGBRegressor()

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('poly', PolynomialFeatures(degree=d, interaction_only=False)),
            # threshold=-np.inf (as in the earlier training cell) turns off the
            # importance cut-off, so max_features=k is the number of features
            # kept rather than just an upper bound.
            ('selector', SelectFromModel(estimator=XGBRegressor(),
                                         max_features=k,
                                         threshold=-np.inf)),
            ('model', model)])

        # pipeline[:-1] is a shallow slice: fitting it fits the step objects
        # that `pipeline` itself holds, so `pipeline` ends up fully fitted.
        feature_steps = pipeline[:-1]
        X_train_transformed = feature_steps.fit_transform(X_train, y_train)
        X_valid_transformed = feature_steps.transform(X_valid)

        model.fit(X_train_transformed, y_train,
                  eval_set=[(X_valid_transformed, y_valid)],
                  verbose=False)

        # Same result as pipeline.predict(X_valid), without re-transforming.
        predictions = model.predict(X_valid_transformed)
        print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))
        print("# Of Degree: " ,d)
        print("# Of K: " , k)
        print("============================")