**Objective**


The purpose of this project is to create machine learning models that accurately predict the severity of auto claims from independent numerical and categorical variables. The dataset is provided by Allstate Insurance, a P&C (property and casualty) insurance company that specializes in auto insurance in North America. Extreme Gradient Boosting (XGBoost) is used to run up to 1000 boosting rounds, with a new model added to the ensemble in every round. Lastly, mean absolute error (MAE) is used to evaluate how well the models are able to predict the severity of auto claims.

In [None]:
import pandas as pd
import sklearn
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [None]:
# Load the train and test data from the Kaggle competition input paths,
# using the 'id' column as the index of both frames.
path = '/kaggle/input/allstate-claims-severity/train.csv'
path_test = '/kaggle/input/allstate-claims-severity/test.csv'
car_data = pd.read_csv(path, index_col=['id'])
car_data_test = pd.read_csv(path_test, index_col=['id'])

# A notebook cell only renders its last expression, so the preview of the
# first rows has to be printed explicitly or it is silently discarded.
print(car_data.head())

# 'describe' with include='all' reports the number of unique values for
# categorical variables and summary statistics for numerical variables.
car_data.describe(include='all')

In [None]:
# --------Method 1----------
# Baseline on the numerical features only: object-dtype (categorical)
# columns are excluded from X.
# XGBoost (extreme gradient boosting) iteratively adds a new model to the
# existing ensemble; the result is scored with mean absolute error (MAE).
X = car_data.drop(columns=['loss']).select_dtypes(exclude=['object'])
y = car_data['loss']

# Split the data into train and validation sets (fixed seed for
# reproducibility, default split proportions).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

car_model = XGBRegressor(n_estimators=1000, learning_rate=0.01)
# Stop adding trees once the validation score has not improved for 5 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# XGBRegressor constructor instead of fit() - confirm against the installed
# version before changing it.
car_model.fit(X_train, y_train,
              early_stopping_rounds=5,
              eval_set=[(X_valid, y_valid)],
              verbose=False)
claims_predict = car_model.predict(X_valid)

# scikit-learn's signature is (y_true, y_pred). The validation set is also
# the early-stopping set, so this MAE is a slightly optimistic estimate.
mean_absolute_error(y_valid, claims_predict)

In [None]:
# Use the entire train set to train the Method 1 model, then predict the
# test set.
# No eval_set / early stopping here: X_valid is a subset of X, so monitoring
# it would measure (leaked) training error rather than held-out error and
# give a meaningless stopping signal. This matches how the Method 2 full
# model is fitted.
car_model_full = XGBRegressor(n_estimators=1000, learning_rate=0.05)
car_model_full.fit(X, y, verbose=False)

# Select the test features by the training column names so the model sees
# exactly the same columns, in the same order, as it was fitted on.
# Note: test_preds is reassigned later by the Method 2 full-model cell.
test_preds = car_model_full.predict(car_data_test[X.columns])


In [None]:
# --------Method 2----------
# Encode categorical data so the model can use it alongside the numerical
# features. Only categorical variables with fewer than 10 unique values are
# one-hot encoded; the remaining (high-cardinality) categorical columns are
# dropped together with the other object columns below.
X = car_data.drop(columns=['loss'])
y = car_data['loss']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Keep the original column order: a set difference would return the names in
# an arbitrary order that can change between runs.
low_cardinality_set = set(low_cardinality_cols)
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_set]

# Apply one-hot encoder to each low-cardinality categorical column.
# Categories unseen during fit are ignored at transform time.
# NOTE(review): newer scikit-learn releases name the `sparse` argument
# `sparse_output` - confirm against the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns plain arrays, so re-attach the row index while building
# the frames (their columns get default integer labels).
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[low_cardinality_cols]),
    index=X_train.index,
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[low_cardinality_cols]),
    index=X_valid.index,
)

# Remove all categorical columns (the low-cardinality ones are replaced by
# their one-hot encoding)
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


In [None]:
# Use XGBoost (extreme gradient boosting) to iteratively add a new model to
# the existing ensemble, this time on the numerical plus one-hot encoded
# features, and score it with mean absolute error (MAE).
car_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.01)
# Stop adding trees once the validation score has not improved for 5 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# XGBRegressor constructor instead of fit() - confirm against the installed
# version before changing it.
car_model_2.fit(OH_X_train, y_train,
                early_stopping_rounds=5,
                eval_set=[(OH_X_valid, y_valid)],
                verbose=False)
claims_predict = car_model_2.predict(OH_X_valid)

# scikit-learn's signature is (y_true, y_pred).
mean_absolute_error(y_valid, claims_predict)
## mae = 1208

In [None]:
# Refit the encoder and the second model on the entire train set, then
# predict the test set.

# Numerical part: drop every categorical column from both frames.
num_X = X.drop(columns=object_cols)
num_X_test = car_data_test.drop(columns=object_cols)

# Categorical part: one-hot encode the low-cardinality columns. The encoder
# is fitted on the full train set and only applied to the test set; the row
# index is attached while the frames are built.
OH_cols_full = pd.DataFrame(
    OH_encoder.fit_transform(X[low_cardinality_cols]),
    index=X.index,
)
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(car_data_test[low_cardinality_cols]),
    index=car_data_test.index,
)

# Put numerical and one-hot encoded columns side by side.
OH_X_full = pd.concat([num_X, OH_cols_full], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Train on everything (no validation set, hence no early stopping).
car_model_full_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)
car_model_full_2.fit(OH_X_full, y, verbose=False)
test_preds = car_model_full_2.predict(OH_X_test)

In [None]:
# Write the prediction file for submission to the Kaggle leaderboard:
# one row per test record, with its id and the predicted loss.
submission_columns = {
    'id': car_data_test.index,
    'loss': test_preds,
}
output = pd.DataFrame(submission_columns)
# The ids are already a regular column, so the frame's own index is omitted.
output.to_csv('submission.csv', index=False)