# Modeling

In this notebook, the performance of different models is examined.

## Setup

Import libraries.

In [15]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [17]:
import numpy as np
import pandas as pd
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from xgboost import XGBClassifier

Load datasets.

In [18]:
df_train = pd.read_csv("../data/processed/train.csv")
df_val = pd.read_csv("../data/processed/val.csv")
df_test = pd.read_csv("../data/processed/test.csv")

In [19]:
X_train = df_train.drop(columns=["claim_number", "fraud"])
y_train = df_train["fraud"]
X_val = df_val.drop(columns=["claim_number", "fraud"])
y_val = df_val["fraud"]
X_test = df_test.drop(columns=["claim_number"])

In [20]:
X_test

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,accident_site,past_num_of_claims,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_weight,latitude,longitude
0,39.0,M,1.0,73,36633.0,1,0,Own,Highway,0,...,25,Phone,0,5196.552552,8.0,Large,24360.592730,26633.27819,41.50,-94.52
1,56.0,M,1.0,63,40252.0,1,1,Own,Highway,0,...,50,Broker,1,7381.165248,6.0,Large,39710.426650,15875.34874,33.67,-112.06
2,39.0,F,0.0,75,36634.0,1,1,Own,Parking Lot,0,...,98,Broker,1,5612.123938,7.0,Medium,50327.566180,21365.05932,38.87,-77.85
3,33.0,M,1.0,72,34960.0,0,1,Own,Local,0,...,45,Phone,0,7957.267641,2.0,Medium,23457.352820,26707.46021,42.51,-93.37
4,24.0,F,1.0,70,31776.0,1,1,Rent,Local,0,...,11,Online,0,6232.195932,1.0,Medium,18238.392020,22949.73744,38.84,-77.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11997,64.0,F,1.0,87,41566.0,0,1,Own,Local,1,...,50,Broker,1,7475.901535,3.0,Large,20231.723630,27015.87981,39.13,-77.77
11998,34.0,F,1.0,70,35256.0,1,0,Own,Parking Lot,1,...,28,Broker,1,3838.951645,7.0,Compact,21900.031920,16122.91954,38.79,-77.53
11999,45.0,M,1.0,61,38072.0,1,0,Rent,Local,1,...,94,Broker,1,5236.564274,7.0,Medium,8291.858346,26825.84418,40.63,-79.94
12000,28.0,F,0.0,92,33322.0,1,0,Own,Local,0,...,39,Online,1,3885.395436,5.0,Compact,16168.298850,14855.54007,33.50,-112.08


In [21]:
X_test.columns

Index(['age_of_driver', 'gender', 'marital_status', 'safty_rating',
       'annual_income', 'high_education_ind', 'address_change_ind',
       'living_status', 'accident_site', 'past_num_of_claims',
       'witness_present_ind', 'liab_prct', 'channel',
       'policy_report_filed_ind', 'claim_est_payout', 'age_of_vehicle',
       'vehicle_category', 'vehicle_price', 'vehicle_weight', 'latitude',
       'longitude'],
      dtype='object')

In [22]:
y_train

0        0
1        0
2        0
3        1
4        0
        ..
14391    0
14392    1
14393    0
14394    1
14395    0
Name: fraud, Length: 14396, dtype: int64

In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14396 entries, 0 to 14395
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age_of_driver            14396 non-null  float64
 1   gender                   14396 non-null  object 
 2   marital_status           14396 non-null  float64
 3   safty_rating             14396 non-null  int64  
 4   annual_income            14396 non-null  float64
 5   high_education_ind       14396 non-null  int64  
 6   address_change_ind       14396 non-null  int64  
 7   living_status            14396 non-null  object 
 8   accident_site            14396 non-null  object 
 9   past_num_of_claims       14396 non-null  int64  
 10  witness_present_ind      14396 non-null  float64
 11  liab_prct                14396 non-null  int64  
 12  channel                  14396 non-null  object 
 13  policy_report_filed_ind  14396 non-null  int64  
 14  claim_est_payout      

In [24]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X_train1=enc.fit_transform(X_train)
X_test1=enc.transform(X_test)

In [25]:
X_train

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,accident_site,past_num_of_claims,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_weight,latitude,longitude
0,50.0,F,1.0,90,39135.0,1,1,Own,Local,0,...,50,Online,0,5866.835619,5.0,Large,9556.595872,28996.57016,40.66,-80.24
1,35.0,F,1.0,94,35553.0,1,1,Own,Parking Lot,0,...,12,Broker,0,3198.017617,3.0,Large,26284.399050,10311.38505,40.42,-80.35
2,55.0,F,1.0,68,40090.0,1,0,Own,Local,0,...,97,Broker,1,2741.537462,5.0,Large,23273.938860,24769.56131,38.97,-77.64
3,49.0,F,0.0,75,38921.0,1,0,Rent,Parking Lot,0,...,100,Broker,0,2261.128600,3.0,Medium,14348.455970,36530.13738,40.64,-80.49
4,40.0,M,1.0,90,36911.0,0,0,Own,Local,0,...,0,Broker,0,6013.423653,4.0,Medium,38449.110250,21087.95005,42.03,-93.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14391,61.0,F,1.0,51,41099.0,0,0,Own,Parking Lot,0,...,84,Broker,1,9960.691734,6.0,Compact,42088.341970,17185.43318,33.51,-112.12
14392,30.0,F,1.0,78,34013.0,1,0,Rent,Local,0,...,33,Phone,1,2528.937564,8.0,Large,21726.278240,26923.96439,40.76,-80.32
14393,49.0,M,1.0,76,38930.0,1,1,Own,Parking Lot,0,...,97,Phone,1,1938.767818,3.0,Large,16785.373260,25955.93642,39.77,-105.09
14394,46.0,F,0.0,71,38281.0,1,1,Own,Local,0,...,9,Phone,1,5563.803695,3.0,Medium,23412.242820,27479.67724,33.62,-112.05


In [26]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train1,y_train)

In [27]:
y_pred=lr_clf.predict(X_train1)
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [28]:
a={'age_of_driver': 39, 'annual_income': 36633, 'past_num_of_claims': 2, 
   'safty_rating': 73, 'gender': 'F', 'marital_status': 1, 'living_status': 'Own', 
   'high_education_ind': 1, 'address_change_ind': 1, 'claim_est_payout': 5196, 'liab_prct': 25, 
   'witness_present_ind': 1, 'policy_report_filed_ind': 1, 'accident_site': 'Highway', 'channel': 'Broker', 
   'age_of_vehicle': 8, 'vehicle_price': 24360, 'vehicle_weight': 26633, 'vehicle_category': 'Compact',
   'latitude': 41.5, 'longitude': -94.52}

In [29]:
b={'age_of_driver':a['age_of_driver'],'gender':a['gender'],'marital_status':a['marital_status'],
  'safty_rating':a['safty_rating'],'annual_income':a['annual_income'],'high_education_ind':a['high_education_ind'],
   'address_change_ind':a['address_change_ind'],
       'living_status':a['living_status'], 'accident_site':a['accident_site'], 
   'past_num_of_claims':a['past_num_of_claims'],
       'witness_present_ind':a['witness_present_ind'], 'liab_prct':a['liab_prct'], 
   'channel':a['channel'],'policy_report_filed_ind':a['policy_report_filed_ind'], 
   'claim_est_payout':a['claim_est_payout'], 'age_of_vehicle':a['age_of_vehicle'],
       'vehicle_category':a['vehicle_category'], 'vehicle_price':a['vehicle_price'], 
   'vehicle_weight':a['vehicle_weight'], 'latitude':a['latitude'],
       'longitude':a['longitude']
  }

In [30]:
a1=pd.DataFrame(b,index=[0])
a1

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,accident_site,past_num_of_claims,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_weight,latitude,longitude
0,39,F,1,73,36633,1,1,Own,Highway,2,...,25,Broker,1,5196,8,Compact,24360,26633,41.5,-94.52


In [31]:
a2=enc.transform(a1)

In [32]:
lr_clf.predict(a2)

array([0], dtype=int64)

In [33]:
import pickle
with open('encoder.pickle','wb') as f:
    pickle.dump(enc,f)

In [34]:
import pickle
with open('model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [35]:
with open("encoder.pickle", "rb") as f:
    enc1 = pickle.load(f)

In [36]:
with open("model.pickle", "rb") as f:
    md = pickle.load(f)

In [37]:
md.predict(enc1.transform(a1))

array([0], dtype=int64)

## Model Selection

`OneHotEncoder` will dummify categorical features, and numerical features will be re-scaled with `MinMaxScaler`.

In [38]:
categorical_features = X_train.columns[X_train.dtypes == object].tolist()
column_transformer = make_column_transformer(
    (OneHotEncoder(drop="first"), categorical_features),
    remainder="passthrough",
)
scaler = MinMaxScaler()

A simple function that defines the training pipeline: fit the model, predict on the validation set, print the evaluation metric.

In [39]:
def modeling(X_train, y_train, X_val, y_val, steps):
    pipeline = make_pipeline(*steps)
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict_proba(X_val)[:, 1]
    metric = roc_auc_score(y_val, y_val_pred)
    if isinstance(pipeline._final_estimator, RandomizedSearchCV) or isinstance(pipeline._final_estimator, GridSearchCV):
        print(f"Best params: {pipeline._final_estimator.best_params_}")
    print(f"AUC score: {metric}")
    return pipeline

### K-Nearest Neighbor

KNN has two hyperparameters: the number of neighbors, and whether all points in each neighborhood are weighted equally or weighted by the inverse of their distance. Since the number of hyperparameters is small. A grid search is used to find the optimal hyperparameter values.

In [40]:
param_grid = {
    "n_neighbors": [5, 10, 25, 50],
    "weights": ["uniform", "distance"],
}

knn_clf = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    scoring="roc_auc",
)

knn_pipeline = modeling(X_train, y_train, X_val, y_val, [column_transformer, scaler, knn_clf])

Best params: {'n_neighbors': 50, 'weights': 'distance'}
AUC score: 0.6507841602442943


### Logistic Regression

For logistic regression, there is no hyperparameter to tune.

In [41]:
lr_clf = LogisticRegression()
lr_pipeline = modeling(X_train, y_train, X_val, y_val, [column_transformer, scaler, lr_clf])

AUC score: 0.7157014847720347


Look at the model coefficients.

In [42]:
def add_dummies(df, categorical_features):
    dummies = pd.get_dummies(df[categorical_features], drop_first=True)
    df = pd.concat([dummies, df], axis=1)
    df = df.drop(categorical_features, axis=1)
    return df.columns

feature_names = add_dummies(X_train, categorical_features)

pd.DataFrame({
    "feature_name": feature_names,
    "coefficient": lr_pipeline._final_estimator.coef_[0]
}).sort_values(by="coefficient", ascending=False).reset_index(drop=True)

Unnamed: 0,feature_name,coefficient
0,past_num_of_claims,1.75016
1,annual_income,1.570769
2,age_of_vehicle,0.982407
3,address_change_ind,0.398596
4,longitude,0.362837
5,living_status_Rent,0.128913
6,policy_report_filed_ind,0.083922
7,channel_Phone,0.039526
8,liab_prct,0.031912
9,vehicle_weight,0.03177


### XGBoost

Since there are many hyperparameters in XGBoost, I decide to use a randomized search for hyperparameter tuning.

In [43]:
param_grid = {
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    "gamma": [0, 0.25, 0.5, 1.0],
    "n_estimators": [10, 20, 40, 60, 80, 100, 150, 200]
}

xgb_clf = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions=param_grid,
    n_iter=50,
    n_jobs=-1,
    cv=5,
    random_state=23,
    scoring="roc_auc",
)

xgb_pipeline = modeling(X_train, y_train, X_val, y_val, [column_transformer, scaler, xgb_clf])

Best params: {'subsample': 0.6, 'n_estimators': 200, 'min_child_weight': 5.0, 'max_depth': 1, 'learning_rate': 0.2, 'gamma': 0.25, 'colsample_bytree': 0.8, 'colsample_bylevel': 1.0}
AUC score: 0.7294811207445069


Although the class imbalance is not very serious in this dataset, I want to see if using SMOTE to synthesize new examples for the minority class can improve the predictive performance. However, it seems that using SMOTE only worsens the performance.

In [44]:
sampler = SMOTE(random_state=42)
xgb_pipeline_smote = modeling(X_train, y_train, X_val, y_val, [column_transformer, scaler, sampler, xgb_clf])

Best params: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 0.5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.25, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.6}
AUC score: 0.6920917228441159


Save the XGBoost model (without SMOTE), since it has the best performance.

In [45]:
best_model = xgb_pipeline._final_estimator.best_estimator_
steps = [column_transformer, scaler, best_model]
pipeline = make_pipeline(*steps)
y_test_pred = pipeline.predict_proba(X_test)[:, 1]

df = pd.DataFrame({
    "claim_number": df_test["claim_number"],
    "fraud": y_test_pred
})
df.to_csv("../data/submission/submission.csv", index=False)

In [46]:
#WITH OPEN(‘MY_OBJECT.PICKLE’, ‘WB’) AS F:
#PICKLE.DUMP(MY_OBJECT, F)
import pickle
with open('best_model2.pickle','wb') as f:
    pickle.dump(pipeline,f)

To examine which feature is important, I introduce a feature with random numbers. A feature can be considered as important If the importance of that feature is larger than that of the random feature.

In [47]:
X_train["random_feature"] = np.random.uniform(size=len(X_train))
xgb_clf_random_feature = XGBClassifier(**xgb_pipeline._final_estimator.best_params_)
steps = [column_transformer, scaler, xgb_clf_random_feature]
xgb_pipeline_random_feature = make_pipeline(*steps)
xgb_pipeline_random_feature = xgb_pipeline_random_feature.fit(X_train, y_train)

pd.DataFrame({
    "feature_name": list(feature_names) + ["random_feature"],
    "importance": xgb_pipeline_random_feature._final_estimator.feature_importances_
}).sort_values(by="importance", ascending=False).reset_index(drop=True)

Unnamed: 0,feature_name,importance
0,accident_site_Parking Lot,0.119142
1,high_education_ind,0.088361
2,witness_present_ind,0.081875
3,marital_status,0.058424
4,past_num_of_claims,0.055986
5,address_change_ind,0.039236
6,age_of_driver,0.036997
7,claim_est_payout,0.033191
8,safty_rating,0.03293
9,annual_income,0.032379
