# Binary Classification of Insurance Cross Selling

## Import Libraries

In [1]:
!pip install catboost lightgbm xgboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# For Cross Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# For Machine Learning Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


pd.set_option('display.max_rows', 500)
pd.set_option('display.float_format', '{:.6f}'.format)

## Import Datasets

In [3]:
# Import the datasets from csv files
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')

## Explore the structure of the dataset

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [5]:
train.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,11504798.0,11504798.0,11504798.0,11504798.0,11504798.0,11504798.0,11504798.0,11504798.0,11504798.0
mean,5752398.5,38.383563,0.998022,26.41869,0.462997,30461.370411,112.425442,163.897744,0.122997
std,3321149.255474,14.993459,0.044431,12.99159,0.498629,16454.745205,54.035708,79.979531,0.328434
min,0.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,2876199.25,24.0,1.0,15.0,0.0,25277.0,29.0,99.0,0.0
50%,5752398.5,36.0,1.0,28.0,0.0,31824.0,151.0,166.0,0.0
75%,8628597.75,49.0,1.0,35.0,1.0,39451.0,152.0,232.0,0.0
max,11504797.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [6]:
train.describe(include='object')

Unnamed: 0,Gender,Vehicle_Age,Vehicle_Damage
count,11504798,11504798,11504798
unique,2,3,2
top,Male,1-2 Year,Yes
freq,6228134,5982678,5783229


In [7]:
train.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [8]:
train.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

## Feature Engineering

In [9]:
# Fill the Response column in test as 0
test['Response'] = 0

In [10]:
# Combine test and train datasets
df = pd.concat([train, test])

In [11]:
# Convert Region_Code and Policy_Sales_Channel into int type
for col in ['Region_Code', 'Policy_Sales_Channel','Annual_Premium']:
    df[col] = df[col].astype(np.int32)

In [12]:
## Numerical Mapping for columns Gender, Vehicle_Age and Vehicle_Damage

# Encoding Gender
gender_mapping = {'Female': 0,'Male': 1}
df['Gender'] = df['Gender'].map(gender_mapping)

# Encoding Vehicle Age
vehicle_age_mapping = {'< 1 Year': 0,'1-2 Year': 1,'> 2 Years': 2}
df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)

# Encoding Vehicle_Damage
vehicle_damage_mapping = {'Yes': 1,'No': 0}
df['Vehicle_Damage'] = df['Vehicle_Damage'].map(vehicle_damage_mapping)

# Drop id column
df.drop(columns=['id'],inplace=True)

In [13]:
# Create interaction variables using pd.factorize
def factorize_combination(df, col1, col2):
    return pd.factorize((df[col1].astype(str) + df[col2].astype(str)).to_numpy())[0]

def interaction_variables(df):
    df['Previously_Insured_Annual_Premium'] = factorize_combination(df, 'Previously_Insured', 'Annual_Premium')
    df['Previously_Insured_Vehicle_Age'] = factorize_combination(df, 'Previously_Insured', 'Vehicle_Age')
    df['Previously_Insured_Vehicle_Damage'] = factorize_combination(df, 'Previously_Insured', 'Vehicle_Damage')
    df['Previously_Insured_Vintage'] = factorize_combination(df, 'Previously_Insured', 'Vintage')
    return df

df = interaction_variables(df)

In [14]:
# Function to optimize memory usage
def optimize_memory_usage(df):
    print('Optimizing memory usage')
    start_mem_usage = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'Memory usage before: {start_mem_usage:.2f} MB')
    print(f'Memory usage after: {end_mem_usage:.2f} MB')
    print(f'Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')

    return df

df = optimize_memory_usage(df)

Optimizing memory usage
Memory usage before: 2121.22 MB
Memory usage after: 585.16 MB
Reduced memory usage by 72.4%


In [15]:
# Separate preprocessed train and test from combined dataset
train = df[:train.shape[0]]
test = df[train.shape[0]:]

In [16]:
# Create X and y 
y = train['Response']
X = train.drop(columns=['Response'])
test = test.drop(columns=['Response'])

In [17]:
del df,train

### Stratified K-fold Cross Validation For Classification Models

In [18]:
def skfold_cv(X, y, algorithm, params, n_splits=2, n_jobs=-1):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    oof_pred = np.zeros(len(y))  # Array to store out-of-fold predictions
    
    for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = algorithm(**params)
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_val)[:,1]
        
        oof_pred[val_index] = pred  # Store predictions in out-of-fold array

        # Calculate and print ROC AUC for the current fold
        fold_auc = roc_auc_score(y_val, pred)
        print(f'Fold {fold} - ROC AUC Score: {fold_auc:.4f}')
    
    # Calculate the ROC AUC score using the out-of-fold predictions
    auc = roc_auc_score(y, oof_pred)
    
    print(f'Average ROC AUC across all folds: {auc:.4f}')
    
    return model, auc


In [19]:
lgbm_params = {}    
lgbm_model,lgbm_scores = skfold_cv(X, y, LGBMClassifier, lgbm_params)

[LightGBM] [Info] Number of positive: 707530, number of negative: 5044869
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.328010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1256
[LightGBM] [Info] Number of data points in the train set: 5752399, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964347
[LightGBM] [Info] Start training from score -1.964347
Fold 1 - ROC AUC Score: 0.8766
[LightGBM] [Info] Number of positive: 707529, number of negative: 5044870
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.315821 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1257
[LightGBM] [Info] Number of data points in the train set: 5752399, numbe

In [20]:
xgb_params = {}   
xgb_model,xgb_scores = skfold_cv(X, y, XGBClassifier, xgb_params)

Fold 1 - ROC AUC Score: 0.8796
Fold 2 - ROC AUC Score: 0.8802
Average ROC AUC across all folds: 0.8799


In [21]:
cat_params = {'verbose':1}
    
cat_model, cat_scores = skfold_cv(X, y, CatBoostClassifier, cat_params)

Learning rate set to 0.415326
0:	learn: 0.3283263	total: 765ms	remaining: 12m 44s
1:	learn: 0.2789434	total: 1.41s	remaining: 11m 42s
2:	learn: 0.2709276	total: 2.01s	remaining: 11m 8s
3:	learn: 0.2675340	total: 2.63s	remaining: 10m 55s
4:	learn: 0.2658352	total: 3.19s	remaining: 10m 35s
5:	learn: 0.2649139	total: 3.8s	remaining: 10m 29s
6:	learn: 0.2645402	total: 4.36s	remaining: 10m 18s
7:	learn: 0.2636149	total: 4.96s	remaining: 10m 15s
8:	learn: 0.2631434	total: 5.62s	remaining: 10m 19s
9:	learn: 0.2626930	total: 6.17s	remaining: 10m 10s
10:	learn: 0.2620001	total: 6.77s	remaining: 10m 8s
11:	learn: 0.2616105	total: 7.41s	remaining: 10m 10s
12:	learn: 0.2614223	total: 8.21s	remaining: 10m 22s
13:	learn: 0.2613062	total: 9s	remaining: 10m 34s
14:	learn: 0.2610682	total: 9.71s	remaining: 10m 37s
15:	learn: 0.2607874	total: 10.3s	remaining: 10m 36s
16:	learn: 0.2605665	total: 10.9s	remaining: 10m 29s
17:	learn: 0.2603881	total: 11.4s	remaining: 10m 23s
18:	learn: 0.2601380	total: 12.1

In [22]:
# Stratified K-Fold Cross Validation for Hyperparameter Tuned CatBoost Model
from catboost import Pool

def skfold_cv(X, y, model_class, model_params, n_splits=2, n_jobs=1):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    models = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
        val_pool = Pool(X_val, y_val, cat_features=X_val.columns.values)

        model = model_class(**model_params)
        model.fit(
            train_pool,
            eval_set=val_pool,
            early_stopping_rounds=model_params['early_stopping_rounds'],
            use_best_model=model_params['use_best_model'],
            verbose=model_params['verbose']
        )

        score = model.get_best_score()['validation']['AUC']
        scores.append(score)
        models.append(model)

        print(f"Fold {fold + 1} AUC: {score}")
        
        gc.collect()
        
    return models, scores

In [23]:
# Train CatBoost model with parameters

cat_params = {
    'loss_function':'Logloss',
    'eval_metric':'AUC',
    'class_names':[0, 1],
    'learning_rate':0.05,
    'iterations':5000,
    'depth':12,
    'random_strength':0,
    'l2_leaf_reg':0.5,
    'task_type':'GPU',
    'random_seed':42,
    'verbose':False,
    'early_stopping_rounds': 200, 
    'use_best_model': True}
    
cat_model, cat_scores = skfold_cv(X, y, CatBoostClassifier, cat_params)

Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 AUC: 0.8927862048149109


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 AUC: 0.8930912613868713


In [24]:
# Predicting the Test Results
pred = cat_model.predict_proba(test)[:,1]

In [25]:
# Adding values to the submission file
submission = pd.read_csv('/kaggle/input/playground-series-s4e7/sample_submission.csv')
submission['Response'] = pred

submission.to_csv('/kaggle/working/submission.csv', index=False)