In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Background:**

Initially, I tried to reproduce the R solution linked below using target encoding; however, the result was unsatisfactory (a score of around 0.46), whether the transformed train set was fed to CatBoost or XGBoost.

As many articles point out, target encoding easily leads to overfitting unless the preprocessing is handled carefully. In addition, the main issue in this competition is that a substantial number of the unique values in column `v22` of the test data do not appear in the same column of the train data; as a result, the transformation fills in the mean value for most of those unmatched values in `v22` and in the other concatenated columns of the test data.

By contrast, I was able to obtain a satisfactory score of 0.433 with the CatBoost model alone, without target encoding or arithmetic feature combinations. CatBoost proved particularly robust on this high-cardinality data set (most of the concatenated columns have more than 10k unique values).

I learned a lot during the testing process. Please upvote if you find this notebook useful :)


**Reference:**

Feature Engineering [link](https://www.kaggle.com/code/rsakata/xgboost-with-combination-of-factors/comments)<br>
Model [link](https://www.kaggle.com/code/confirm/xfeat-catboost-cpu-only)<br>
Winning Solution (v22 concept) [link](https://www.kaggle.com/competitions/bnp-paribas-cardif-claims-management/discussion/20247)

# Import Data

In [None]:
# Load the competition train and test sets straight from the zipped CSV files.
train = pd.read_csv("../input/bnp-paribas-cardif-claims-management/train.csv.zip")
test = pd.read_csv("../input/bnp-paribas-cardif-claims-management/test.csv.zip")

In [None]:
# Report the raw training frame's (rows, columns) and preview its first rows.
print(train.shape)
train.head()

In [None]:
# Keep the label column aside; it is later passed to CatBoost as `label`,
# after `train` has been narrowed to the selected feature columns.
y= train['target']

In [None]:
# Report the raw test frame's (rows, columns) and preview its first rows.
print(test.shape)
test.head()

# Data Exploration

In [None]:
# Allow up to 200 rows in rendered tables.
pd.set_option('display.max_rows', 200)

# Fraction of missing values per training column, as a one-column frame.
null_counts = train.isnull().sum()
missing_check = (null_counts / len(train)).to_frame(name='missing')
missing_check.head(10)

In [None]:
import operator

# Rank the numeric columns by the absolute value of their correlation with the
# target and keep the 55 strongest entries for feature selection.
# `target` is itself a numeric column, so it takes part in this ranking; it is
# filtered out when the final feature list is assembled.
numer = train.select_dtypes(include=['number']).columns

abs_corr = ((name, abs(train['target'].corr(train[name]))) for name in numer)
# Columns whose correlation is undefined (NaN) are left out.
corr_num = {name: strength for name, strength in abs_corr if not np.isnan(strength)}

sort_num = sorted(corr_num.items(), key=lambda item: item[1], reverse=True)[:55]
sort_num[:10]

In [None]:
# Rank the categorical columns by the absolute correlation between the target
# and each column's integer category codes. Every categorical column is kept:
# all of them feed the 1- and 2-way combinations, and a selected subset feeds
# the 11-way combinations.
#
# The column index is deliberately NOT named `cat`: the modelling section does
# `import catboost as cat`, and re-running this cell afterwards would otherwise
# replace the module alias with a pandas Index. The results also go into their
# own dict so the numeric ranking in `corr_num` is not overwritten.
categorical_columns = train.select_dtypes(exclude=['number']).columns

corr_cat = {}
for col in categorical_columns:
    corr = train['target'].corr(train[col].astype('category').cat.codes)
    # Skip columns whose correlation is undefined (NaN).
    if not np.isnan(corr):
        corr_cat[col] = abs(corr)

sort_cat = sorted(corr_cat.items(), key=operator.itemgetter(1), reverse=True)
sort_cat

# Data Preprocessing

In [None]:
# Final feature list: the ranked numeric columns (minus the label itself)
# followed by every ranked categorical column.
numeric_features = [name for name, _ in sort_num if name != 'target']
categorical_features = [name for name, _ in sort_cat]
use_col = numeric_features + categorical_features

len(use_col)

In [None]:
# Narrow the training frame to the selected features. The explicit .copy()
# makes `train` an independent frame, so the column assignments done during
# imputation do not raise SettingWithCopyWarning.
train = train[use_col].copy()
print(train.shape)
train.head()

In [None]:
# Narrow the test frame to the same selected features as the training frame.
# The explicit .copy() makes `test` an independent frame, so the column
# assignments done during imputation do not raise SettingWithCopyWarning.
test = test[use_col].copy()
print(test.shape)
test.head()

In [None]:
# Split the selected feature names by dtype, each as a NumPy array of column names.
cat_col = train.select_dtypes(exclude=['number']).columns.to_numpy()
num_col = train.select_dtypes(include=['number']).columns.to_numpy()

In [None]:
# Impute missing values using statistics taken from the training set only.
# Numeric columns: fill with the training mean (DataFrame.mean skips NaN);
# the same training means are applied to the test set.
train_means = train[num_col].mean()
train[num_col] = train[num_col].fillna(train_means)
test[num_col] = test[num_col].fillna(train_means)

# Categorical columns: treat "missing" as its own level, 'NA'.
# (The previously computed per-column mode was never used and has been removed.)
train[cat_col] = train[cat_col].fillna('NA')
test[cat_col] = test[cat_col].fillna('NA')

In [None]:
# Total count of remaining missing cells in each frame, to check the imputation.
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

In [None]:
from itertools import combinations

# Every unordered pair of categorical columns becomes one new string feature:
# the two values glued together, with the second column of the pair first.
cc = list(combinations(cat_col, 2))
column_names = [right + left for left, right in cc]
pair_features = [train[right] + train[left] for left, right in cc]
df_comb2 = pd.concat(pair_features, axis=1, keys=column_names)
df_comb2.head()

In [None]:
# Same pairwise features for the test set, reusing the pairs in `cc` and the
# matching `column_names`.
test_pair_features = [test[right] + test[left] for left, right in cc]
test_df_comb2 = pd.concat(test_pair_features, axis=1, keys=column_names)
test_df_comb2.head()

In [None]:
# Categorical columns other than v22, so that v22 is never combined with
# itself (which would give names such as v22v22v33).
cat_col_ex_v22 = cat_col[cat_col != 'v22']
cat_col_ex_v22

In [None]:
from itertools import product

# Three-way features: first form every pair of the non-v22 categorical columns,
# then attach v22 in front of each pair.
cc_ex_v22 = list(combinations(cat_col_ex_v22, 2))
cc_v22_1 = list(product(['v22'], cc_ex_v22))
column_names = [anchor + first + second for anchor, (first, second) in cc_v22_1]
triple_features = [
    train[anchor] + train[first] + train[second]
    for anchor, (first, second) in cc_v22_1
]
df_comb_v22 = pd.concat(triple_features, axis=1, keys=column_names)
df_comb_v22.head()

In [None]:
# Same v22 three-way features for the test set, reusing `cc_v22_1` and the
# matching `column_names`.
test_triple_features = [
    test[anchor] + test[first] + test[second]
    for anchor, (first, second) in cc_v22_1
]
test_df_comb_v22 = pd.concat(test_triple_features, axis=1, keys=column_names)
test_df_comb_v22.head()

In [None]:
# Append the 2-way and v22 3-way combination columns to the training features.
# Note: this rebinds `train`, so running the cell twice would append the columns again.
train = pd.concat([train,df_comb2,df_comb_v22],axis=1)
train.shape

In [None]:
# Append the 2-way and v22 3-way combination columns to the test features.
# Note: this rebinds `test`, so running the cell twice would append the columns again.
test = pd.concat([test,test_df_comb2,test_df_comb_v22],axis=1)
test.shape

In [None]:
import gc

# The combination frames have been concatenated into train/test, so drop the
# standalone references and run a garbage-collection pass.
del df_comb2,df_comb_v22, test_df_comb2, test_df_comb_v22
gc.collect()

In [None]:
# Restrict the categorical list to a subset of columns, to reduce the total
# number of combinations built from it.
del_list = ['v110','v74','v3','v107','v71','v125','v22']

# Filter with a boolean mask rather than np.delete: NumPy releases before 1.19
# cast a boolean `obj` passed to np.delete to the integer indices 0/1, which
# would remove the wrong entries.
cat_col = cat_col[~np.isin(cat_col, del_list)]
cat_col

In [None]:
# Pair v22 with every 10-column combination of the reduced categorical list,
# giving the 11-column groups used for the 11-way features.
cc_ex_v22 = list(combinations(cat_col,10))
cc_v22_11 = list(product(['v22'],cc_ex_v22))
len(cc_v22_11)

In [None]:
# Build one feature per (v22, column-combination) group in `cc_v22_11`.
# Name: the member column names glued together, v22 first.
# Value: the row-wise concatenation of the member columns, in the same
# left-to-right order.
# Each group is unpacked generically rather than by hard-coded positions, so no
# member is silently dropped if the combination size is changed.
column_names = [''.join((anchor, *others)) for anchor, others in cc_v22_11]

train_11 = pd.concat(
    # sum() starts from the v22 Series and appends each member column in turn.
    [sum((train[name] for name in others), train[anchor]) for anchor, others in cc_v22_11],
    axis=1,
    keys=column_names,
)

In [None]:
# Preview the 11-way combination features built for the training set.
train_11.head()

In [None]:
# Append the 11-way combination columns to the training features.
train = pd.concat([train,train_11],axis=1)
train.shape

In [None]:
# Build the same 11-way features for the test set: for each (v22, combination)
# group in `cc_v22_11`, concatenate the member columns row-wise, v22 first.
# Each group is unpacked generically rather than by hard-coded positions, so no
# member is silently dropped if the combination size is changed.
test_11 = pd.concat(
    # sum() starts from the v22 Series and appends each member column in turn.
    [sum((test[name] for name in others), test[anchor]) for anchor, others in cc_v22_11],
    axis=1,
    keys=column_names,
)

# Append the new columns to the test features.
test = pd.concat([test,test_11],axis=1)
test.shape

In [None]:
# The 11-way frames have been concatenated into train/test, so drop the
# standalone references and run a garbage-collection pass.
del test_11,train_11
gc.collect()

In [None]:
# Recompute the categorical column list from the widened training frame so that
# it also covers the combination columns; it is later passed to CatBoost as cat_features.
cat_col = train.select_dtypes(exclude=['number']).columns.values

In [None]:
# Originally the train data was target-encoded before being fitted with CatBoost; however, the score was always around 0.46.
# fit_transform must be called on the train set directly (rather than fit followed by transform), otherwise the model overfits severely.

# from category_encoders import leave_one_out

# te = leave_one_out.LeaveOneOutEncoder(verbose=0,cols=cat_col,random_state=42,sigma=0.05)
# train = te.fit_transform(train,y)
# test = te.transform(test)

In [None]:
# Final check of the training features: dimensions, total missing cells, first rows.
print(train.shape)
print(train.isnull().sum().sum())
train.head()

In [None]:
# Final check of the test features: dimensions, total missing cells, first rows.
print(test.shape)
print(test.isnull().sum().sum())
test.head()

# Model

## CatBoost

In [None]:
import catboost as cat

# Hyper-parameters for the CatBoost classifier.
# 2400 iterations gave the best score on a validation set.
params = dict(
    loss_function="Logloss",
    eval_metric="Logloss",
    learning_rate=0.03,
    iterations=2400,
    l2_leaf_reg=3,
    random_seed=432013,
    subsample=0.66,
    od_type="Iter",
    rsm=0.2,
    depth=6,
    border_count=128,
)

In [None]:
# from sklearn.model_selection import train_test_split

# X_train1, X_val, y_train1, y_val = train_test_split(train, y, test_size=0.2, random_state=42,stratify=y)

In [None]:
# model = cat.CatBoostClassifier(**params)
# train_data = cat.Pool(X_train1, label=y_train1,cat_features=cat_col)
# val_data = cat.Pool(X_val, label=y_val,cat_features=cat_col)

# fit_model = model.fit(train_data, verbose=30,eval_set=val_data,early_stopping_rounds=100)

In [None]:
# In this model the numeric columns are kept as numeric features. Converting the
# numeric values to strings and passing them via cat_features should give a better score.

model = cat.CatBoostClassifier(**params)
# Wrap the frames in CatBoost Pools, declaring every non-numeric column as categorical.
train_data = cat.Pool(train, label=y,cat_features=cat_col)
test_data = cat.Pool(test,cat_features=cat_col)

# Fit on the full training set (no validation split), with verbose=0.
fit_model = model.fit(train_data, verbose=0)

In [None]:
# Allow up to 500 rows in rendered tables.
pd.set_option('display.max_rows', 500)

# Pair each training column with its importance and show the 20 largest.
importance_table = pd.DataFrame({
    'feature_importance': model.get_feature_importance(train_data),
    'feature_names': train.columns,
})
importance_table.sort_values(by=['feature_importance'], ascending=False).head(20)

In [None]:
# Predict probabilities for the test pool; column 1 of the result is what gets submitted.
y_pred = fit_model.predict_proba(test_data)

## XGBoost

XGBoost with target encoding on the same features only reaches a score of 0.46 or above, so this model was not adopted.

In [None]:
# import xgboost as xgb

# nrounds  = 500
# params = {
#     "eta": 0.05,
#     "max_depth": 6,
#     "colsample_bylevel": 0.3,
#     "objective": 'binary:logistic',
#     "eval_metric": 'logloss'}

# dtrain = xgb.DMatrix(train, y)
# dtest = xgb.DMatrix(test)
# watchlist = [(dtrain, 'train')]
# model = xgb.train(params=params, dtrain=dtrain, num_boost_round=nrounds,evals=watchlist,verbose_eval=True )

In [None]:
# y_pred = model.predict(dtest)

In [None]:
# Summary statistics of column 1 of the predictions (the values written to the submission).
pd.DataFrame(y_pred[:,1]).describe()

# Submission

In [None]:
# Load the sample submission file to use as the template for the output.
submission = pd.read_csv("../input/bnp-paribas-cardif-claims-management/sample_submission.csv.zip")

In [None]:
# Write column 1 of the predictions into the template.
# The assignment is positional, so it assumes the test rows are in the same
# order as the sample submission rows — TODO confirm.
submission['PredictedProb'] = y_pred[:,1]
submission.head()

In [None]:
# Save the submission to the working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)