In [60]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_val_predict,KFold, StratifiedKFold
from catboost import CatBoostClassifier
pd.set_option('display.max_columns', None)

/kaggle/input/av-janatahack-crosssell-prediction/sample.csv
/kaggle/input/av-janatahack-crosssell-prediction/test.csv
/kaggle/input/av-janatahack-crosssell-prediction/train.csv


In [61]:
train = pd.read_csv("../input/av-janatahack-crosssell-prediction/train.csv")
test = pd.read_csv("../input/av-janatahack-crosssell-prediction/test.csv")
sample = pd.read_csv("../input/av-janatahack-crosssell-prediction/sample.csv")

In [62]:
train.drop('id',axis=1,inplace=True)
test.drop('id',axis=1,inplace=True)

In [63]:
train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [64]:
train.isnull().sum()

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [65]:
#Remove duplicate rows from training set
train.drop_duplicates(keep='first',inplace = True)
train.duplicated().sum()

0

In [66]:
train['Response'].value_counts()

0    334155
1     46685
Name: Response, dtype: int64

In [67]:
cat_cols = train.select_dtypes(include = 'object')
num_cols = train.select_dtypes(include=['int64','float64'])

In [68]:
cat_cols

Unnamed: 0,Gender,Vehicle_Age,Vehicle_Damage
0,Male,> 2 Years,Yes
1,Male,1-2 Year,No
2,Male,> 2 Years,Yes
3,Male,< 1 Year,No
4,Female,< 1 Year,No
...,...,...,...
381104,Male,1-2 Year,No
381105,Male,< 1 Year,No
381106,Male,< 1 Year,No
381107,Female,> 2 Years,Yes


In [69]:
num_cols

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,44,1,28.0,0,40454.0,26.0,217,1
1,76,1,3.0,0,33536.0,26.0,183,0
2,47,1,28.0,0,38294.0,26.0,27,1
3,21,1,11.0,1,28619.0,152.0,203,0
4,29,1,41.0,1,27496.0,152.0,39,0
...,...,...,...,...,...,...,...,...
381104,74,1,26.0,1,30170.0,26.0,88,0
381105,30,1,37.0,1,40016.0,152.0,131,0
381106,21,1,30.0,1,35118.0,160.0,161,0
381107,68,1,14.0,0,44617.0,124.0,74,0


In [70]:
train.describe()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,380840.0,380840.0,380840.0,380840.0,380840.0,380840.0,380840.0,380840.0
mean,38.823593,0.997868,26.388307,0.458208,30584.016078,112.030225,154.344607,0.122584
std,15.513206,0.046126,13.229907,0.498251,17203.258664,54.206194,83.67024,0.32796
min,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,25.0,1.0,15.0,0.0,24426.0,29.0,82.0,0.0
50%,36.0,1.0,28.0,0.0,31678.5,133.0,154.0,0.0
75%,49.0,1.0,35.0,1.0,39408.0,152.0,227.0,0.0
max,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [71]:
combine = train.append(test)

In [72]:
combine['Vintage'] = combine['Vintage']/365

In [73]:
combine['Vehicle_Age']=combine['Vehicle_Age'].replace({'< 1 Year':0,'1-2 Year':1,'> 2 Years':2})

In [74]:
combine['Vehicle_Damage']=combine['Vehicle_Damage'].replace({'Yes':1,'No':0})
combine['Gender']=combine['Gender'].replace({'Male':1,'Female':0})

In [75]:
#Analysis show that people in Age Group 30-60 have higher response rate so creating a separate feature
combine['Age_Group'] = np.where((combine['Age']<30) & (combine['Age'] > 60),0,1)

In [76]:
#Creatig separate groupings for region_code and sales_channels

def find_non_rare_labels(df, variable, tolerance):
    
    temp = df.groupby([variable])[variable].count() / len(df)
    
    non_rare = [x for x in temp.loc[temp>tolerance].index.values]
    
    return non_rare

def rare_encoding(data, variable, tolerance,new_col):
    frequent_cat = find_non_rare_labels(data, variable, tolerance)

    # re-group rare labels
    data[new_col] = np.where(data[variable].isin(
        frequent_cat), data[variable], 'Rare')



for variable in ['Policy_Sales_Channel']:
    
     rare_encoding(combine, variable, 0.01,'Policy_Sales_Channel_Group')
        
for variable in ['Region_Code']:
    
     rare_encoding(combine, variable, 0.02,'Region_Code_Group')

In [77]:
combine['IsPreviouslyInsuredandVehicleDamaged'] = np.where((combine['Previously_Insured']==0) & (combine['Vehicle_Damage']==1),1,0)
combine['IsVehicleDamagedandDrivingLicense'] = np.where((combine['Vehicle_Damage']==1) & (combine['Driving_License']==1),1,0)
combine['TotalAmountPaidTillDate'] = combine['Annual_Premium']*combine['Vintage']
combine['PremiumperRegion'] = combine.groupby('Region_Code')['Annual_Premium'].transform('mean')
combine['PremiumperPolicy_Sales_Channel'] = combine.groupby('Policy_Sales_Channel')['Annual_Premium'].transform('mean')
combine['AvgVehicleAgePerRegion'] = combine.groupby('Policy_Sales_Channel')['Annual_Premium'].transform('mean')
combine['AvgCustomerAgeRegionWise'] = combine.groupby('Region_Code')['Age'].transform('mean')
combine['AvgCustomerAgeSaleChannelWise'] = combine.groupby('Policy_Sales_Channel')['Age'].transform('mean')
combine['SaleChannelsPerRegion'] = combine.groupby('Region_Code')['Policy_Sales_Channel'].transform('nunique')
combine['RegionwisePreviouslyInsured'] = combine.groupby('Region_Code')['Previously_Insured'].transform('count')
combine['RegionwiseVintage'] = combine.groupby('Region_Code')['Vintage'].transform('mean').astype('int')
combine['SaleChannelwiseVintage'] = combine.groupby('Policy_Sales_Channel')['Vintage'].transform('mean').astype('int')

In [78]:
combine['AvgRegionGenderWisePremium'] = combine.groupby(['Region_Code','Gender'])['Annual_Premium'].transform('mean')
combine['NoPeoplePrevInsuredRegionGenderWise'] = combine.groupby(['Region_Code','Gender'])['Previously_Insured'].transform('count')
combine['NoPeoplePrevInsuredSalesChannelGenderWise'] = combine.groupby(['Policy_Sales_Channel','Gender'])['Previously_Insured'].transform('count')
combine['NoPeoplePrevInsuredSalesChannelRegionWise'] = combine.groupby(['Region_Code','Policy_Sales_Channel'])['Previously_Insured'].transform('count')
combine['AvgCustomerDurationRegionGenderWise'] = combine.groupby(['Region_Code','Gender'])['Vintage'].transform('mean')


In [79]:
combine['InsuranceLicense'] = combine['Driving_License'].astype('str') + '' + combine['Previously_Insured'].astype('str')
combine['InsuranceGender'] = combine['Gender'].astype('str') + '' + combine['Previously_Insured'].astype('str')

In [80]:
# changing data type because cat_feature in catboost cannot be float
combine['Region_Code']=combine['Region_Code'].astype(int)
combine['Policy_Sales_Channel']=combine['Policy_Sales_Channel'].astype(int)

In [82]:
# categorical column 
cat_col=['Gender','Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage','Policy_Sales_Channel','InsuranceLicense','InsuranceGender','Policy_Sales_Channel_Group','Region_Code_Group']

In [83]:
combine.head(30)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Age_Group,Policy_Sales_Channel_Group,Region_Code_Group,IsPreviouslyInsuredandVehicleDamaged,IsVehicleDamagedandDrivingLicense,TotalAmountPaidTillDate,PremiumperRegion,PremiumperPolicy_Sales_Channel,AvgVehicleAgePerRegion,AvgCustomerAgeRegionWise,AvgCustomerAgeSaleChannelWise,SaleChannelsPerRegion,RegionwisePreviouslyInsured,RegionwiseVintage,SaleChannelwiseVintage,AvgRegionGenderWisePremium,NoPeoplePrevInsuredRegionGenderWise,NoPeoplePrevInsuredSalesChannelGenderWise,NoPeoplePrevInsuredSalesChannelRegionWise,AvgCustomerDurationRegionGenderWise,InsuranceLicense,InsuranceGender
0,1,44,1,28,0,2,1,40454.0,26,0.594521,1.0,1,26.0,28.0,1,1,24050.734247,38669.911573,33655.174723,33655.174723,46.224374,50.630899,127,141857,0,0,38693.739838,80081,65588,50085,0.422864,10,10
1,1,76,1,3,0,1,0,33536.0,26,0.50137,0.0,1,26.0,3.0,0,0,16813.939726,24476.059548,33655.174723,33655.174723,36.98647,50.630899,64,12343,0,0,24507.248873,6875,65588,2240,0.424436,10,10
2,1,47,1,28,0,2,1,38294.0,26,0.073973,1.0,1,26.0,28.0,1,1,2832.706849,38669.911573,33655.174723,33655.174723,46.224374,50.630899,127,141857,0,0,38693.739838,80081,65588,50085,0.422864,10,10
3,1,21,1,11,1,0,0,28619.0,152,0.556164,0.0,1,152.0,11.0,0,0,15916.868493,27661.274483,30915.071419,30915.071419,34.237566,26.212073,53,12325,0,0,27303.489803,6178,77555,6732,0.422356,11,11
4,0,29,1,41,1,0,0,27496.0,152,0.106849,0.0,1,152.0,41.0,0,0,2937.928767,30957.106547,30915.071419,30915.071419,36.404665,26.212073,68,24393,0,0,31018.709672,11425,101864,11498,0.424287,11,1
5,0,24,1,33,0,0,1,2630.0,160,0.482192,0.0,1,160.0,33.0,1,1,1268.164384,27860.56068,25502.826513,25502.826513,37.723301,23.452235,65,10300,0,0,28186.185261,4858,15660,286,0.422391,10,0
6,1,23,1,11,0,0,1,23367.0,152,0.682192,0.0,1,152.0,11.0,1,1,15940.775342,27661.274483,30915.071419,30915.071419,34.237566,26.212073,53,12325,0,0,27303.489803,6178,77555,6732,0.422356,10,10
7,0,56,1,28,0,1,1,32031.0,26,0.19726,1.0,1,26.0,28.0,1,1,6318.443836,38669.911573,33655.174723,33655.174723,46.224374,50.630899,127,141857,0,0,38639.022695,61776,40952,50085,0.422343,10,0
8,0,24,1,3,1,0,0,27619.0,152,0.076712,0.0,1,152.0,3.0,0,0,2118.717808,24476.059548,30915.071419,30915.071419,36.98647,26.212073,64,12343,0,0,24436.844733,5468,101864,4379,0.424273,11,1
9,0,32,1,6,1,0,0,28771.0,152,0.219178,0.0,1,152.0,Rare,0,0,6305.972603,25098.075707,30915.071419,30915.071419,30.178725,26.212073,38,8348,0,0,25112.02683,4249,101864,5795,0.427021,11,1


In [84]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 507877 entries, 0 to 127036
Data columns (total 33 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   Gender                                     507877 non-null  int64  
 1   Age                                        507877 non-null  int64  
 2   Driving_License                            507877 non-null  int64  
 3   Region_Code                                507877 non-null  int64  
 4   Previously_Insured                         507877 non-null  int64  
 5   Vehicle_Age                                507877 non-null  int64  
 6   Vehicle_Damage                             507877 non-null  int64  
 7   Annual_Premium                             507877 non-null  float64
 8   Policy_Sales_Channel                       507877 non-null  int64  
 9   Vintage                                    507877 non-null  float64
 10  Response

In [85]:
train = combine[combine['Response'].isnull()!= True]
test = combine[combine['Response'].isnull()== True]

In [86]:
test.drop(['Response'],axis=1,inplace=True)

## Model

In [87]:
X = train.drop(["Response"], axis=1)
Y = train["Response"]

In [92]:

oof_pred               = np.zeros((len(train),))
y_pred_final           = np.zeros((len(test),))
num_models             = 3

n_splits               = 20
error                  = []

kf=StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=294)
    
for i,(train_idx,val_idx) in enumerate(kf.split(X,Y)):    
    
    wghts                     = [0]*num_models
    test_roc_score            = []
    
    
    X_train, y_train = X.iloc[train_idx,:], Y.iloc[train_idx]

    X_val, y_val = X.iloc[val_idx, :], Y.iloc[val_idx]
    

    print('\nFold: {}\n'.format(i+1))

    model1 = CatBoostClassifier(learning_rate = 0.03,random_state=42,scale_pos_weight=7, custom_metric=['AUC'])
    model1.fit(X_train,y_train,cat_features=cat_col,eval_set=(X_val, y_val),early_stopping_rounds=30,verbose=100)
    testpred1 = model1.predict_proba(X_val)[:,1]
    test_roc_score.append(roc_auc_score(y_val, testpred1))
    print("Test ROC AUC for model 1: %.4f"%(roc_auc_score(y_val, testpred1)))
    
    model2 = CatBoostClassifier(learning_rate = 0.04,random_state=42,scale_pos_weight=7, custom_metric=['AUC'])
    model2.fit(X_train,y_train,cat_features=cat_col,eval_set=(X_val, y_val),early_stopping_rounds=40,verbose=100)
    testpred2 = model2.predict_proba(X_val)[:,1]
    test_roc_score.append(roc_auc_score(y_val, testpred2))
    print("Test ROC AUC for model 2: %.4f"%(roc_auc_score(y_val, testpred2)))
    
    model3 = CatBoostClassifier(learning_rate = 0.05,random_state=42,scale_pos_weight=7, custom_metric=['AUC'])
    model3.fit(X_train,y_train,cat_features=cat_col,eval_set=(X_val, y_val),early_stopping_rounds=20,verbose=100)
    testpred3 = model3.predict_proba(X_val)[:,1]
    test_roc_score.append(roc_auc_score(y_val, testpred3))
    print("Test ROC AUC for model 3: %.4f"%(roc_auc_score(y_val, testpred3)))
    
    wghts              = np.exp(-1000*np.array(test_roc_score/sum(test_roc_score)))
    wghts              = wghts/sum(wghts)
    
    val_pred           = wghts[0]*testpred1+wghts[1]*testpred2 +wghts[2]*testpred3
    print('validation roc_auc_score fold-',i+1,': ',roc_auc_score(y_val, val_pred))
    
    oof_pred[val_idx]  = val_pred
    y_pred_final += (wghts[0]*model1.predict_proba(test)[:,1]+wghts[1]*model2.predict_proba(test)[:,1]+wghts[2]*model3.predict_proba(test)[:,1])/(n_splits)
    
    print('\n')
    
print('OOF ROC_AUC_Score:- ',(roc_auc_score(Y,oof_pred)))




Fold: 1

0:	learn: 0.6688680	test: 0.6688658	best: 0.6688658 (0)	total: 1.09s	remaining: 18m 9s
100:	learn: 0.4239789	test: 0.4208600	best: 0.4208600 (100)	total: 1m 13s	remaining: 10m 51s
200:	learn: 0.4200553	test: 0.4170332	best: 0.4170332 (200)	total: 2m 26s	remaining: 9m 43s
300:	learn: 0.4184494	test: 0.4160873	best: 0.4160873 (300)	total: 3m 38s	remaining: 8m 27s
400:	learn: 0.4173717	test: 0.4156347	best: 0.4156130 (397)	total: 4m 46s	remaining: 7m 8s
500:	learn: 0.4160385	test: 0.4153952	best: 0.4153952 (500)	total: 6m 1s	remaining: 5m 59s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.4153851284
bestIteration = 508

Shrink model to first 509 iterations.
Test ROC AUC for model 1: 0.8615
0:	learn: 0.6611679	test: 0.6611625	best: 0.6611625 (0)	total: 785ms	remaining: 13m 4s
100:	learn: 0.4216088	test: 0.4184350	best: 0.4184350 (100)	total: 1m 14s	remaining: 11m
200:	learn: 0.4186557	test: 0.4160230	best: 0.4160230 (200)	total: 2m 25s	remaining: 9m 40s
300:	

300:	learn: 0.4179715	test: 0.4226168	best: 0.4226168 (300)	total: 3m 36s	remaining: 8m 22s
400:	learn: 0.4168927	test: 0.4223558	best: 0.4223546 (372)	total: 4m 51s	remaining: 7m 15s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.4223546343
bestIteration = 372

Shrink model to first 373 iterations.
Test ROC AUC for model 1: 0.8543
0:	learn: 0.6610728	test: 0.6613680	best: 0.6613680 (0)	total: 784ms	remaining: 13m 3s
100:	learn: 0.4214165	test: 0.4243737	best: 0.4243737 (100)	total: 1m 13s	remaining: 10m 54s
200:	learn: 0.4182162	test: 0.4226300	best: 0.4226300 (200)	total: 2m 24s	remaining: 9m 35s
300:	learn: 0.4168644	test: 0.4222987	best: 0.4222978 (298)	total: 3m 33s	remaining: 8m 15s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.42215563
bestIteration = 321

Shrink model to first 322 iterations.
Test ROC AUC for model 2: 0.8545
0:	learn: 0.6535474	test: 0.6539099	best: 0.6539099 (0)	total: 784ms	remaining: 13m 3s
100:	learn: 0.4201515	tes

KeyboardInterrupt: 

In [93]:
y_pred_final

array([0.00100744, 0.2672109 , 0.25997613, ..., 0.00060476, 0.00035905,
       0.00460216])

In [94]:

sample['Response']=y_pred_final
sample.to_csv("cat_final.csv", index = False)