In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import  auc, roc_curve, classification_report 

from lightgbm import LGBMClassifier, plot_importance

In [None]:
# Read the labelled training split and the unlabelled test split from the
# Kaggle input directory (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/janatahack-crosssell-prediction/train.csv',
        '/kaggle/input/janatahack-crosssell-prediction/test.csv',
    )
)


In [None]:
# Stack train on top of test (row-wise) so the encoding and feature
# engineering below are applied to both splits in one pass.
# Each frame keeps its own index labels here; the later split back into
# train/test is positional (iloc), so repeated labels are not a problem for it.
data = pd.concat((train, test))
data.head()

In [None]:
# Lookup tables that turn the string categoricals into integer codes.

# Gender -> binary flag.
gender_bias = {'Male': 0, 'Female': 1}

# Vehicle age buckets -> ordinal rank (youngest bucket is 0).
vehicle = {
    bucket: rank
    for rank, bucket in enumerate(('< 1 Year', '1-2 Year', '> 2 Years'))
}

# Previous vehicle damage -> binary flag.
vehicle_damage = {'No': 0, 'Yes': 1}

In [None]:
# Encode the three string categoricals as integers using the lookup tables
# defined in the previous cell.
#
# A plain `Series.map(dict)` turns every value that is not a key into NaN
# without warning. That hides unexpected labels, and it wipes the whole
# column if this cell is executed a second time (the values are then already
# integer codes, not keys). The version below is safe to re-run and fails
# loudly on anything it does not recognise.
for column, codes in (
    ('Gender', gender_bias),
    ('Vehicle_Age', vehicle),
    ('Vehicle_Damage', vehicle_damage),
):
    # Known labels are translated; anything else (including values that are
    # already codes, and NaN) passes through unchanged for the check below.
    encoded = data[column].map(lambda value, codes=codes: codes.get(value, value))

    # After encoding, every non-null value must be one of the target codes.
    unexpected = set(encoded.dropna().unique()) - set(codes.values())
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains values with no encoding: {sorted(map(str, unexpected))}"
        )

    data[column] = encoded

In [None]:
# Group-level summary features: for every (grouping column, numeric column)
# pair, broadcast the group's count / mean / std / min / max back onto each
# row as a new column named "<group>_<value>_<stat>".
group_vars = ['Region_Code', 'Policy_Sales_Channel']
agg_vars = ['Annual_Premium', 'Vintage', 'Age']

for g in group_vars:
    for a in agg_vars:
        # Build the grouped column once and reuse it for all five statistics.
        per_group = data.groupby(g)[a]

        # New column name -> aggregation to broadcast (insertion order is the
        # order in which the columns are appended to `data`).
        new_columns = {
            f'{g}_{a}_count': 'count',
            f'{g}_{a}_mean': 'mean',
            f'{g}_{a}_std': 'std',
            f'{g}_{a}_min': 'min',
            f'{g}_{a}_max': 'max',
        }
        for column_name, stat in new_columns.items():
            # NOTE: 'std' is NaN for a group with a single row (pandas uses
            # the sample standard deviation by default).
            data[column_name] = per_group.transform(stat)

In [None]:
! pip install pycaret

In [None]:
import pycaret

In [None]:
# Undo the earlier concatenation by position: the first len(train) rows are
# the labelled training rows, the remainder are the test rows.
# NOTE: despite its name, `Y` is the test-set feature frame, not a target
# vector.
#
# Explicit copies are taken because `X` is modified in a later cell; working
# on an independent frame avoids pandas' SettingWithCopyWarning and any
# ambiguity about whether `data` is affected.
X = data.iloc[:len(train)].copy()
Y = data.iloc[len(train):].copy()

# Sanity check: the last training rows should carry real Response labels.
X['Response'].tail()

In [None]:
# Forward-fill any remaining missing values in the training frame (each gap
# takes the value from the closest preceding row in the same column).
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, and
# rebinding `X` avoids an in-place write on a frame sliced from `data`.
# NOTE(review): only `X` is filled here; the test frame `Y` is left as-is.
X = X.ffill()

In [None]:
# Per-column count of missing values left in the training frame.
X.isna().sum()

In [None]:
from pycaret.classification import *

In [None]:
# Display the training frame after feature engineering and filling.
X

In [None]:
# Modelling frame: the training rows without the `id` column.
df = X.drop(columns=['id'])

In [None]:
# Cast the target to integer; bracket access is used so the assignment
# unambiguously targets the column.
df['Response'] = df['Response'].astype(int)

In [None]:
# Inspect the target column after the integer cast.
df['Response']

In [None]:
# First PyCaret experiment: default preprocessing with `Response` as the
# target and experiment logging switched on.
# `session_id` fixes PyCaret's random seed so the internal train/validation
# split and cross-validation results are the same on every
# Restart & Run All.
session_1 = setup(
    data=df,
    target='Response',
    log_experiment=True,
    session_id=55,
)

In [None]:
# Let PyCaret compare its candidate models on the current setup.
# NOTE: the value stored in `best_model` here is overwritten by the
# create_model('rf') cell below before it is used anywhere.
best_model = compare_models()

In [None]:
# Show the models PyCaret makes available in this session.
models()

In [None]:
# Train the model registered under the id 'rf'; this replaces whatever
# compare_models() stored in `best_model`.
best_model = create_model('rf')

In [None]:
# Display the fitted estimator and its hyperparameters.
best_model

In [None]:
# Hyperparameter-tune the current `best_model`.
# NOTE: despite the name `tuned_gbc`, the estimator being tuned is the 'rf'
# model created above, not a gradient boosting classifier.
tuned_gbc = tune_model(best_model)

In [None]:
# Display the tuned estimator and its selected hyperparameters.
tuned_gbc

In [None]:
# Draw PyCaret's default diagnostic plot for the tuned model (no `plot`
# argument is given).
plot_model(tuned_gbc)

In [None]:
# Draw the 'boundary' plot for the tuned model (presumably the decision
# boundary — confirm against the PyCaret docs).
plot_model(tuned_gbc, plot= 'boundary')

In [None]:
from imblearn.over_sampling import SMOTE

# Oversampler handed to the second PyCaret setup below: resamples the
# minority class only, using 5 nearest neighbours and a fixed seed.
sm = SMOTE(
    sampling_strategy='minority',
    k_neighbors=5,
    random_state=55,
)


In [None]:
# Second PyCaret experiment on the same frame, this time with z-score
# normalisation, a quantile transformation, and class-imbalance correction
# using the SMOTE instance `sm`. Experiment logging is off.
# `session_id` fixes PyCaret's random seed so the internal split and
# cross-validation results are the same on every Restart & Run All.
session_2 = setup(
    data=df,
    target='Response',
    log_experiment=False,
    normalize=True,
    normalize_method='zscore',
    transformation=True,
    transformation_method='quantile',
    fix_imbalance=True,
    fix_imbalance_method=sm,
    session_id=55,
)

In [None]:
# Train the model registered under the id 'catboost' in the second setup.
# NOTE: this rebinds `best_model`, which previously held the 'rf' model.
best_model = create_model('catboost')

In [None]:
# Hyperparameter-tune the 'catboost' model created in the previous cell.
tuned_Cat = tune_model(best_model)

In [None]:
# Train the model registered under the id 'lightgbm' in the second setup.
lightgbm_model = create_model('lightgbm')

In [None]:
# Hyperparameter-tune the 'lightgbm' model.
lightgbm_tuned = tune_model(lightgbm_model)

# Let's combine both models

In [None]:
# Blend the tuned 'catboost' and 'lightgbm' models into one ensemble using
# the 'soft' method (presumably averaging predicted probabilities — confirm
# against the PyCaret docs).
blend = blend_models(estimator_list = [tuned_Cat, lightgbm_tuned], method='soft')

In [None]:
# Draw PyCaret's default diagnostic plot for the blended model (no `plot`
# argument is given).
plot_model(blend)

In [None]:
# Display the blended estimator and its constituent models.
blend

In [None]:
# Draw the 'confusion_matrix' plot for the blended model.
plot_model(blend, plot= 'confusion_matrix')

In [None]:
# Draw the 'error' plot for the blended model.
plot_model(blend, plot= 'error')

In [None]:
# Draw the 'boundary' plot for the blended model (presumably the decision
# boundary — confirm against the PyCaret docs).
plot_model(blend, plot= 'boundary')

In [None]:
# Scoring frame for the test rows: drop the identifier and the (empty)
# target column so only model features remain.
Final = Y.drop(['id', 'Response'], axis=1)

In [None]:
# generate predictions on unseen data
# Score the held-out test features with the blended model.
predictions = predict_model(blend, data = Final)

In [None]:
# Display the scored test frame returned by predict_model.
predictions

In [None]:
# Build the submission file: one row per test `id` with the model score as
# `Response`.
#
# The two columns are paired by position via `.to_numpy()` rather than by
# index label, so the result does not depend on whether predict_model kept or
# reset the index of its input; a row-count mismatch raises instead of
# silently producing NaNs.
# NOTE(review): depending on the PyCaret version, `Score` may be the
# probability of the predicted label rather than of class 1 — confirm before
# evaluating the submission with AUC.
result = pd.DataFrame({
    "id": Y["id"].to_numpy(),
    "Response": predictions["Score"].to_numpy(),
})

# index=False keeps the row index out of the CSV.
result.to_csv("LGBM_prediction.csv", index=False)

# The AUC score we got on unseen data was 0.85