In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import  auc, roc_curve, classification_report 

from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier, plot_importance

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train_path = '/kaggle/input/janatahack-crosssell-prediction/train.csv'
test_path = '/kaggle/input/janatahack-crosssell-prediction/test.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Stack train on top of test (row-wise) so later feature steps run once on both.
# The original row labels are kept, so index values repeat across the two parts.
frames = [train, test]
data = pd.concat(frames)
data.head()

# Visualization

In [None]:
# Number of rows (counted through `id`) for every Gender / Response combination.
train.groupby(['Gender', 'Response'])[['id']].count()

In [None]:
# Bar chart of row counts per value, split by Response, for every training
# column that is not in the exclusion list below.
excluded = ('id', 'Age', 'Annual_Premium', 'Vintage', 'Policy_Sales_Channel', 'Response')
cols = [c for c in train.columns.values.tolist() if c not in excluded]

# The number of subplot rows follows the number of plotted columns instead of a
# hard-coded 6, so adding/removing a column can no longer cause an IndexError or
# leave empty axes. squeeze=False keeps a 2-D array even for a single column.
fig, ax = plt.subplots(nrows=len(cols), ncols=1, figsize=(10, 10 * len(cols)), squeeze=False)
ax = ax[:, 0]

for axis, col in zip(ax, cols):
    tmp = (
        train.groupby([col, 'Response'])['id']
        .count()
        .to_frame()
        .reset_index()
        .rename(columns={'id': 'Number of Users'})
    )
    sns.barplot(x=col, y='Number of Users', hue='Response', data=tmp, ax=axis).set_title('Count Graph of {}'.format(col))

In [None]:
#train.loc[train['Response']==0, ['Age','Vintage']]

In [None]:
# Per-class density of each continuous feature: one subplot per feature,
# Response == 0 and Response == 1 drawn on the same axes.
cont_var = ['Age', 'Annual_Premium', 'Vintage']

fig, ax = plt.subplots(nrows=len(cont_var), ncols=1, figsize=(30, 15))

for axis, var in zip(ax, cont_var):
    print(var)
    negatives = train.loc[train['Response'] == 0, var]
    positives = train.loc[train['Response'] == 1, var]
    first = sns.kdeplot(negatives, label='0', ax=axis)
    first.set_title('kde plot of {}'.format(var), fontsize=30)
    sns.kdeplot(positives, label='1', ax=axis)
    

# Feature Engineering

In [None]:
# Show the combined train+test frame before the categorical columns are encoded.
data

In [None]:
# Integer codes used to replace the string-valued categorical columns.
gender_bias = dict(zip(('Male', 'Female'), (0, 1)))

# Vehicle-age buckets are numbered in the order they are listed here.
vehicle = {bucket: code for code, bucket in enumerate(('< 1 Year', '1-2 Year', '> 2 Years'))}

vehicle_damage = dict(zip(('No', 'Yes'), (0, 1)))



In [None]:
# Replace each categorical column by its integer code; values missing from a
# mapping become NaN (Series.map semantics).
# NOTE(review): running this cell twice maps the already-encoded integers to NaN.
encodings = {
    'Gender': gender_bias,
    'Vehicle_Age': vehicle,
    'Vehicle_Damage': vehicle_damage,
}
for column, mapping in encodings.items():
    data[column] = data[column].map(mapping)

In [None]:
# Overlay the encoded Gender density for Response == 1 and Response == 0.
responders = data.loc[data['Response'] == 1, 'Gender']
non_responders = data.loc[data['Response'] == 0, 'Gender']
sns.kdeplot(responders)
sns.kdeplot(non_responders)

In [None]:
# Show the combined frame again after Gender, Vehicle_Age and Vehicle_Damage were mapped to integers.
data

In [None]:
# Group-level aggregate features: for every (grouping column, numeric column)
# pair, broadcast count / mean / std / min / max of the numeric column within
# the group back onto each row as `<group>_<value>_<stat>`.
group_vars = ['Region_Code', 'Policy_Sales_Channel']
agg_vars = ['Annual_Premium', 'Vintage', 'Age']
agg_stats = ['count', 'mean', 'std', 'min', 'max']

for g in group_vars:
    for a in agg_vars:
        # Build the grouped view once per pair instead of once per statistic.
        grouped = data.groupby(g)[a]
        for stat in agg_stats:
            data[f'{g}_{a}_{stat}'] = grouped.transform(stat)

# Basic Modelling

In [None]:
# Inspect the target column of the combined frame; rows that came from `test`
# presumably have no Response and show up as NaN — TODO confirm.
data['Response']

In [None]:
# Undo the concat: the first len(train) rows are the training part (X),
# everything after that is the part that came from test (Y).
n_train = len(train)
X, Y = data.iloc[:n_train], data.iloc[n_train:]
X['Response'].tail()

In [None]:
# Target vector, and the feature matrix with the identifier and target removed.
y = X['Response']
x = X.drop(['Response', 'id'], axis=1)

In [None]:
# Forward-fill missing feature values (each NaN takes the value of the row above).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), and rebinding
# `x` instead of mutating in place keeps the cell free of inplace side effects.
x = x.ffill()

In [None]:
# Hold out 20% of the training rows for validation / early stopping.
# The split is stratified on the target so both parts keep the same class
# ratio, matching the split used later for the oversampled data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=1
)

In [None]:
# Baseline LightGBM classifier: many small-step trees, with early stopping on
# the hold-out AUC deciding how many are actually used.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is left at its default here — confirm whether bagging was intended.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are only
# accepted by LightGBM releases before 4.0; newer releases use callbacks.
lgbm_params = dict(
    boosting_type='gbdt',
    max_depth=8,
    learning_rate=0.01,
    n_estimators=5000,
    objective='binary',
    subsample=0.8,
    reg_lambda=2,
)
model = LGBMClassifier(**lgbm_params)

model.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric='auc',
    early_stopping_rounds=200,
    verbose=100,
)


In [None]:

# Ten most important features of the baseline model.
plot_importance(model, figsize=(10, 10), max_num_features=10)

In [None]:
# 3-fold cross-validated ROC AUC of the baseline configuration on the full
# training features. cross_val_score refits the estimator for each fold, so
# every fit here builds the full n_estimators trees (no early stopping).
score = cross_val_score(model, x, y, cv=3, scoring="roc_auc")
print(score.mean())

# Lets do some magic now

# 1) Over sampling

In [None]:
# Display the training portion of the combined frame.
X

In [None]:
# Shapes of the feature matrix and target, then the class counts before oversampling.
print(*(part.shape for part in (x, y)))
print(y.value_counts())

In [None]:


# Synthesize minority-class rows with SMOTE until both classes are balanced.
# NOTE(review): the resampling runs on the full training set before any
# train/validation split, so synthetic rows derived from validation rows can
# end up in the training part — validation scores on this data are optimistic.
sm = SMOTE(k_neighbors=5, random_state=55, sampling_strategy='minority')
resampled = sm.fit_resample(x, y)
Over_x, Over_y = resampled

In [None]:
# Class counts of the target after SMOTE resampling.
print(Over_y.value_counts())

In [None]:
# Stratified 80/20 split of the oversampled data; replaces the earlier
# x_train / x_test / y_train / y_test.
split_parts = train_test_split(
    Over_x,
    Over_y,
    test_size=0.2,
    random_state=7,
    stratify=Over_y,
)
x_train, x_test, y_train, y_test = split_parts

# 2) Feature Scaling (RobustScaler)

In [None]:
# Fit the scaler on the training part only, then apply the same transform to
# both parts. x_train / x_test are rebound to the scaled arrays.
scale = RobustScaler()
scale.fit(x_train)
x_train, x_test = (scale.transform(part) for part in (x_train, x_test))

# 3) lets use base model again

In [None]:
# Same configuration as the baseline model, now trained on the oversampled and
# scaled split, with early stopping on the hold-out AUC.
lgbm_params = dict(
    boosting_type='gbdt',
    max_depth=8,
    learning_rate=0.01,
    n_estimators=5000,
    objective='binary',
    subsample=0.8,
    reg_lambda=2,
)
lbmodel = LGBMClassifier(**lgbm_params)

lbmodel.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric='auc',
    early_stopping_rounds=200,
    verbose=100,
)

In [None]:
# Refit the same configuration without early stopping (all n_estimators trees)
# and report hold-out metrics.
lbmodel = LGBMClassifier(
        boosting_type = 'gbdt',
        max_depth = 8,
        learning_rate = 0.01,
        n_estimators = 5000,
        objective = 'binary',
        subsample = 0.8,
        reg_lambda = 2)

lbmodel.fit(x_train, y_train)
pred = lbmodel.predict(x_test)

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and "support" counts predictions instead
# of true labels.
print(classification_report(y_test, pred))

# AUC from the predicted probability of the positive class.
y_score = lbmodel.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
print ('Area under curve (AUC): ', auc(fpr,tpr))

# 4) Let's make predictions on the final (test) data

In [None]:
# Feature matrix for the rows that came from test: same columns as the training features.
Final = Y.drop(['id', 'Response'], axis=1)

In [None]:
# Scale the test features with the scaler fitted on the training split.
# They are rebuilt from `Y` here rather than re-transforming `Final` itself, so
# re-running this cell cannot apply the scaling twice.
Final = scale.transform(Y.drop(columns=['id','Response']))

In [None]:
# Positive-class probability for every test row. The original line had a stray
# trailing character (syntax error) and scored the validation split `x_test`
# instead of the test features, which would not match the submission's length.
Final_prediction = lbmodel.predict_proba(Final)[:,1]

In [None]:
# Submission file: the test ids next to their predicted probability, written without the index.
result = Y[['id']].assign(Response=Final_prediction)
result.to_csv("LGBM_prediction.csv", index=False)

# The final score we got is 0.839727

# Let's try stratified K-fold cross-validation, but without oversampling

In [None]:

# Scale the full (non-oversampled) training features, keeping the column names.
# NOTE: this refits the shared `scale` object, so anything transformed by its
# previous fit is on a different scale than `scaled_x`.
scaled_values = scale.fit_transform(x)
scaled_x = pd.DataFrame(data=scaled_values, columns=x.columns)

In [None]:
%%time
folds = 5
# `seed` was referenced but never defined, so building the splitter raised NameError.
seed = 42

# Hyper-parameters shared by both fits in each fold; n_estimators is set per fit.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are only
# accepted by LightGBM releases before 4.0; newer releases use callbacks.
base_params = dict(
    boosting_type='gbdt',
    max_depth=8,
    learning_rate=0.01,
    objective='binary',
    subsample=0.8,
    reg_lambda=2,
)
max_estimators = 5000

# Test-set features scaled with the scaler in its current state, so they are on
# the same scale as `scaled_x` (the previously computed `Final` was transformed
# by an earlier fit of the scaler).
# NOTE(review): assumes `scale` was last fitted on `x` when `scaled_x` was built — confirm.
holdout = pd.DataFrame(
    scale.transform(Y.drop(columns=['id', 'Response'])),
    columns=scaled_x.columns,
)

auc_score = []
fold_predictions = []
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# The index arrays get their own names so the `train` / `test` DataFrames are
# not overwritten by the loop.
for train_idx, valid_idx in skf.split(scaled_x, y):
    X_train, X_test = scaled_x.iloc[train_idx], scaled_x.iloc[valid_idx]
    Y_train, Y_test = y.iloc[train_idx], y.iloc[valid_idx]

    # Finding best iteration: early stopping on this fold's validation AUC.
    lbmodel = LGBMClassifier(n_estimators=max_estimators, **base_params)
    lbmodel.fit(X_train, Y_train, eval_metric='auc',
                eval_set=[(X_test, Y_test)], early_stopping_rounds=200, verbose=100)

    # Fall back to the full budget if no best iteration was recorded.
    num_iteration = lbmodel.best_iteration_ or max_estimators

    # Model running: refit on THIS fold's training rows. The original fitted and
    # scored on x_train / x_test, i.e. the stale oversampled split from earlier
    # cells, so every fold produced the same model and the same score.
    lbmodel = LGBMClassifier(n_estimators=num_iteration, **base_params)
    lbmodel.fit(X_train, Y_train)
    pred = lbmodel.predict(X_test)

    # classification_report expects (y_true, y_pred).
    print(classification_report(Y_test, pred))
    y_score = lbmodel.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    score = auc(fpr, tpr)
    print ('Area under curve (AUC): ', score)
    auc_score.append(score)

    # Making final prediction: positive-class probability for every test row.
    prediction = pd.DataFrame(lbmodel.predict_proba(holdout)[:, 1])
    fold_predictions.append(prediction)

# One column per fold, concatenated once instead of growing a frame inside the loop.
final_prediction = pd.concat(fold_predictions, axis=1)

average_score = np.mean(auc_score)
print('The average auc score is ', average_score)


In [None]:
# One column of test-set probabilities per fold (concatenated along axis=1 in the loop above).
final_prediction

# Let's take their average

In [None]:
# Row-wise mean across the per-fold probability columns.
a = final_prediction.mean(axis='columns')
a

In [None]:
# Submission file for the fold-averaged predictions.
# Both columns are assigned by position: the original `result["Response"] = a`
# aligned on index labels, which only works while the index of `Y` happens to
# equal the 0..n-1 index of `a` and otherwise yields NaN silently. Built this
# way, a length mismatch raises instead.
result = pd.DataFrame({"id": Y["id"].to_numpy(), "Response": np.asarray(a)})
result.to_csv("LGBM_prediction_Fold.csv", index=False)