In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Basic Pre-processing

In [None]:
# Load the raw training data used for the exploratory cells and preview the first rows.
TRAIN_CSV = '../input/jobathon-may-2021-credit-card-lead-prediction/train.csv'
df1 = pd.read_csv(TRAIN_CSV)
df1.head(n=5)

In [None]:
# Count distinct values per column (NaN counted as a value) so that we can
# separate out the categorical and numeric features.
for col_name, n_unique in df1.nunique(dropna=False).items():
    print("Number of unique {} are : {}".format(col_name, n_unique))

In [None]:
# Number of missing values in each column
df1.isna().sum(axis=0)

#Imputing the missing values gave a worse ROC AUC, so we simply let NaN be a category of its own

In [None]:
# Box plots of the two numeric features, side by side, to look for outliers
fig, axs = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('Searching For Outliers..')

for panel, numeric_col in zip(axs, ['Age', 'Avg_Account_Balance']):
    sns.boxplot(ax=panel, y=df1[numeric_col])

#Age seems fine, but Avg_Account_Balance has many high-value outliers (too many rich people)


In [None]:
# Let's see how much data we would have to sacrifice to remove outliers.
# A balance is kept when it lies within 1.5 * IQR of the first/third quartile.

Q1 = df1['Avg_Account_Balance'].quantile(0.25)
Q3 = df1['Avg_Account_Balance'].quantile(0.75)
IQR = Q3 - Q1

# Named `balance_in_range` (not `filter`) so the builtin filter() is not shadowed
# for every cell that runs after this one.
balance_in_range = (df1['Avg_Account_Balance'] >= Q1 - 1.5 * IQR) & (df1['Avg_Account_Balance'] <= Q3 + 1.5 *IQR)
df2 = df1.loc[balance_in_range]
print("data loss percentage {}%".format(((len(df1) - len(df2))/len(df1))*100))

In [None]:
# Bar chart of the target classes to see whether Is_Lead is imbalanced
sns.countplot(data=df1, x='Is_Lead')

#The target variable is imbalanced

In [None]:
# Share of leads / non-leads per Credit_Product value, highest lead rate first
credit_product_rates = pd.crosstab(df1['Credit_Product'], df1['Is_Lead'], normalize='index')
credit_product_rates.sort_values(by=[1], ascending=False).head(5)

In [None]:
# For every categorical feature, print the five values with the highest lead rate.
# `column` stays a global because the pairwise-interaction cell below reads it.
column = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active','Vintage']
for cat_col in column:
    lead_rates = pd.crosstab(df1[cat_col], df1['Is_Lead'], normalize='index')
    print(lead_rates.sort_values(by=[1], ascending=False).head(5))
    print('--------------------------------------------------------------------------')

In [None]:
#Let's group features pairwise and check out their influence on the target.
# Each interaction is built as a temporary Series instead of a new df1 column, so
# df1 keeps its original columns and the cell can be re-run without side effects.

for first, second in combinations(column, 2):
    pair_name = f'{first}_{second}'
    pair_values = (df1[first].astype(str) + '_' + df1[second].astype(str)).rename(pair_name)

    # lead rate of the first feature alone ...
    print(pd.crosstab(df1[first],df1.Is_Lead,normalize='index').sort_values(by=[1],ascending=False).head(5))
    print('**'*30)
    # ... versus the lead rate of the combined feature
    print(pd.crosstab(pair_values,df1.Is_Lead,normalize='index').sort_values(by=[1],ascending=False).head(5))
    print('--'*50)


In [None]:
# Scatter of Age against Avg_Account_Balance, coloured by the target
sns.scatterplot(x="Avg_Account_Balance", y="Age", hue='Is_Lead', data=df1)

In [None]:
def process_data():
    """Load train/test, filter the training rows and label-encode the categoricals.

    Steps:
      1. read train.csv and test.csv from the competition input folder;
      2. keep only training rows whose Region_Code also occurs in the test set;
      3. drop training rows whose Avg_Account_Balance lies outside
         [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (quartiles taken after step 2);
      4. tag rows with a 'train_or_test' column and stack train on top of test;
      5. label-encode the seven categorical columns on the stacked frame
         (values are cast to str first, so NaN becomes its own category).

    Returns
    -------
    train : DataFrame
        Filtered training rows with the 'train_or_test' column; not label-encoded.
    test : DataFrame
        Test rows with the 'train_or_test' column; not label-encoded.
    df : DataFrame
        train followed by test, label-encoded. The original row indices are
        kept, so index values can repeat between the two parts.
    """
    train = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/train.csv")
    test = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/test.csv")

    #Removes train rows which has Region_Code not present in test set
    train = train[train['Region_Code'].isin(test['Region_Code'])]

    #Removing outliers
    balance = train['Avg_Account_Balance']
    Q1 = balance.quantile(0.25)
    Q3 = balance.quantile(0.75)
    IQR = Q3 - Q1
    # `in_range` rather than `filter`, so the builtin is not shadowed
    in_range = (balance >= Q1 - 1.5 * IQR) & (balance <= Q3 + 1.5 * IQR)
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the frame read from disk
    train = train.loc[in_range].copy()

    train['train_or_test'] = 'train'
    test['train_or_test'] = 'test'
    df = pd.concat([train, test])

    le = LabelEncoder()
    for col in ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active','Vintage']:
        # the encoder is re-fitted for every column
        df[col] = le.fit_transform(df[col].astype('str'))

    return train, test, df

# Feature Engineering

In [None]:
# How many training rows are lost when we keep only regions that also occur in the test set?
train = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/train.csv")
test = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/test.csv")

rows_before = len(train)
print(rows_before)

#Removes train rows which has Region_Code not present in test set
region_in_test = train['Region_Code'].isin(test['Region_Code'].tolist())
train1 = train[region_in_test]
rows_after = len(train1)
print(rows_after)

# percentage of training rows dropped (shown as the cell output)
((rows_before - rows_after)/rows_before)*100

In [None]:
def frequency_encoding(column_name,output_column_name,df):
    """Write the relative frequency of each value of `column_name` into `df`.

    The frequency of a value is (number of rows holding it) / len(df).

    Parameters
    ----------
    column_name : str
        Column whose values are counted.
    output_column_name : str
        Column that receives the frequencies. It may equal `column_name`, in
        which case the source column is overwritten.
    df : DataFrame
        Modified in place; nothing is returned.

    Rows whose value is missing get NaN (groupby does not count missing keys)
    instead of raising KeyError.
    """
    fe_pol = df.groupby(column_name).size() / len(df)
    # vectorised lookup; replaces a per-row Python lambda
    df[output_column_name] = df[column_name].map(fe_pol)

In [None]:

# Statistics computed per group: four for the numeric columns, two for the rest.
_NUMERIC_STATS = ['mean', 'max', 'min', 'std']
_CATEGORICAL_STATS = ['nunique', 'count']
_NUMERIC_COLUMNS = ('Age', 'Avg_Account_Balance')


def _merge_group_aggregates(df, keys, columns, prefix):
    """Aggregate `columns` per group of `keys` and left-merge the result onto `df`.

    Columns listed in _NUMERIC_COLUMNS get mean/max/min/std, every other column
    gets nunique/count. New columns are named prefix + '<column>_<stat>' and
    appear in the order given by `columns`.
    """
    spec = {
        col: list(_NUMERIC_STATS if col in _NUMERIC_COLUMNS else _CATEGORICAL_STATS)
        for col in columns
    }
    grouped = df.groupby(keys).agg(spec)
    grouped.columns = [prefix + '_'.join(c).strip('_') for c in grouped.columns]
    return pd.merge(df, grouped, on=keys, how='left')


def feature_engineering(df):
    """Add interaction, frequency and group-aggregate features to `df`.

    Parameters
    ----------
    df : DataFrame
        Stacked train+test frame with the seven categorical columns, 'Age' and
        'Avg_Account_Balance'. The interaction / frequency columns are written
        into this object; the merges then produce new frames.

    Returns
    -------
    df : DataFrame
        The frame with all engineered columns (fresh RangeIndex after the merges).
    le_features : list of str
        Names of the label-encoded pairwise interaction columns ('<a>_<b>_le').
    """
    le = LabelEncoder()

    #Interaction Feature (Combining 2 categorical features and performing frequency encoding)
    cat_features = []   # pair columns; they hold frequencies once the loop is done
    le_features = []    # label-encoded versions of the same pairs
    columns = ['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active','Vintage']

    for first, second in combinations(columns, 2):
        pair = f'{first}_{second}'
        df[pair] = df[first].astype(str) + '_' + df[second].astype(str)
        df[f'{pair}_le'] = le.fit_transform(df[pair])
        le_features.append(f'{pair}_le')
        # overwrites the string combination with its relative frequency
        frequency_encoding(pair, pair, df)
        cat_features.append(pair)

    #Frequency Encoding
    frequency_encoding('Region_Code', 'Region_Code_fe', df)

    #Deriving characteristics of each region by creating aggregate features
    df = _merge_group_aggregates(
        df, ['Region_Code'],
        ['Age', 'Vintage', 'Avg_Account_Balance', 'Occupation', 'Channel_Code',
         'Is_Active', 'Credit_Product', 'Gender'],
        'region_aggregate_features')

    df = _merge_group_aggregates(
        df, ['Region_Code', 'Vintage'],
        ['Age', 'Avg_Account_Balance', 'Occupation', 'Channel_Code',
         'Is_Active', 'Credit_Product', 'Gender'],
        'region_vintage_aggregate_features')

    # Per-region and per-(region, vintage) statistics of every frequency-encoded pair.
    # The 'vinatge' spelling is part of the generated column names and is kept as is.
    for feature in cat_features:
        for label, keys in (('region', 'Region_Code'),
                            ('region_vinatge', ['Region_Code', 'Vintage'])):
            for stat in ('max', 'min', 'mean', 'std'):
                df[f'{label}_{feature}_{stat}'] = df.groupby(keys)[feature].transform(stat)

    #Deriving characteristics of Occupation, Channel_Code, Is_Active, Credit_Product
    #and Gender: each is aggregated over Age, Vintage, Avg_Account_Balance and the
    #remaining categorical columns (the grouping column itself is left out)
    other_categoricals = ['Region_Code', 'Occupation', 'Channel_Code', 'Is_Active', 'Credit_Product', 'Gender']
    for key in ['Occupation', 'Channel_Code', 'Is_Active', 'Credit_Product', 'Gender']:
        aggregated = ['Age', 'Vintage', 'Avg_Account_Balance'] + [c for c in other_categoricals if c != key]
        df = _merge_group_aggregates(df, [key], aggregated, f'{key}_aggregate_features')

    #Deriving characteristics of Interaction_features by creating aggregate features
    #(These interaction feature are selected for aggregating based on its feature importance)
    # The grouping columns hold frequencies at this point, so combinations that
    # share the same frequency fall into the same group.
    # An 'Occupation_Credit_Product' aggregate was disabled in the original code
    # and is intentionally not computed.
    interaction_plan = [
        ('Region_Code_Occupation',
         ['Age', 'Vintage', 'Avg_Account_Balance', 'Channel_Code', 'Is_Active', 'Credit_Product', 'Gender']),
        ('Region_Code_Credit_Product',
         ['Age', 'Vintage', 'Avg_Account_Balance', 'Gender', 'Channel_Code', 'Is_Active', 'Credit_Product']),
        ('Gender_Vintage',
         ['Age', 'Occupation', 'Avg_Account_Balance', 'Credit_Product', 'Channel_Code', 'Is_Active', 'Region_Code']),
        ('Credit_Product_Is_Active',
         ['Age', 'Occupation', 'Avg_Account_Balance', 'Gender', 'Channel_Code', 'Vintage', 'Region_Code']),
        ('Gender_Credit_Product',
         ['Age', 'Occupation', 'Avg_Account_Balance']),
    ]
    for key, aggregated in interaction_plan:
        df = _merge_group_aggregates(df, [key], aggregated, f'grpd_by_{key}_')

    #Creating Age Bins and deriving characteristics of each age group by creating aggregate features
    Age_Bins = KBinsDiscretizer(n_bins=14, encode='ordinal', strategy='quantile')
    df['Age_Bins'] = Age_Bins.fit_transform(df['Age'].values.reshape(-1,1)).astype(int)

    df = _merge_group_aggregates(
        df, ['Age_Bins'],
        ['Age', 'Vintage', 'Avg_Account_Balance', 'Region_Code', 'Channel_Code',
         'Is_Active', 'Credit_Product', 'Gender', 'Occupation'],
        'age_aggregate_features')

    return df, le_features


# Data preparation for Machine Learning

In [None]:
def preparedatafortraining(df,train,test):
    """Split the engineered frame back into model inputs.

    Parameters
    ----------
    df : DataFrame
        Stacked frame containing 'ID', 'Is_Lead' and 'train_or_test' columns.
    train, test :
        Ignored; both are rebuilt from `df` via its 'train_or_test' column.

    Returns
    -------
    x : DataFrame        training features (ID / target / split marker removed)
    y : DataFrame        single-column frame holding 'Is_Lead' for the training rows
    x_test : DataFrame   test features, same columns as `x`
    train_features : list of str   column names of `x`

    The shape of `x` is printed as a side effect.
    """
    non_feature_cols = ['ID', 'Is_Lead', 'train_or_test']

    train = df[df['train_or_test'] == 'train']
    test = df[df['train_or_test'] == 'test']

    x = train.drop(columns=non_feature_cols)
    y = train[['Is_Lead']]
    x_test = test.drop(columns=non_feature_cols)
    train_features = list(x.columns)

    print(x.shape)

    return x, y, x_test, train_features

In [None]:
def savedata():
    """Run the whole preparation pipeline and return the model inputs.

    Chains process_data -> feature_engineering -> preparedatafortraining.
    Despite the name, nothing is written to disk (the pickle exports were
    commented out in the original).

    Returns
    -------
    x_train, y_train, x_test :
        Outputs of preparedatafortraining.
    cat_features : list of str
        The second value returned by feature_engineering (its label-encoded
        interaction column names).
    train_features : list of str
        Column names of `x_train`.
    """
    raw_train, raw_test, combined = process_data()
    combined, le_feature_names = feature_engineering(combined)
    prepared = preparedatafortraining(combined, raw_train, raw_test)
    x_train, y_train, x_test, train_features = prepared

    return x_train, y_train, x_test, le_feature_names, train_features

# CatBoost

In [None]:
def catboost_model():
    """Train CatBoost with 8-fold stratified CV and save OOF / test predictions.

    For every fold a classifier is fitted with early stopping on the validation
    AUC; the validation probabilities fill the out-of-fold vector and the test
    probabilities are averaged over the folds. Fold AUCs, their mean and the
    overall OOF AUC are printed.

    Writes 'catboostoof.csv' (column 'catboostoof') and 'catboostpred.csv'
    (column 'catboostpred') to the working directory. Returns None.
    """
    x, y, x_test, cat_features, train_features = savedata()

    n_folds = 8
    fold_scores = []
    oofs = np.zeros(shape=(len(x)))
    preds = np.zeros(shape=(len(x_test)))

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2021)

    for fold_no, (train_index, test_index) in enumerate(splitter.split(x, y), start=1):
        x_train, x_val = x.iloc[train_index], x.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        m = CatBoostClassifier(n_estimators=10000, random_state=2020, eval_metric='AUC')
        # cat_features: the label-encoded interaction columns returned by savedata()
        m.fit(
            x_train, y_train,
            eval_set=[(x_val, y_val)],
            early_stopping_rounds=30,
            verbose=100,
            cat_features=cat_features,
        )

        pred_y = m.predict_proba(x_val)[:, 1]
        oofs[test_index] = pred_y

        fold_auc = roc_auc_score(y_val, pred_y)
        print(fold_no, " err_cat: ", fold_auc)
        fold_scores.append(fold_auc)

        preds += m.predict_proba(x_test)[:, 1]

    # average of the per-fold test predictions
    preds = preds / n_folds

    print(f"Average StratifiedKFold Score : {sum(fold_scores)/n_folds} ")
    oof_score = roc_auc_score(y, oofs)
    print(f'\nOOF Auc is : {oof_score}')

    pd.DataFrame(oofs, columns=['catboostoof']).to_csv('catboostoof.csv', index=False)
    pd.DataFrame(preds, columns=['catboostpred']).to_csv('catboostpred.csv', index=False)

In [None]:
# Run the CatBoost cross-validation pipeline; it writes catboostoof.csv and catboostpred.csv.
catboost_model()

# LightGBM

In [None]:

def lgbm_model():
    """Train LightGBM with 8-fold stratified CV and save OOF / test predictions.

    For every fold a classifier is fitted with early stopping on the validation
    AUC; the validation probabilities fill the out-of-fold vector and the test
    probabilities are averaged over the folds. Fold AUCs, their mean and the
    overall OOF AUC are printed.

    Writes 'lgbmoof.csv' (column 'lgbmoof') and 'lgbmpred.csv' (column
    'lgbmpred') to the working directory.

    Returns
    -------
    imp_df : DataFrame
        One row per feature with columns 'feature' and 'importance', where
        'importance' is the gain importance averaged over all folds
        (previously only the last fold's model was used).
    """
    # cat_features is returned by savedata() but not used by this model
    x,y,x_test,cat_features,train_features=savedata()

    # NOTE(review): in LightGBM `alpha` appears to be the Huber/quantile loss
    # parameter rather than an alias of L1 regularisation (`reg_alpha`) -
    # confirm intent. Left unchanged so the trained models stay the same.
    params={'lambda': 2.8849054495567423,
        'alpha': 0.001054193185317787,
        'colsample_bytree': 0.5,
        'subsample': 0.4,
        'learning_rate': 0.014,
        'max_depth': 13,
        'random_state': 24,
        'min_child_weight': 5}

    err = []

    oofs = np.zeros(shape=(len(x)))
    preds = np.zeros(shape=(len(x_test)))
    # running sum of the per-fold gain importances, averaged after the loop
    gain_total = np.zeros(shape=(len(train_features)))

    Folds=8

    fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2020)

    for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
        x_train, x_val = x.iloc[train_index], x.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        m = LGBMClassifier(n_estimators=10000,**params,verbose= -1)

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword
        # arguments are presumably only accepted by older LightGBM releases -
        # confirm against the installed version.
        m.fit(x_train, y_train,eval_set=[(x_val, y_val)], early_stopping_rounds=30,verbose=False,eval_metric='auc')

        pred_y = m.predict_proba(x_val)[:,1]
        oofs[test_index] = pred_y

        fold_auc = roc_auc_score(y_val,pred_y)
        print(i, " err_lgm: ", fold_auc)
        err.append(fold_auc)

        preds+= m.predict_proba(x_test)[:,1]
        gain_total += m.booster_.feature_importance(importance_type='gain')
    preds=preds/Folds

    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['importance'] = gain_total/Folds

    print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
    oof_score = roc_auc_score(y, oofs)
    print(f'\nOOF Auc is : {oof_score}')

    oofs=pd.DataFrame(oofs,columns=['lgbmoof'])
    preds=pd.DataFrame(preds,columns=['lgbmpred'])

    oofs.to_csv('lgbmoof.csv',index=False)
    preds.to_csv('lgbmpred.csv',index=False)

    return imp_df

In [None]:
# Run the LightGBM cross-validation pipeline and keep the returned feature-importance frame.
imp = lgbm_model()

In [None]:
# Function to display feature importance...
def display_importances(feature_importance_df_,model):
    """Plot the 30 most important features as a horizontal bar chart and save it.

    Parameters
    ----------
    feature_importance_df_ : DataFrame
        Needs the columns 'feature' and 'importance'.
    model : str
        Model name; used in the plot title and as the prefix of the saved file
        '<model>_importances-01.png'.
    """
    # rank the features by their mean importance and keep the top 30 names
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    top_features = mean_importance.sort_values(by="importance", ascending=False).head(30).index

    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(data=ordered, x="importance", y="feature")
    plt.title(model+" Features (avg over folds)")
    plt.tight_layout()
    plt.savefig(model +"_importances-01.png")

In [None]:
# Feature importance based on gain...
# Plots the top features of the LightGBM importance frame and saves the figure as LGBM_importances-01.png.

display_importances(imp,"LGBM")

# XGBoost

In [None]:
def xgb_model():
    """Train XGBoost with 8-fold stratified CV and save OOF / test predictions.

    For every fold a classifier is fitted with early stopping on the validation
    AUC; the validation probabilities fill the out-of-fold vector and the test
    probabilities are averaged over the folds. Fold AUCs, their mean and the
    overall OOF AUC are printed.

    Writes 'xgbmoof.csv' (column 'xgboof') and 'xgbmpred.csv' (column
    'xgbpred') to the working directory, like the CatBoost and LightGBM
    functions do. The previous version prefixed the paths with `Data_dir`,
    a name that is not defined in this notebook, and therefore failed with
    NameError after all folds had been trained. Returns None.
    """
    # cat_features / train_features are returned by savedata() but not used here
    x,y,x_test,cat_features,train_features=savedata()

    params={'lambda': 1.417495651744778,
        'alpha': 0.4281901245971981,
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'learning_rate': 0.016,
        'max_depth': 9,
        'random_state': 2020,
        'min_child_weight': 30}

    err = []

    oofs = np.zeros(shape=(len(x)))
    preds = np.zeros(shape=(len(x_test)))

    Folds=8

    fold = StratifiedKFold(n_splits=Folds, shuffle=True, random_state=2020)

    for i, (train_index, test_index) in enumerate(fold.split(x, y), start=1):
        x_train, x_val = x.iloc[train_index], x.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        m = XGBClassifier(n_estimators=10000,**params)

        # NOTE(review): `early_stopping_rounds` / `eval_metric` as fit() keyword
        # arguments are presumably only accepted by older XGBoost releases -
        # confirm against the installed version.
        m.fit(x_train, y_train,eval_set=[(x_val, y_val)], early_stopping_rounds=30,verbose=False,eval_metric='auc')

        pred_y = m.predict_proba(x_val)[:,1]
        oofs[test_index] = pred_y

        fold_auc = roc_auc_score(y_val,pred_y)
        print(i, " err_xgb: ", fold_auc)
        err.append(fold_auc)

        preds+= m.predict_proba(x_test)[:,1]
    preds=preds/Folds

    print(f"Average StratifiedKFold Score : {sum(err)/Folds} ")
    oof_score = roc_auc_score(y, oofs)
    print(f'\nOOF Auc is : {oof_score}')

    oofs=pd.DataFrame(oofs,columns=['xgboof'])
    preds=pd.DataFrame(preds,columns=['xgbpred'])

    # file names match what final_process_data() reads (xgbmoof.csv / xgbmpred.csv)
    oofs.to_csv('xgbmoof.csv',index=False)
    preds.to_csv('xgbmpred.csv',index=False)

In [None]:
# Run the XGBoost cross-validation pipeline.
xgb_model()

# Final Blend

In [None]:
def final_process_data():
    """Load everything needed for blending: target, submission template and model outputs.

    The training rows are filtered exactly like in process_data() (regions
    present in the test set, then the 1.5*IQR rule on Avg_Account_Balance), so
    that `target` lines up row by row with the out-of-fold predictions, which
    were produced on that filtered training set. The previous version skipped
    the outlier filter, which leaves `target` longer than the OOF vectors
    whenever the filter removes rows.

    Returns
    -------
    train : DataFrame       filtered training rows
    target : DataFrame      single-column frame with 'Is_Lead' of those rows
    sub : DataFrame         sample submission
    test : DataFrame        raw test rows
    total_pred : DataFrame  columns of lgbmpred.csv, xgbmpred.csv, catboostpred.csv side by side
    total_oof : DataFrame   columns of lgbmoof.csv, xgbmoof.csv, catboostoof.csv side by side

    Raises
    ------
    ValueError
        If the number of OOF rows differs from the number of target rows.
    """
    train = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/train.csv")
    test = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/test.csv")
    sub= pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/sample_submission.csv")

    # keep only training rows whose region also occurs in the test set
    train = train[train['Region_Code'].isin(test['Region_Code'])]

    # same outlier rule as process_data(): quartiles are taken after the region filter
    balance = train['Avg_Account_Balance']
    q1 = balance.quantile(0.25)
    q3 = balance.quantile(0.75)
    iqr = q3 - q1
    in_range = (balance >= q1 - 1.5 * iqr) & (balance <= q3 + 1.5 * iqr)
    train = train.loc[in_range]

    target=train[['Is_Lead']]

    lgbmpred = pd.read_csv('../input/mayjobathon-model/lgbmpred.csv')
    xgbpred = pd.read_csv('../input/mayjobathon-model/xgbmpred.csv')
    catboostpred = pd.read_csv('../input/mayjobathon-model/catboostpred.csv')

    total_pred = pd.concat([lgbmpred,xgbpred,catboostpred], axis=1)

    lgbmoof = pd.read_csv('../input/mayjobathon-model/lgbmoof.csv')
    xgboof = pd.read_csv('../input/mayjobathon-model/xgbmoof.csv')
    catboostoof = pd.read_csv('../input/mayjobathon-model/catboostoof.csv')

    total_oof = pd.concat([lgbmoof,xgboof,catboostoof], axis=1)

    # fail early with a readable message instead of deep inside roc_auc_score
    if len(total_oof) != len(target):
        raise ValueError(
            "OOF predictions have {} rows but the filtered training target has {}; "
            "the saved model outputs were produced with a different training filter".format(
                len(total_oof), len(target)))

    return train,target,sub,test,total_pred,total_oof

In [None]:
def findbestweight(df1,df2,target):
    """Grid-search the blending weight between two prediction vectors.

    For every weight w in 0.0, 0.05, ..., 1.0 the blend w*df1 + (1-w)*df2 is
    scored with ROC AUC against `target`. The weight with the highest score is
    printed together with that score and returned; on ties the smallest weight
    wins.
    """
    weights_list = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]

    scores = [
        roc_auc_score(target, weight*df1 + (1-weight)*df2)
        for weight in weights_list
    ]

    # max() returns the first index holding the maximum, i.e. the smallest weight on ties
    best_position = max(range(len(scores)), key=scores.__getitem__)
    max_weight = weights_list[best_position]
    max_roc = scores[best_position]

    print("The best weights for blending is {0} with AUC {1}".format(max_weight, max_roc))
    return max_weight

In [None]:
def blend():
    """Blend the three models' test predictions and write the submission file.

    Two nested weights are tuned on the out-of-fold predictions: first LightGBM
    against XGBoost, then that blend against CatBoost. The same weights are
    applied to the test predictions, stored in the 'Is_Lead' column of the
    sample submission, written to './blend.csv' and printed.
    """
    _, target, sub, _, total_pred, total_oof = final_process_data()

    oof_lgbm = total_oof['lgbmoof']
    oof_xgb = total_oof['xgboof']
    oof_cat = total_oof['catboostoof']

    # stage 1: LightGBM vs XGBoost
    weight1 = findbestweight(oof_lgbm, oof_xgb, target)
    lgb_xgb = weight1*oof_lgbm + (1-weight1)*oof_xgb

    # stage 2: (LightGBM + XGBoost) vs CatBoost
    weight2 = findbestweight(lgb_xgb, oof_cat, target)

    # apply both weights to the test predictions
    lgb_xgb_pred = weight1*total_pred['lgbmpred'] + (1-weight1)*total_pred['xgbpred']
    lgb_xgb_cat_pred = lgb_xgb_pred*weight2 + total_pred['catboostpred']*(1-weight2)

    sub['Is_Lead'] = lgb_xgb_cat_pred
    sub.to_csv('./blend.csv', index=False)
    print(sub)

In [None]:
# Build the blended submission; it is written to ./blend.csv and printed.
blend()

### Final Score: 0.8732 ROC AUC on the private leaderboard