In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw train and test splits from the relative ../input directory.
train_csv_path = r'../input/jobathon-may-2021-credit-card-lead-prediction/train.csv'
test_csv_path = r'../input/jobathon-may-2021-credit-card-lead-prediction/test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# (rows, columns) of each split exactly as loaded from CSV.
train.shape, test.shape

In [None]:
# Tag every row with the split it came from, then stack both splits into one
# frame so the preprocessing below is applied to train and test together.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin
data = pd.concat((train, test), ignore_index=True)

In [None]:
# Shape of the combined frame (train rows followed by test rows, plus 'source').
data.shape

In [None]:
# Preview the first rows of the combined frame.
data.head()

In [None]:
# Missing-value count per column of the combined frame.
# NOTE(review): test rows presumably carry no 'Is_Lead' label, so that column's
# count would include every test row — confirm against test.csv.
data.isnull().sum()

In [None]:
# Column dtypes of the combined frame.
data.dtypes

In [None]:
# Number of distinct values in each column.
data.nunique()

In [None]:
# Frequency of each 'Credit_Product' level. value_counts() drops NaN by default,
# so the missing entries are not listed in this output.
data['Credit_Product'].value_counts()

In [None]:
# Fill missing 'Credit_Product' values with 'Yes'.
# The result is assigned back to the column: calling an inplace method on
# data['Credit_Product'] operates on a temporary selection and, with pandas
# Copy-on-Write enabled, leaves `data` itself unchanged.
# NOTE(review): imputing with 'Yes' merges "missing" into an existing level;
# a dedicated level such as 'Unknown' may preserve more signal — confirm.
data['Credit_Product'] = data['Credit_Product'].fillna('Yes')

In [None]:
# Distribution of the raw average account balance.
# histplot(kde=True, stat='density') is seaborn's documented replacement for
# the deprecated distplot (histogram normalised to a density plus a KDE curve).
sns.histplot(data=data, x='Avg_Account_Balance', kde=True, stat='density')
plt.show()

In [None]:
# Replace the balance column with its natural log, then re-plot the distribution.
# NOTE(review): this overwrites the raw column, so running the cell a second
# time applies the log again — use Restart & Run All instead of re-running it.
data['Avg_Account_Balance'] = np.log(data['Avg_Account_Balance'])
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(data=data, x='Avg_Account_Balance', kde=True, stat='density')
plt.show()

In [None]:
# Split the combined frame back into its two parts using the 'source' tag and
# drop the columns that are no longer needed: the tag itself, and 'Is_Lead'
# for the test rows.
# drop() without inplace returns a new DataFrame, so `train` and `test` are
# independent objects rather than slices of `data` that get mutated in place.
train = data.loc[data['source'] == "train"].drop(columns='source')
test = data.loc[data['source'] == "test"].drop(columns=['Is_Lead', 'source'])

In [None]:
# Remove 'ID' so it is not passed to the models as a feature.
# Rebinding to the frame returned by drop() (instead of inplace=True) gives
# fresh DataFrames, so later column assignments do not write into objects
# that were derived from a slice of `data`.
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [None]:
# Dtypes of the training frame before the categorical columns are label-encoded.
train.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are integer-encoded before modelling.
var_mod = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']
for col in var_mod:
    # Fit ONE encoder per column on the union of train and test values, then
    # apply that same mapping to both frames. Fitting separately on each split
    # assigns different integers to the same category whenever the two splits
    # do not contain exactly the same set of levels.
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Target column and feature matrix (every remaining column) for training.
target_col = 'Is_Lead'
y = train[target_col]
X = train.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

def cross_val(X, y, model, params, folds=9):
    """Run stratified k-fold cross-validation and print the ROC AUC of each fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y : pandas.Series
        Labels aligned with ``X``; also used to stratify the folds.
    model : callable
        Estimator class (or factory) called as ``model(**params)``. Its ``fit``
        must accept ``eval_set``, ``early_stopping_rounds`` and ``verbose``,
        and the fitted object must provide ``predict_proba``.
    params : dict
        Keyword arguments forwarded to ``model``.
    folds : int, default 9
        Number of stratified splits.

    Returns
    -------
    object
        The estimator fitted on the LAST fold only (trained on the other
        ``folds - 1`` parts). Earlier fold models are discarded.

    Notes
    -----
    The held-out part of each fold is used both as the early-stopping
    ``eval_set`` and for the reported AUC, so the printed scores are measured
    on data that influenced the stopping point.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=21)
    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        print(f"Fold: {fold}")
        x_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        x_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        # A fresh estimator per fold so no state carries over between folds.
        alg = model(**params)
        # NOTE(review): recent LightGBM/XGBoost releases reportedly no longer
        # accept early_stopping_rounds/verbose in fit() — confirm against the
        # library versions this notebook is pinned to.
        alg.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=400)

        # Probability of the positive class on the held-out part.
        pred = alg.predict_proba(x_valid)[:, 1]
        roc_score = roc_auc_score(y_valid, pred)
        fold_scores.append(roc_score)
        print(f"roc_auc_score: {roc_score}")
        print("-"*50)

    # Summarise the folds so different models can be compared on one number.
    mean_score = sum(fold_scores) / len(fold_scores)
    print(f"Mean roc_auc_score over {len(fold_scores)} folds: {mean_score}")

    return alg

In [None]:
# Hyper-parameters passed to LGBMClassifier via cross_val.
# NOTE(review): LightGBM's documentation states that row subsampling only takes
# effect when subsample_freq > 0 — confirm whether 'subsample' is active here.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=20000,
    max_bin=94,
    num_leaves=10,
    max_depth=27,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

In [None]:
from lightgbm import LGBMClassifier

# Cross-validate LightGBM with lgb_params and keep the estimator cross_val returns.
lgb_model = cross_val(X=X, y=y, model=LGBMClassifier, params=lgb_params)

In [None]:
# Hyper-parameters passed to XGBClassifier via cross_val.
xgb_params = dict(
    n_estimators=20000,
    max_depth=6,
    learning_rate=0.0201,
    reg_lambda=29.326,
    subsample=0.818,
    colsample_bytree=0.235,
    colsample_bynode=0.820,
    colsample_bylevel=0.453,
)

In [None]:
from xgboost import XGBClassifier

# Cross-validate XGBoost with xgb_params and keep the estimator cross_val returns.
xgb_model = cross_val(X=X, y=y, model=XGBClassifier, params=xgb_params)

In [None]:
# Hyper-parameters passed to CatBoostClassifier via cross_val.
cat_params = dict(
    n_estimators=20000,
    depth=4,
    learning_rate=0.023,
    colsample_bylevel=0.655,
    bagging_temperature=0.921,
    l2_leaf_reg=10.133,
)

In [None]:
from catboost import CatBoostClassifier

# Cross-validate CatBoost with cat_params and keep the estimator cross_val returns.
cat_model = cross_val(X=X, y=y, model=CatBoostClassifier, params=cat_params)

In [None]:
# Positive-class probabilities on the test frame from each fitted estimator.
# Each model here is the one cross_val returned, i.e. the fit from its final fold.
pred_test_lgb, pred_test_xgb, pred_test_cat = (
    fitted.predict_proba(test)[:, 1]
    for fitted in (lgb_model, xgb_model, cat_model)
)

# Equal-weight blend of the three probability vectors.
blend_sum = pred_test_lgb + pred_test_cat + pred_test_xgb
prediction = blend_sum / 3

In [None]:
# Read the submission template that the predictions are written into.
sample_submission_path = r'../input/jobathon-may-2021-credit-card-lead-prediction/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

In [None]:
# Write the blended probabilities as the main submission file.
blend_output = 'submission.csv'
sample_submission['Is_Lead'] = prediction
sample_submission.to_csv(blend_output, index=False)

In [None]:
# Also save each model's own predictions, one CSV per model, reusing the same
# template frame. After the loop 'Is_Lead' holds the CatBoost predictions.
per_model_outputs = (
    ('pred_test_lgb.csv', pred_test_lgb),
    ('pred_test_xgb.csv', pred_test_xgb),
    ('pred_test_cat.csv', pred_test_cat),
)
for out_name, model_pred in per_model_outputs:
    sample_submission['Is_Lead'] = model_pred
    sample_submission.to_csv(out_name, index=False)