In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Competition inputs: training rows, test rows and the submission template.
train = pd.read_csv('../input/jobathon-may-2021-credit-card-lead-prediction/train.csv')
test = pd.read_csv('../input/jobathon-may-2021-credit-card-lead-prediction/test.csv')
sample_submission = pd.read_csv('../input/jobathon-may-2021-credit-card-lead-prediction/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## Description of Dataset

- ID - Unique identifier for a row
- Gender - Gender of the customer
- Age - Age of the customer (in years)
- Region_Code - Code of the region for the customer
- Occupation - Occupation type of the customer
- Channel_Code - Acquisition channel code for the customer (encoded)
- Vintage - Vintage for the customer (in months)
- Credit_Product - Whether the customer has any active credit product (home loan, personal loan, credit card, etc.)
- Avg_Account_Balance - Average account balance for the customer in the last 12 months
- Is_Active - Whether the customer was active in the last 3 months

## Number of records in the dataset

In [None]:
# Column dtypes, non-null counts and row count for the training frame.
train.info()

In [None]:
# Column dtypes, non-null counts and row count for the test frame.
test.info()

## Summary statistics: min, max, standard deviation and quantiles

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# Number of distinct values per training column.
train.nunique()

In [None]:
# Another look at the training rows before the target and feature lists are defined.
train.head()

In [None]:
# Name of the label column the model is trained to predict.
Target_col = "Is_Lead"

In [None]:
# Feature groups: categorical columns and continuous columns.
categorical_vars = [
    'Gender',
    'Region_Code',
    'Occupation',
    'Channel_Code',
    'Credit_Product',
    'Is_Active',
]
conts = [
    'Age',
    'Vintage',
    'Avg_Account_Balance',
]

In [None]:
# Histogram of the Is_Lead column in train.
train['Is_Lead'].plot.hist()

In [None]:
# Histogram of Age in train.
train['Age'].plot.hist()

In [None]:
# Histogram of Age in test.
test['Age'].plot.hist()

In [None]:
# Histogram of Avg_Account_Balance in train.
train['Avg_Account_Balance'].plot.hist()

In [None]:
# Histogram of Avg_Account_Balance in test.
test['Avg_Account_Balance'].plot.hist()

In [None]:
# Histogram of Vintage in train.
train['Vintage'].plot.hist()

In [None]:
# Histogram of Vintage in test.
test['Vintage'].plot.hist()

## Outliers

In [None]:

# Keep only the training rows whose balance does not exceed the largest
# balance present in test, then renumber the index from zero.
# Series.max() is vectorised and skips NaN, unlike the builtin max(), which
# iterates element by element and is order-dependent when NaN is present.
balance_cap = test['Avg_Account_Balance'].max()
train = train[train['Avg_Account_Balance'] <= balance_cap].reset_index(drop=True)

In [None]:
# For every categorical feature, print the eight largest
# (category, Is_Lead) row counts.
for cat_col in categorical_vars:
    lead_counts = train.groupby(cat_col)['Is_Lead'].value_counts()
    print(lead_counts.nlargest(8))

### All Entrepreneurs have credit card interest

## Missing value analysis

In [None]:
# Missing-value count per training column.
train.isna().sum()

In [None]:
# Replace every missing value in both frames with the placeholder 'other'.
missing_label = 'other'
train, test = train.fillna(missing_label), test.fillna(missing_label)

In [None]:
# Tag each row with its origin (1 = train, 0 = test), then stack both frames
# without the ID column.
train['set'] = 1
test['set'] = 0
concat = pd.concat([
    train.drop(columns=['ID']),
    test.drop(columns='ID'),
])

In [None]:
# Rows of the stacked train+test frame that share all key features with at
# least one other row, sorted by those keys and then by Is_Lead, with the
# remaining gaps forward-filled from the preceding row.
# .ffill() replaces fillna(method='ffill'), which is deprecated in current pandas.
dup_keys = ['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code',
            'Vintage', 'Credit_Product', 'Is_Active']
results = (
    concat[concat.duplicated(subset=dup_keys, keep=False)]
    .sort_values(by=dup_keys + ['Is_Lead'])
    .ffill()
)

In [None]:
# Remove the temporary origin flag from both frames.
train, test = train.drop(columns='set'), test.drop(columns='set')

## Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Gender: classes are learned from train and reused for test.
le = LabelEncoder()
train['Gender'] = le.fit(train['Gender']).transform(train['Gender'])
test['Gender'] = le.transform(test['Gender'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Occupation. The encoder keeps its own name (`ole`) because
# entreprenur_make1 later looks up the 'Entrepreneur' code in ole.classes_.
ole = LabelEncoder()
train['Occupation'] = ole.fit(train['Occupation']).transform(train['Occupation'])
test['Occupation'] = ole.transform(test['Occupation'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Channel_Code: classes are learned from train and reused for test.
le = LabelEncoder()
train['Channel_Code'] = le.fit(train['Channel_Code']).transform(train['Channel_Code'])
test['Channel_Code'] = le.transform(test['Channel_Code'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Credit_Product: classes are learned from train and reused for test.
le = LabelEncoder()
train['Credit_Product'] = le.fit(train['Credit_Product']).transform(train['Credit_Product'])
test['Credit_Product'] = le.transform(test['Credit_Product'])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode Is_Active: classes are learned from train and reused for test.
le = LabelEncoder()
train['Is_Active'] = le.fit(train['Is_Active']).transform(train['Is_Active'])
test['Is_Active'] = le.transform(test['Is_Active'])

In [None]:
# Frequency-encode Region_Code.
# The region -> share mapping is learned from train only and applied to both
# frames, so a given region gets the same encoded value in train and test.
# (Re-estimating the shares on test gave the same region two different values.)
dicto = train['Region_Code'].value_counts(normalize=True).to_dict()
train['Region_Code'] = train['Region_Code'].map(dicto)
# A region that never occurs in train has a train frequency of zero.
test['Region_Code'] = test['Region_Code'].map(dicto).fillna(0.0)

In [None]:
from scipy import stats

In [None]:
# Box-Cox transform of Vintage.
# Lambda is estimated on train and then reused for test, so both frames are
# mapped through the same function. (Estimating a second lambda on test put
# the two Vintage_box columns on different scales.)
vintage_box_train, fitted_lambda = stats.boxcox(np.abs(train['Vintage']))
train['Vintage_box'] = vintage_box_train
# With lmbda supplied, boxcox returns only the transformed values.
# `fitted_data` still ends up holding the test-set transform.
fitted_data = stats.boxcox(np.abs(test['Vintage']), lmbda=fitted_lambda)
test['Vintage_box'] = fitted_data

In [None]:
# Plot the distribution of fitted_data, which at this point holds the
# Box-Cox-transformed Vintage values of the test set.
sns.distplot(fitted_data)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Eight quantile bins for Age; bin edges come from train and are reused for test.
est = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
age_train = train['Age'].values.reshape(-1, 1)
age_test = test['Age'].values.reshape(-1, 1)
train['Age_quantiles'] = est.fit(age_train).transform(age_train).astype(int)
test['Age_quantiles'] = est.transform(age_test).astype(int)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Eight quantile bins for Vintage; bin edges come from train and are reused for test.
est = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
vintage_train = train['Vintage'].values.reshape(-1, 1)
vintage_test = test['Vintage'].values.reshape(-1, 1)
train['Vintage_quantiles'] = est.fit(vintage_train).transform(vintage_train).astype(int)
test['Vintage_quantiles'] = est.transform(vintage_test).astype(int)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Eight quantile bins for Avg_Account_Balance; bin edges come from train and are reused for test.
est = KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
balance_train = train['Avg_Account_Balance'].values.reshape(-1, 1)
balance_test = test['Avg_Account_Balance'].values.reshape(-1, 1)
train['Avg_Account_Balance_quantiles'] = est.fit(balance_train).transform(balance_train).astype(int)
test['Avg_Account_Balance_quantiles'] = est.transform(balance_test).astype(int)

In [None]:
# Natural-log versions of the two skewed numeric columns, added to both frames.
log_features = {
    'Avg_Account_Balance_log': 'Avg_Account_Balance',
    'Vintage_log': 'Vintage',
}
for new_col, raw_col in log_features.items():
    for frame in (train, test):
        frame[new_col] = np.log(frame[raw_col])


In [None]:
# Display the list of continuous feature names.
conts

In [None]:
# Summary statistics of the training frame after the feature-engineering cells.
train.describe()

In [None]:
# Preview the training rows with the engineered columns.
train.head()

## Modelling part

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Columns kept out of the model matrix: the identifier, the target, and the
# raw / binned variants dropped in favour of their transformed versions.
non_feature_cols = ['ID', 'Is_Lead', 'Avg_Account_Balance_quantiles', 'Vintage', 'Avg_Account_Balance']

# One explicit feature list drives both matrices, so X and X_test are
# guaranteed to have the same columns in the same order. A feature missing
# from test now raises a KeyError instead of silently shifting columns.
feature_cols = [col for col in train.columns if col not in non_feature_cols]

X = train[feature_cols].values
y = train[Target_col].values
X_test = test[feature_cols].values

# Scale every feature by its maximum absolute value; the scaler is fitted on
# the training matrix only and then applied to both matrices.
transformer = MaxAbsScaler().fit(X)
X = transformer.transform(X)
X_test = transformer.transform(X_test)

### Features selected for training

In [None]:
from sklearn.linear_model import Lasso, LinearRegression,Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.model_selection import StratifiedKFold,KFold

In [None]:
!pip install catboost

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn import linear_model
from catboost import CatBoostClassifier

### LGBM

In [None]:
def entreprenur_make1(sample_submission):
    """Set Is_Lead to 1 for every test row whose occupation is 'Entrepreneur'.

    Reads the module-level ``test`` frame and the fitted ``ole`` encoder to
    find the integer code of 'Entrepreneur', then writes 1 into the
    'Is_Lead' column of ``sample_submission`` at the index labels of the
    matching ``test`` rows. The frame is modified in place and also returned.
    """
    entrepreneur_code = list(ole.classes_).index('Entrepreneur')
    is_entrepreneur = test['Occupation'] == entrepreneur_code
    entrepreneur_rows = test[is_entrepreneur].index
    sample_submission.loc[entrepreneur_rows, 'Is_Lead'] = 1
    return sample_submission

In [None]:
# Stratified K-fold cross-validation of a LightGBM classifier.
# Each fold fills its slice of the out-of-fold vector `oobs` and adds its
# test-set probabilities to `predictions`, which is averaged over the folds
# after the loop.
# The fold count lives in one place so the splitter and the averaging
# divisor cannot drift apart (both were previously a hard-coded 5).
n_folds = 5

predictions = np.zeros(test.shape[0])
oobs = np.zeros(train.shape[0])
y = train[Target_col].values
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
params = {'n_estimators': 10000, 'n_jobs': -1, 'random_state': 2, 'learning_rate': 0.014564209621859385, 'colsample_bytree': 0.48762749309989595}

model = lgb.LGBMClassifier(**params)

for i, (train_id, valid_id) in enumerate(skf.split(X, y)):
    print("fold ", i)
    X_train, y_train = X[train_id], y[train_id]
    X_valid, y_valid = X[valid_id], y[valid_id]
    # Early stopping monitors AUC on this fold's validation split.
    # NOTE(review): newer LightGBM releases are reported to drop the
    # early_stopping_rounds / verbose fit arguments in favour of callbacks;
    # confirm against the installed version before upgrading.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=200, verbose=1000, eval_metric='auc')

    oobs[valid_id] = model.predict_proba(X_valid)[:, 1]
    predictions += model.predict_proba(X_test)[:, 1]

# Score the out-of-fold predictions over the whole training set.
roc_auc = roc_auc_score(y, oobs)

# Average the per-fold test probabilities and write the submission file.
finals = predictions / n_folds
lgb_predictions = finals
sample_submission[Target_col] = finals
sample_submission.to_csv('lgb_final.csv', index=False)
print("ROCAUC", roc_auc)

# Keep and persist the out-of-fold predictions.
lgb_oobs = oobs
print("best oob lightgbm")
pd.DataFrame(data=oobs).to_csv('oob_lgb.csv')

## Submit lgb_final.csv at the end
## This gave me a ROC AUC of 0.8507474057