In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
sns.set(style="ticks", context="talk")
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
import optuna
from optuna.samplers import TPESampler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# For EDA
import statsmodels.api as sm
from scipy import stats

In [4]:
# Loading data from train, test and submission csv files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')

## Data cleaning and imputation 

In [5]:
# Function to check for duplicates 
def remove_duplicate(data):
    data.drop_duplicates(keep="first", inplace=True)
    return "Checked Duplicates"

In [6]:
remove_duplicate(train)

'Checked Duplicates'

In [7]:
# Define function to Impute missing values with a new dummy variable , this is done to preserve as much information as possible
def impute_nan_create_category(DataFrame,ColName):
     DataFrame[ColName] = np.where(DataFrame[ColName].isnull(),"Unknown",DataFrame[ColName])

In [8]:
# Apply on train 
for Columns in ['Credit_Product']:
    impute_nan_create_category(train,Columns)

In [9]:
# Apply on test 
for Columns in ['Credit_Product']:
    impute_nan_create_category(test,Columns)

In [10]:
# Function to display information about dataframe
data_types = ["float32","float64","int32","int64","object","category","datetime64[ns]"]
def display_data_information(data, data_types):
    data.info()
    print("\n")
    for VARIABLE in data_types :
        data_types = data.select_dtypes(include=[ VARIABLE ]).dtypes
    if len(data_types) > 0 :
        print(str(len(data_types))+" "+VARIABLE+" Features\n"+str(data_types)+"\n" )

In [11]:
# Display info about train
display_data_information(train, data_types)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       245725 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 22.5+ MB




In [12]:
# Display info about test
display_data_information(test, data_types)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105312 entries, 0 to 105311
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   105312 non-null  object
 1   Gender               105312 non-null  object
 2   Age                  105312 non-null  int64 
 3   Region_Code          105312 non-null  object
 4   Occupation           105312 non-null  object
 5   Channel_Code         105312 non-null  object
 6   Vintage              105312 non-null  int64 
 7   Credit_Product       105312 non-null  object
 8   Avg_Account_Balance  105312 non-null  int64 
 9   Is_Active            105312 non-null  object
dtypes: int64(3), object(7)
memory usage: 8.0+ MB




In [13]:
# Dropping the ID column 

train = train.drop('ID', axis=1)
test = test.drop('ID', axis=1)

In [14]:
train.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [15]:
test.head()

Unnamed: 0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,Male,29,RG254,Other,X1,25,Yes,742366,No
1,Male,43,RG268,Other,X2,49,Unknown,925537,No
2,Male,31,RG270,Salaried,X1,14,No,215949,No
3,Male,29,RG272,Other,X1,33,No,868070,No
4,Female,29,RG270,Other,X1,19,No,657087,No


## EDA Part 

### EDA using logistic regression 


In [16]:
train_eda = train 

In [17]:
X_eda = train_eda.drop('Is_Lead', axis=1)
y_eda = train_eda[['Is_Lead']]
X_eda = pd.get_dummies(X_eda, drop_first=True)

In [18]:
X_train_eda, X_test_eda, y_train_eda, y_test_eda = train_test_split(X_eda, y_eda, test_size=0.25)

### Build the logistic regression model:

In [19]:
logit = sm.Logit(y_train_eda, sm.add_constant(X_train_eda))
lg = logit.fit()

Optimization terminated successfully.
         Current function value: 0.366902
         Iterations 7


### Check the summary of the model 

In [20]:
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
print(lg.summary())

                           Logit Regression Results                           
Dep. Variable:                Is_Lead   No. Observations:               184293
Model:                          Logit   Df Residuals:                   184245
Method:                           MLE   Df Model:                           47
Date:                Mon, 31 May 2021   Pseudo R-squ.:                  0.3318
Time:                        18:51:10   Log-Likelihood:                -67618.
converged:                       True   LL-Null:                   -1.0119e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -3.7089      0.100    -36.971      0.000      -3.906      -3.512
Age                          0.0084      0.001     10.796      0.000       0.007       0.

### The pseudo r-square shows that only 33% of the entire variation in the data is explained by the model. It is really not a good model!

### We will calculate the odds ratio from the coefficients using the formula odds ratio=exp(coef). Next we will calculate the probability from the odds ratio using the formula probability = odds / (1+odds) and filter by decending odds ratio 

In [21]:
log_coef = pd.DataFrame(lg.params, columns=['coef'])
log_coef.loc[:, "Odds_ratio"] = np.exp(log_coef.coef)
log_coef['probability'] = log_coef['Odds_ratio']/(1+log_coef['Odds_ratio'])
log_coef['pval']=lg.pvalues
pd.options.display.float_format = '{:.2f}'.format

In [22]:
log_coef = log_coef.sort_values(by="Odds_ratio", ascending=False)
pval_filter = log_coef['pval']<=0.1
log_coef[pval_filter]

Unnamed: 0,coef,Odds_ratio,probability,pval
Credit_Product_Unknown,3.99,54.09,0.98,0.0
Credit_Product_Yes,1.61,4.99,0.83,0.0
Channel_Code_X2,0.95,2.59,0.72,0.0
Channel_Code_X4,0.86,2.36,0.7,0.0
Channel_Code_X3,0.81,2.26,0.69,0.0
Region_Code_RG255,0.36,1.43,0.59,0.0
Region_Code_RG263,0.35,1.42,0.59,0.0
Is_Active_Yes,0.33,1.39,0.58,0.0
Region_Code_RG269,0.33,1.39,0.58,0.0
Region_Code_RG277,0.31,1.36,0.58,0.0


If we analyze the data, we can see that the customers who have unknown credit product  have a 98% probability of becoming the lead while the ones having yes have 83% probability of becoming a lead.

Similarly, the customers in channel code X2 have odds of 2.59 to become a lead as compared to others , this empasise the importance of credit product. 

## Visual EDA

In [None]:
train['Is_Lead'] = le.fit_transform(train['Is_Lead'])
#train['Is_Active'] = le.fit_transform(train['Is_Lead'])

In [None]:
train['Is_Lead'].value_counts().plot.pie(autopct = '%1.1f%%',colors=['Orange','Blue'], figsize = (4,4))

### Data is Imbalanced. Only 23.7% of customers are likely to become lead.

In [None]:
# Check the distribution of target variable with respect to gender
sns.countplot(train['Gender'], hue = train['Is_Lead'],palette=['Orange','Purple'])

### There are more leads in male customers 

In [None]:
# Check the distribution of Age
f,ax = plt.subplots(nrows=2,ncols=1,figsize=(20,10))
axx = ax.flatten()
#plt.figure(figsize=(30,10))
sns.distplot(train['Age'],ax=axx[0], color='Blue')
sns.boxplot(train['Age'],ax=axx[1],color='Orange')

### Most of the people are in age group of 30-55

In [None]:
# Based on this observation we divide age into bins
age_grp_20_to_30 = train[ train['Age'] <31]
age_grp_31_to_40 = train[ train['Age'].between(31,40)]
age_grp_41_to_50 = train[ train['Age'].between(41,50)]
age_grp_50_to_60 = train[ train['Age'].between(51,60)]
age_grp_old = train[ train['Age'] >60]

age_grp = [age_grp_20_to_30,age_grp_31_to_40,age_grp_41_to_50,age_grp_50_to_60,age_grp_old]
age_grp_name = ['age_grp_20_to_30','age_grp_31_to_40','age_grp_41_to_50','age_grp_50_to_60','age_grp_old']
age_grp_dict = dict(zip(age_grp_name, age_grp))

In [None]:
# Check the distribution of target variable with respect to age  
f,ax = plt.subplots(nrows=2, ncols=3, figsize = (20,10))
axx = ax.flatten()
for pos,tup in enumerate(age_grp_dict.items()):
    axx[pos].set_title(tup[0])
    data = tup[1]
    data['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%', ax = axx[pos],colors=['Red','Blue'])

In [None]:
f,ax = plt.subplots(nrows=2, ncols=3, figsize = (20,10))
axx = ax.flatten()
plt.title('Lead Percentage of Different Age Groups with Genders',fontsize=25,x=-0.5,y=2.5)
for pos,tup in enumerate(age_grp_dict.items()):
    axx[pos].set_title(tup[0])
    temp = tup[1]
    temp.groupby('Gender')['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%', ax = axx[pos],colors=['Orange','Purple'])

### Age group 50-60 and 41-50 and age group old , males seem to be the dominant customers 

In [None]:
train['Credit_Product'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red', 'Yellow'])

### Distribution of credit product 

In [None]:
f,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,5))
axx = ax.flatten()
#plt.title('Driving_License wise Response',fontsize=40,x=-0.5,y=2)
axx[0].set_title('Credit_Product = Yes')
axx[1].set_title('Credit_Product = No')
axx[2].set_title('Credit_Procuct = Unknown')
train[ train['Credit_Product'] == 'Yes']['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red'],ax=axx[0])
train[ train['Credit_Product'] == 'No']['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red'],ax=axx[1])
train[ train['Credit_Product'] == 'Unknown']['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red'],ax=axx[2])

###  This provides a revealing information that the missing credit product column contains 85.2% lead , dropping it would have been a mistake  

In [None]:
f,ax = plt.subplots(nrows=1,ncols=2,figsize = (20,5))
axx = ax.flatten()
#plt.title('Driving_License wise Response',fontsize=40,x=-0.5,y=2)
axx[0].set_title('Is_Active = Yes')
axx[1].set_title('Is_Active = No')
train[ train['Is_Active'] == 'Yes']['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red'],ax=axx[0])
train[ train['Is_Active'] == 'No']['Is_Lead'].value_counts().plot.pie(autopct='%1.1f%%',colors = ['Blue','Red'],ax=axx[1])

### Is_active is present in rough;y same manner in trget distribution 


In [None]:
plt.figure(figsize = (40,5))
plt.title('Region Wise Response Count',fontsize=25)
sns.countplot(train['Region_Code'], hue = train['Is_Lead'],palette=['Red','Blue'])

### Region code 283 and 254, 284, 268 have highest number of customers 

In [None]:
u_region = train['Region_Code'].unique()
region_perc = {}
for i in u_region:
    total_region = train[ train['Region_Code'] == i].shape[0]
    buy_region = train[ (train['Region_Code'] == i) & train['Is_Lead'] == 1].shape[0]
    region_perc[i] = (buy_region/total_region)*100

region_perc = sorted(region_perc.items(), key=lambda x: x[1], reverse=True)
region_perc = list(zip(*region_perc))

region = np.array(region_perc[0])
region_perc = np.array(region_perc[1])
region = pd.DataFrame(region)
region_perc = pd.DataFrame(region_perc)

region_res_perc = pd.concat((region,region_perc), axis=1)
region_res_perc.columns = ['Region_Code', 'Buy_Percentage']

In [None]:
plt.figure(figsize=(40,10))
plt.title('Region Wise lead conversion  Percentage',fontsize=25)
ax = sns.barplot(x = region_res_perc['Region_Code'], y = region_res_perc['Buy_Percentage'])

### Which regions have highest lead  percentage

In [None]:
plt.figure(figsize=(15,5))
sns.countplot(train['Credit_Product'],hue=train['Is_Lead'],palette=['Brown','Purple'])

### Importance of Credit_product variable is visible and the missing values contain a lot of leads 

In [None]:
plt.figure(figsize=(7,7))
train['Occupation'].value_counts().plot.pie(autopct='%1.1f%%', colors = ['r', 'b', 'g', 'y'])

### Distribution of Occupation 

In [None]:
plt.figure(figsize = (30,5))
sns.countplot(train['Occupation'], hue = train['Is_Lead'])

### Self Employed people have the most number of leads 

In [None]:
ls = train['Occupation'].unique()

In [None]:
f,ax = plt.subplots(nrows=1, ncols=4,figsize = (20,7))
axx = ax.flatten()
for pos,val in enumerate(ls):
    axx[pos].set_title(str(val))
    train[ train['Occupation'] == val]['Is_Lead'].value_counts().plot.pie(autopct = '%1.1f%%',ax = axx[pos], colors=['Purple', 'Orange'])

### Entrepreneures have most leads whereas salaried the least 

In [None]:
sns.countplot(train['Is_Active'], hue = train['Is_Lead'])

### Number of leads is equally distributed in both active and not active 

In [None]:
f,ax = plt.subplots(nrows=2,ncols=1,figsize=(20,7))
axx = ax.flatten()
#plt.figure(figsize=(30,10))
sns.distplot(train['Avg_Account_Balance'],ax=axx[0], color='Blue')
sns.boxplot(train['Avg_Account_Balance'],ax=axx[1],color='Orange')

### Average account balance is highly skwed 

In [None]:
plt.figure(figsize=(20,7))
sns.distplot(train[ train['Avg_Account_Balance'] < 100000]['Avg_Account_Balance'])#.plot.hist(bins = 500, frequency=(0,10000))

In [None]:
plt.figure(figsize=(20,7))
train['Channel_Code'].value_counts().plot.bar()

### Channel code X4 has least values 

In [None]:
f,ax = plt.subplots(nrows=2,ncols=1,figsize=(20,8))
axx = ax.flatten()
sns.distplot(train['Vintage'],ax=axx[0], color='Blue')
sns.boxplot(train['Vintage'],ax=axx[1],color='Orange')

In [None]:
# To combine and plot xcorr
train_copy =  train 
test_copy = test 
train_copy['is_train'] = 1
test_copy['is_train'] = 0
test_copy['Is_Lead'] = None

In [None]:
data = pd.concat((train_copy,test_copy))
#data.set_index('ID',inplace=True)
data.shape

In [None]:
sns.boxplot('Age', data=data, orient='v', color='Red')

In [None]:
sns.boxplot('Avg_Account_Balance', data=data,orient='v', color='red')

In [None]:
f,ax = plt.subplots(nrows=1,ncols=2,figsize = (20,7))
axx = ax.flatten()
sns.kdeplot(data['Avg_Account_Balance'], legend=False,ax = axx[0])
sns.kdeplot(np.log(data['Avg_Account_Balance']), legend=False,ax = axx[1]) # after using log transformation

In [None]:
corr_check = data.copy()

col_ls = ['Gender', 'Occupation', 'Is_Active', 'Region_Code', 'Channel_Code', 'Credit_Product']

for col in col_ls:
    corr_check[col] = le.fit_transform(corr_check[col])

In [None]:
plt.figure(figsize=(15,7))
sns.heatmap(corr_check.corr(), annot=True, square=True,annot_kws={'size': 10})

### Channel code and vintage are positively related 

## Convert categorical columns , assign labels 

In [None]:
train['Gender']=train['Gender'].replace({'Male':1,'Female':0})
train['Occupation']=train['Occupation'].replace({'Other':0,'Salaried':1,'Self_Employed':2, 'Entrepreneur':3})
train['Occupation']=train['Occupation'].astype(int)
train['Region_Code']=train['Region_Code'].replace({'RG268':0,'RG277':1,'RG270':2,'RG282':3,'RG261':4, 'RG265':5, 'RG283':6, 'RG254':7,
                                                              'RG269':8, 'RG257':9, 'RG279':10, 'RG280':11, 'RG252':12, 'RG284':13, 'RG259':14,
                                                              'RG281':15, 'RG258':16, 'RG266':17, 'RG260':18, 'RG274':19, 'RG256':20, 'RG275':21,
                                                              'RG273':22, 'RG267':23, 'RG272':24, 'RG251':25, 'RG262':26, 'RG264':27, 'RG278':28,
                                                              'RG276':29, 'RG263':30, 'RG250':31, 'RG255':32, 'RG253':33, 'RG271':34})
train['Channel_Code']=train['Channel_Code'].replace({'X3':2,'X1':0,'X2':1,'X4':3})
train['Credit_Product']=train['Credit_Product'].replace({'No':0,'Yes':1, 'Unknown':2})
train['Is_Active']=train['Is_Active'].replace({'No':0,'Yes':1})

In [None]:
test['Gender']=test['Gender'].replace({'Male':1,'Female':0})
test['Occupation']=test['Occupation'].replace({'Other':0,'Salaried':1,'Self_Employed':2, 'Entrepreneur':3})
test['Occupation']=test['Occupation'].astype(int)
test['Region_Code']=test['Region_Code'].replace({'RG268':0,'RG277':1,'RG270':2,'RG282':3,'RG261':4, 'RG265':5, 'RG283':6, 'RG254':7,
                                                              'RG269':8, 'RG257':9, 'RG279':10, 'RG280':11, 'RG252':12, 'RG284':13, 'RG259':14,
                                                              'RG281':15, 'RG258':16, 'RG266':17, 'RG260':18, 'RG274':19, 'RG256':20, 'RG275':21,
                                                              'RG273':22, 'RG267':23, 'RG272':24, 'RG251':25, 'RG262':26, 'RG264':27, 'RG278':28,
                                                              'RG276':29, 'RG263':30, 'RG250':31, 'RG255':32, 'RG253':33, 'RG271':34})
test['Channel_Code']=test['Channel_Code'].replace({'X3':2,'X1':0,'X2':1,'X4':3})
test['Credit_Product']=test['Credit_Product'].replace({'No':0,'Yes':1, 'Unknown':2})
test['Is_Active']=test['Is_Active'].replace({'No':0,'Yes':1})

In [None]:
test.head()

In [None]:
### Create list of categorical features 

In [None]:
features=['Gender', 'Age', 'Region_Code', 'Occupation', 'Channel_Code', 'Vintage', 'Credit_Product', 'Avg_Account_Balance', 'Is_Active']
cat_col=['Gender','Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active' ]

### Create test train data

In [None]:
X = train.drop('Is_Lead', axis=1)
y = train[['Is_Lead']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=150303,stratify=y,shuffle=True)

# Model Building, Training and Optimization 

### Initialize catboost classifier 

In [None]:
catb = CatBoostClassifier()
catb= catb.fit(X_train, y_train,cat_features=cat_col,eval_set=(X_test, y_test),early_stopping_rounds=30,verbose=100)
y_pred = catb.predict(X_test)
proba = catb.predict_proba(X_test)[:, 1]
print('CatBoost Base Accuracy : {}'.format(accuracy_score(y_test,y_pred)))
print('CatBoost Base ROC_AUC_SCORE: {}'.format(roc_auc_score(y_test,proba)))

### Initialize base LGBM classifier

In [None]:
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:,1]

In [None]:
# Base model metrics 
print('LGBM Base Accuracy : {}'.format(accuracy_score(y_test,y_pred)))
print('LGBM Base ROC_AUC_SCORE: {}'.format(roc_auc_score(y_test,proba)))

### Tune Model 

In [None]:
def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 30)
    n_estimators = trial.suggest_int("n_estimators", 1, 500)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0000001, 1)
    num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 10)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = LGBMClassifier(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth,
        num_leaves=num_leaves, 
        min_child_samples=min_child_samples,
        random_state=0
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:,1]
    score = roc_auc_score(y_test,proba)
    return score

study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=50)

lgb_params = study.best_params
lgb_params['random_state'] = 0
lgb = LGBMClassifier(**lgb_params)
lgb.fit(X_train, y_train)
proba = lgb.predict_proba(X_test)[:,1]
print('Optimized LightGBM roc_auc_score', roc_auc_score(y_test, proba))

In [None]:
lgb

### Fit Tuned model 

In [None]:
LGBM = lgb
LGBM.fit(X, y)
y_pred = LGBM.predict(X_test)
proba = LGBM.predict_proba(X_test)[:,1]

In [None]:
print('LGBM Tuned Accuracy : {}'.format(accuracy_score(y_test,y_pred)))
print('LGBM Tuned ROC_AUC_SCORE: {}'.format(roc_auc_score(y_test,proba)))

In [None]:
test = test.drop(['Is_Lead'], axis=1)

### Stacking the models 

In [None]:
LGBM_proba = LGBM.predict_proba(test)[:, 1] # Class 1 probability of LGBM model
cat_proba = catb.predict_proba(test)[:, 1] # Class 1 probability of CatBoost model

In [None]:
submit_proba = ((LGBM_proba * 0.45) + (cat_proba * 0.55))/2
sample_sub['Is_Lead'] = submit_proba

## Prepare submission 

In [None]:
##sample_sub.to_csv('sixth_submission.csv', index=False) 