In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

*importing training data

In [2]:
df = pd.read_csv('train_s3TEQDk.csv')

In [3]:
df.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [4]:
df.shape

(245725, 11)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


*By above code we got to know Credit_Product has Null values

In [6]:
unique_list = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active','Is_Lead']

*Checking unique entries 

In [7]:
for item in unique_list:
    print(df[item].unique())

['Female' 'Male']
['RG268' 'RG277' 'RG270' 'RG282' 'RG261' 'RG265' 'RG283' 'RG254' 'RG269'
 'RG257' 'RG279' 'RG280' 'RG252' 'RG284' 'RG259' 'RG281' 'RG258' 'RG266'
 'RG260' 'RG274' 'RG256' 'RG275' 'RG273' 'RG267' 'RG272' 'RG251' 'RG262'
 'RG264' 'RG278' 'RG276' 'RG263' 'RG250' 'RG255' 'RG253' 'RG271']
['Other' 'Salaried' 'Self_Employed' 'Entrepreneur']
['X3' 'X1' 'X2' 'X4']
['No' nan 'Yes']
['No' 'Yes']
[0 1]


In [8]:
df['Credit_Product'].isna().sum()

29325

In [9]:
df['Credit_Product'].value_counts()

No     144357
Yes     72043
Name: Credit_Product, dtype: int64

In [10]:
df['Credit_Product'].fillna(method='bfill',inplace=True)

*Null values imputation using backward fill method

In [11]:
df['Credit_Product'].value_counts()

No     163920
Yes     81805
Name: Credit_Product, dtype: int64

In [12]:
df.isna().sum()

ID                     0
Gender                 0
Age                    0
Region_Code            0
Occupation             0
Channel_Code           0
Vintage                0
Credit_Product         0
Avg_Account_Balance    0
Is_Active              0
Is_Lead                0
dtype: int64

In [13]:
df.duplicated().sum()

0

In [14]:
df_new = df.set_index('ID')
df_new.head()

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


*set iD as index of dataframe df_new. Now we will perform Label Encoding on Categorical Variables

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_list = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']
for i in cat_list:
    df_new[i] = le.fit_transform(df_new[i])

In [16]:
df_new.head()

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NNVBBKZB,0,73,18,1,2,43,0,1045696,0,0
IDD62UNG,0,30,27,2,0,32,0,581988,0,0
HD3DSEMC,0,56,18,3,2,26,0,1484315,1,0
BF3NC7KV,1,34,20,2,0,19,0,470454,0,0
TEASRWXV,0,30,32,2,0,33,0,886787,0,0


In [17]:
df_new['Is_Lead'].value_counts()

0    187437
1     58288
Name: Is_Lead, dtype: int64

*By applying value_counts on target variable wee got to know that training dataset is imbalanced.
So we will balance it.

In [18]:
frames = [df_new[df_new['Is_Lead'] == 0].iloc[:40000, :], df_new[df_new['Is_Lead'] == 1]]
frames = pd.concat(frames)
frames['Is_Lead'].value_counts()

1    58288
0    40000
Name: Is_Lead, dtype: int64

In [19]:
xf = frames.drop('Is_Lead',axis=1)
yf = frames['Is_Lead']

In [20]:
from sklearn.model_selection import train_test_split
xf_train,xf_test,yf_train,yf_test = train_test_split(xf,yf,test_size = 0.25,random_state = 12)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
log = LogisticRegression()
log.fit(xf_train,yf_train)
lr_pred = log.predict(xf_test)

In [22]:
print(accuracy_score(yf_test,lr_pred))
print(roc_auc_score(yf_test,lr_pred))

0.5941315318248412
0.5439064680954497


In [44]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
xgb_clf = XGBClassifier(colsample_bytree = 0.8, gamma = 5, max_depth = 5, min_child_weight = 10, subsample = 1.0)
xgb_clf.fit(xf_train,yf_train)
xgb_pred = xgb_clf.predict_proba(xf_test)[:,1]

In [45]:
'''print(accuracy_score(yf_test,xgb_pred))'''
print(roc_auc_score(yf_test,xgb_pred))

0.7938864218790371


In [25]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(xf_train,yf_train)
rf_pred = rf.predict(xf_test)

In [26]:
print(accuracy_score(yf_test,rf_pred))
print(roc_auc_score(yf_test,rf_pred))

0.7151636008464919
0.698239852861285


In [27]:
df_test = pd.read_csv('test_mSzZ8RL.csv')
df_test = df_test.set_index('ID')
df_test.head()

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No


In [28]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105312 entries, VBENBARO to F2NOYPPZ
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Gender               105312 non-null  object
 1   Age                  105312 non-null  int64 
 2   Region_Code          105312 non-null  object
 3   Occupation           105312 non-null  object
 4   Channel_Code         105312 non-null  object
 5   Vintage              105312 non-null  int64 
 6   Credit_Product       92790 non-null   object
 7   Avg_Account_Balance  105312 non-null  int64 
 8   Is_Active            105312 non-null  object
dtypes: int64(3), object(6)
memory usage: 8.0+ MB


In [29]:
df_test.isna().sum()

Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         12522
Avg_Account_Balance        0
Is_Active                  0
dtype: int64

In [30]:
df_test['Credit_Product'].fillna(method='bfill',inplace=True)
df_test.isna().sum()
print(df_test.shape)

(105312, 9)


In [31]:
df_test.duplicated().sum()

1

In [32]:
df_test.drop_duplicates()

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
CCMEWNKY,Male,43,RG268,Other,X2,49,No,925537,No
VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No
...,...,...,...,...,...,...,...,...,...
DBENJOYI,Male,52,RG268,Salaried,X2,86,Yes,4242558,Yes
CWQ72DWS,Male,55,RG277,Other,X2,86,Yes,1159153,No
HDESC8GU,Male,35,RG254,Salaried,X4,15,No,1703727,No
2PW4SFCA,Male,53,RG254,Other,X3,93,No,737178,Yes


In [33]:
for i in cat_list:
    df_test[i] = le.fit_transform(df_test[i])

In [34]:
predictions = log.predict(df_test)

In [35]:
submission = pd.DataFrame({'ID': df_test.index,
                    'Is_Lead': predictions})

submission.to_csv("submission_log.csv", index=False)

In [46]:
predictions_xgb = xgb_clf.predict_proba(df_test)[:,1]
predictions_xgb

array([0.2304947 , 0.56574845, 0.2540544 , ..., 0.73717713, 0.74211156,
       0.25470155], dtype=float32)

In [47]:
submission = pd.DataFrame({'ID': df_test.index,
                    'Is_Lead': predictions_xgb})

submission.to_csv("submission_xgb12.csv", index=False)

In [38]:
predictions_rf = rf.predict(df_test)

In [39]:
submission = pd.DataFrame({'ID': df_test.index,
                    'Is_Lead': predictions_rf})

submission.to_csv("submission_rf.csv", index=False)

In [42]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
xgb_gs = XGBClassifier()
grid = zip([xgb_gs],[params])

best_clf = None
# perform grid search and select the model with best cv set scores
for model_pipeline, param in grid:
    temp = GridSearchCV(model_pipeline, param_grid=param, cv=3, n_jobs=1)
    temp.fit(xf_train, yf_train)
    if best_clf is None:
        best_clf = temp
    else:
        if temp.best_score_ > best_clf.best_score_:
            best_clf = temp
print ("Best CV Score",best_clf.best_score_)
print ("Model Parameters",best_clf.best_params_)

Best CV Score 0.7405176623799447
Model Parameters {'colsample_bytree': 0.8, 'gamma': 5, 'max_depth': 5, 'min_child_weight': 10, 'subsample': 1.0}
