In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load Dataset

In [None]:
# Both CSV files are read from the same input directory.
input_dir = "../input/jobathon-may-2021-credit-card-lead-prediction"
train = pd.read_csv(input_dir + "/train.csv")
test = pd.read_csv(input_dir + "/test.csv")
# Bare expression: the notebook renders the training frame as the cell output.
train

## Dataset Information

In [None]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
train.describe(include='all')

## Check and Handle Null Values

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
## Rows without a 'Credit_Product' value are discarded rather than imputed.
train = train.dropna(subset=['Credit_Product'])
# Re-count the missing values per column after the drop.
train.isna().sum()

In [None]:
## Renumber the rows from 0 after the null rows were removed; the old index is discarded.
## No column is dropped here, so 'ID' stays in the frame.
train = train.reset_index(drop=True)
train.head()

## Visualization

In [None]:
# Count plots of every categorical feature, split by the 'Is_Lead' target.
# Rows with a missing 'Credit_Product' were dropped earlier, so the plots only
# show observed categories.
cat_features = ['Gender','Region_Code','Occupation','Channel_Code','Credit_Product','Is_Active']

plt.figure(figsize=(16, 14))
sns.set(font_scale=1.2)
sns.set_style('ticks')

# One panel per feature on a 3x2 grid; subplot positions are 1-based.
for panel, feature in enumerate(cat_features, start=1):
    plt.subplot(3, 2, panel)
    sns.countplot(data=train, x=feature, hue='Is_Lead', palette='rainbow')
    # Rotate the Region_Code tick labels (presumably to keep its many codes legible).
    if feature == 'Region_Code':
        plt.xticks(rotation=90)

sns.despine()

In [None]:
# Count of each 'Age' value per 'Is_Lead' class, used to judge whether age groups exist.
temp = train.copy()
plt.figure(figsize=(16, 7))
sns.countplot(data=temp, x='Age', hue='Is_Lead', palette='autumn')
plt.show()

In [None]:
# Numerical columns; their distributions are plotted here, split by 'Is_Lead'.
numerical = ['Age', 'Vintage', 'Avg_Account_Balance']
sns.pairplot(
    data=train,
    x_vars=numerical,
    hue='Is_Lead',
    palette='Set2',
)

In [None]:
# Same pair plot after taking the natural log of every numerical column.
log_columns = {col: np.log(train[col]) for col in numerical}
# assign() returns a copy of train with those columns replaced in their original position.
temp = train.assign(**log_columns)
sns.pairplot(data=temp, x_vars=numerical, hue='Is_Lead', palette='Set2')

We should apply a log transformation to "Avg_Account_Balance".

## Data Preprocessing

First, we convert the categorical features to numerical codes.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Untouched copy of the cleaned training frame, kept for later look-ups by row index.
X_orig = train.copy()

# Integer-encode each categorical column with a single encoder that is re-fitted per
# column; the codes are stored as floats.
label_encoder = LabelEncoder()
encoded_columns = {
    name: label_encoder.fit_transform(train[name]).astype(float)
    for name in cat_features
}
X = pd.DataFrame(encoded_columns, columns=cat_features)

# Append the numerical columns, log-transforming only 'Avg_Account_Balance'.
numeric_part = train[numerical].copy()
numeric_part['Avg_Account_Balance'] = np.log(train['Avg_Account_Balance'])
X = pd.concat([X, numeric_part], axis=1)

# Features plus target in one frame; Y is kept as a one-column DataFrame.
data = pd.concat([X, train['Is_Lead']], axis=1)
Y = data.iloc[:, -1:]

### Check Distribution of Data

In [None]:
# Bar chart of how many rows fall into each 'Is_Lead' class.
sns.countplot(x = "Is_Lead",data = data)

We must balance the data first.

### Train - Test Split

In [None]:
# Row positions are split alongside X and Y so hold-out rows can be traced back later.
indices = range(X.shape[0])
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    X, Y, indices,
    test_size=0.25,
    stratify=Y,       # keep the class ratio of Y in both splits
    random_state=123,
)

### Standardize Data

In [None]:
# Unscaled copy of the hold-out features, taken before the columns are overwritten.
X_test_raw = X_test.copy()

# Fit the scaler on the training split only, then reuse it for the hold-out split.
scaler = StandardScaler()
scaled_numfeats_train = pd.DataFrame(
    scaler.fit_transform(X_train[numerical]),
    index=X_train.index,
    columns=numerical,
)
scaled_numfeats_test = pd.DataFrame(
    scaler.transform(X_test[numerical]),
    index=X_test.index,
    columns=numerical,
)

# Replace the raw numerical columns with their standardized versions.
for frame, scaled in ((X_train, scaled_numfeats_train), (X_test, scaled_numfeats_test)):
    for col in numerical:
        frame[col] = scaled[col]

### Balancing data

In [None]:
from imblearn.over_sampling import SMOTENC

# The categorical features occupy the leading columns of X_train; the numerical
# ones come last, so the categorical positions are 0 .. n_categorical - 1.
n_categorical = X_train.shape[1] - len(numerical)
smote_nc = SMOTENC(
    categorical_features=list(range(n_categorical)),
    random_state=0,
)
# Resample the training split only; the hold-out split is left untouched.
X_train, y_train = smote_nc.fit_resample(X_train, y_train)

In [None]:
# Class counts of the training target after resampling.
y_train.value_counts()

## Machine Learning Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
# Every candidate estimator, keyed by the name that is printed during evaluation.
candidate_models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(algorithm = 'kd_tree', n_jobs = 1,
                                                 n_neighbors = 1, weights = 'uniform'),
    'SVC': SVC(C=.5, gamma = 0.1, kernel = 'rbf'),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(n_estimators = 1000),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': xgb.XGBClassifier(),
}

# Names of the candidates that are actually trained. The previous
# dict(zip(key, value)) paired this one-element list with eight estimators, so zip
# silently discarded the other seven; the selection is now explicit.
# Add names from candidate_models here to evaluate more models.
key = ['LogisticRegression']
# Kept for backward compatibility: the full list of candidate estimators.
value = list(candidate_models.values())
models = {name: candidate_models[name] for name in key}
print(models)

In [None]:
# Fit each selected model on the training data and collect its hold-out accuracy.
predicted = []
for name, model in models.items():
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    acc = accuracy_score(y_test, predict)
    predicted.append(acc)
    print(name, acc)

## Deep Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

## Results

## Prediction Using XGB

In [None]:
# XGBoost classifier configured with the 'multi:softprob' objective for two classes.
model = xgb.XGBClassifier(objective='multi:softprob', num_class=2)
model.fit(X_train, y_train)

# Hard class predictions on the hold-out split and the resulting accuracy.
predict = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predict)
print('XGB Accuracy: ',acc)

In [None]:
# Class probabilities for the hold-out split; both names are bound to the same array.
predict = model.predict_proba(X_test)
tmp = predict
# Column 1 of the probability array (presumably the probability of Is_Lead == 1).
tmp[:, 1]

### Check and Handle Missing Values

In [None]:
# Number of missing values in each column of the unseen (test.csv) frame.
test.isna().sum()

In [None]:
## Rows of the unseen data without a 'Credit_Product' value are discarded as well.
test = test.dropna(subset=['Credit_Product'])
# Re-count the missing values per column after the drop.
test.isna().sum()

In [None]:
## Remove the 'ID' column and renumber the remaining rows from 0.
test = test.drop(columns=['ID']).reset_index(drop=True)
test.head()

## Preprocessing Data Before Predicting the Unseen Data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the unseen data with encoders fitted on the TRAINING columns, so every
# category gets the same integer code the models were trained on. Re-fitting on
# the unseen columns (as was done before) can shift the codes whenever the two
# files do not contain exactly the same set of categories.
label_encoder = LabelEncoder()
encoded_unseen = {}
for name in cat_features:
    label_encoder.fit(train[name])
    # transform() raises ValueError if the unseen data holds a category that never
    # occurs in train -- an explicit failure instead of silently wrong codes.
    encoded_unseen[name] = label_encoder.transform(test[name]).astype(float)
X_unseen = pd.DataFrame(encoded_unseen, columns=cat_features)

# Append the numerical columns, log-transforming only 'Avg_Account_Balance'
# (the same treatment the training features received).
numeric_unseen = test[numerical].copy()
numeric_unseen['Avg_Account_Balance'] = np.log(test['Avg_Account_Balance'])
X_unseen = pd.concat([X_unseen, numeric_unseen], axis=1)
X_unseen.head()

## Standardize Before Predicting the Unseen Data

In [None]:
# Scale the unseen data with the statistics learned from the training split.
# transform() is used instead of fit_transform(): re-fitting here would standardize
# the unseen data with its own mean/variance and overwrite the fitted `scaler`.
scaled_numfeats_unseen = pd.DataFrame(
    scaler.transform(X_unseen[numerical]),
    columns=numerical,
    index=X_unseen.index,
)
# Replace the raw numerical columns with their standardized versions.
for col in numerical:
    X_unseen[col] = scaled_numfeats_unseen[col]
X_unseen.head()

## Prediction 'Is_Lead' of Unseen Data

In [None]:
# Hold-out targets, with the original row label moved into an 'index' column.
d1 = pd.DataFrame(y_test.values, columns=['target'], index=y_test.index).reset_index()
# Original (unencoded) rows of the hold-out split.
d2 = X_orig.loc[indices_test].reset_index()
# Model-ready hold-out features with a "_T" suffix on every column name.
# add_suffix() returns a renamed copy, so X_test keeps the feature names the
# model was fitted with and this cell can be re-run without stacking suffixes.
d3 = X_test.add_suffix("_T").reset_index()

print(d2.shape)
# All three frames were renumbered 0..n-1 in the same row order, so they are
# joined position by position on that fresh index.
temp = d2.merge(d1, left_index=True, right_index=True)
temp = d3.merge(temp, left_index=True, right_index=True)
temp.tail()

In [None]:
# Hard labels (as a column vector) and column 1 of predict_proba for the hold-out rows.
predicted = model.predict(X_test).reshape(-1, 1)
predicted_proba = model.predict_proba(X_test)[:, 1]

# Put both score columns next to the merged hold-out frame and write it to disk.
scores = pd.DataFrame({
    'prediction_prob': predicted_proba,
    'prediction': predicted.ravel(),
})
final = pd.concat([temp, scores], axis=1)
final.to_csv("otuput_data_test.csv", index=False)

In [None]:
# Show the original 'Gender' value next to the "_T"-suffixed model feature for the last rows.
final[["Gender","Gender_T"]].tail()

In [None]:
# List every column of the exported frame.
final.columns

In [None]:
# Accuracy recomputed from the assembled frame, followed by the first ten scores.
acc = accuracy_score(final['Is_Lead'], final['prediction'])
print('XGB Accuracy: ',acc)
print(predicted_proba[:10])

In [None]:
# Last rows of the hold-out feature frame.
X_test.tail()