# Import the data

In [None]:
!wget https://www.dropbox.com/sh/ld6fx87zdvlwxiz/AACbD2hgIL5CCzEY19nvXbpDa?dl=0 # Import the data from dropbox

In [None]:
!unzip /content/AACbD2hgIL5CCzEY19nvXbpDa?dl=0 # unzip the data

# Load the Data

In [None]:
# Import basic modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
train = pd.read_csv("/content/train_s3TEQDk.csv") # Read the train file
test = pd.read_csv("/content/test_mSzZ8RL.csv") # Read the test file

In [None]:
# Show first 10 rows of training set
train.head(10)

In [None]:
train.info() #Extract the basic information from the data like dtypes of features, number of non-null values

In [None]:
test.info()

**The Credit_Product feature has some null values.**

In [None]:
train.isnull().sum() # Check the count for null values in feature

In [None]:
test.isnull().sum()

In [None]:
# Names of the columns that contain at least one null value, for the train and test sets

null_columns_train = train.columns[train.isnull().any()].tolist()
null_columns_test = test.columns[test.isnull().any()].tolist()

# Print the percentage of null values in each column
def perc_null_vals(null_columns, data, dataset_type = None):
  """Print, for each listed column, the share of its values that are null.

  Args:
    null_columns: iterable of column names of ``data`` to report on.
    data: DataFrame the columns are read from.
    dataset_type: label inserted into the message (e.g. "train" or "test").

  Returns:
    None; one line per column is written to stdout.
  """
  message = "The percentage of null values in {} in the {} set is {:.3f}%"
  n_rows = data.shape[0] # denominator: number of rows in the dataset
  for name in null_columns:
    n_missing = data[name].isnull().sum() # nulls found in this column
    print(message.format(name, dataset_type, float(n_missing)*100/n_rows))

# Call the above function
perc_null_vals(null_columns_train, train, "train")
perc_null_vals(null_columns_test, test, "test")

**Nearly the same percentage of null values is present in both datasets.**

# EDA on trainset

In [None]:
train.head()

In [None]:
train['Occupation'].value_counts()

In [None]:
# Check for unique values in the categorical columns
cat_cols = [cols for cols in train.select_dtypes('O').columns]

def print_unique_vals(data, columns):
  """Print the distinct values of each given column together with their counts.

  Args:
    data: DataFrame the columns are read from.
    columns: iterable of column names to summarise.

  Returns:
    None; for every column a header line, its value counts and a blank line
    are written to stdout.
  """
  for name in columns:
    print("Unqiue values in {} are".format(name))
    # end="\n\n" leaves one blank line between consecutive columns
    print(data[name].value_counts(), end="\n\n")

# Call the above method
print_unique_vals(train, cat_cols)

In [None]:
# Check similarly for test set
cat_cols_test = [col for col in test.select_dtypes('O').columns]
print_unique_vals(test, cat_cols_test)

**Every categorical column other than ID and Region_Code has the same subcategories in both the train and test sets.**

In [None]:
# Add a Yes/No text version of the target so the pairplot legend is readable
train['Is_Lead_text'] = np.where(train['Is_Lead'] == 1, "Yes", "No")

# Separate frame that carries the text label instead of the numeric target
train_copy = train.drop('Is_Lead', axis=1)

# Pairwise relationships between the features, coloured by the text label
sns.pairplot(train_copy, hue = "Is_Lead_text")
plt.show()

In [None]:
# Names of the integer-typed (numerical) columns
num_cols = list(train_copy.select_dtypes('int'))

# Histograms showing the distribution of the numerical features
train_copy.hist(figsize=(12,8))
plt.show()

# Class balance of the target
sns.countplot(x = train_copy['Is_Lead_text'])
plt.grid()
plt.show()

# Target counts broken down by each categorical feature, one figure per feature
for hue_col in ['Gender', 'Is_Active', 'Occupation', 'Channel_Code', 'Credit_Product']:
  sns.countplot(x = train_copy['Is_Lead_text'], hue=train_copy[hue_col])
  plt.grid()
  plt.show()

In [None]:
train_copy.head()

In [None]:
# Box plots of each numerical feature grouped by Credit_Product, one figure per feature
for feature in ['Avg_Account_Balance', 'Age', 'Vintage']:
  sns.boxplot(x=train_copy['Credit_Product'], y=train_copy[feature])
  plt.show()



1.   Those between age 55-60 or Vintage between 60-80 have a Credit_Product.
2.   Those between age 30-40 do not have a Credit_Product.



In [None]:
# Fill na in credit_product
def fill_na(data, null_indices):
  """Impute Credit_Product in ``data`` in place, using rules based on Age and Vintage.

  For the rows listed in ``null_indices``:
    * Age in 55-60 and Vintage in 60-80 (both inclusive) -> "Yes"
    * otherwise, Age in 30-40 (inclusive)                -> "No"
  Rows matching neither rule are left untouched.

  Args:
    data: DataFrame with 'Age', 'Vintage' and 'Credit_Product' columns.
    null_indices: index labels of the rows to impute (e.g. ``data[mask].index``).

  Returns:
    None; ``data`` is modified in place.
  """
  # Read from the frame that was passed in (not a global), so that calling this
  # on the test set fills the test set rather than the training set.
  age = data.loc[null_indices, 'Age']
  vintage = data.loc[null_indices, 'Vintage']

  # Vintage range is 60-80; an upper bound below the lower bound can never match.
  has_product = (age >= 55) & (age <= 60) & (vintage >= 60) & (vintage <= 80)
  no_product = ~has_product & (age >= 30) & (age <= 40)

  # Label-based .loc assignment writes to ``data`` itself (no chained indexing)
  data.loc[has_product[has_product].index, 'Credit_Product'] = "Yes"
  data.loc[no_product[no_product].index, 'Credit_Product'] = "No"


# Index labels of the training rows whose Credit_Product is missing
null_indices = train.index[train['Credit_Product'].isnull()]

# Impute those rows with the rule-based helper defined above
fill_na(train, null_indices=null_indices)

In [None]:
train.isnull().sum()

In [None]:
train['Credit_Product'].value_counts()

In [None]:
# For the rest of the null values introduce a new category of "Unknown"
# (same spelling as the label used for the test set, so both sets share one category)
train.fillna('Unknown', inplace=True)

In [None]:
# Apply the same rule-based imputation (fill_na) to the test set
null_indices_test = test.index[test['Credit_Product'].isnull()]
fill_na(test, null_indices_test)

In [None]:
test.isnull().sum() # this logic did not work on test set, fill the null vlaues with unknown brand

In [None]:
test.fillna('Unknown', inplace=True)

In [None]:
# Check for null values
train.isnull().sum()

In [None]:
test.isnull().sum()

# Data preprocessing

In [None]:
train.head(10)

In [None]:
# Drop the is_lead_text column, ID and Region_Code
train.drop(['ID', 'Region_Code', 'Is_Lead_text'], axis=1, inplace=True)

In [None]:
# Encode the columns
cat_cols_updated = [cols for cols in train.select_dtypes('O').columns]
cat_cols_updated

In [None]:
from sklearn.preprocessing import LabelEncoder

def encoder(data, cat_cols):
  """Label-encode the given categorical columns of ``data`` in place.

  A fresh LabelEncoder is fitted for every column on each call, so the integer
  codes produced for two different frames only agree when both frames contain
  the same set of category values in that column.

  Args:
    data: DataFrame whose columns are overwritten with integer codes.
    cat_cols: iterable of column names to encode.

  Returns:
    The same ``data`` object, with the listed columns replaced by their codes.
  """
  for col in cat_cols:
    le = LabelEncoder()
    # LabelEncoder expects a 1-D array of labels; passing an (n, 1) column
    # vector only works via an implicit ravel and emits a DataConversionWarning.
    data[col] = le.fit_transform(data[col].values)
  return data


In [None]:
encoded_data = encoder(train, cat_cols_updated) # use the encoder function

In [None]:
test.head()

In [None]:
# Store the Sample ID of the test set
test_ID = test['ID']

In [None]:
test.drop(['ID', 'Region_Code'],axis=1, inplace=True)
cat_cols_test_updated = [cols for cols in test.select_dtypes('O').columns]

In [None]:
cat_cols_test_updated

In [None]:
# Call the encoder funciton
encoded_data_test = encoder(test, cat_cols_test_updated)

In [None]:
encoded_data_test.head()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(encoded_data.corr(), annot=True)
plt.show()

In [None]:
# Scale the training and testing data
from sklearn.preprocessing import MinMaxScaler

def scaled_data(data):
  """Min-max scale every column of ``data`` in place and return the same frame.

  Each column is rescaled by its own freshly fitted MinMaxScaler, i.e. using
  the minimum and maximum found in ``data`` itself.

  Args:
    data: DataFrame of numeric columns; it is overwritten column by column.

  Returns:
    The same ``data`` object with all columns rescaled.
  """
  for name in data.columns:
    # the scaler works on 2-D input, so present the column as shape (n_rows, 1)
    column_vector = data[name].values.reshape((-1,1))
    data[name] = MinMaxScaler().fit_transform(column_vector)
  return data

In [None]:
# Fit the scaler on the training features only and reuse it for the test set, so
# both sets are mapped with the same per-column min/max. Scaling each set on its
# own would turn the same raw value into different scaled values in train and test.
train_features = encoded_data.drop('Is_Lead', axis=1)
feature_scaler = MinMaxScaler().fit(train_features)

final_training_data = pd.DataFrame(feature_scaler.transform(train_features),
                                   columns=train_features.columns,
                                   index=train_features.index)

# Select the test columns in training order so they line up with the fitted scaler
test_features = encoded_data_test[train_features.columns]
final_testing_data = pd.DataFrame(feature_scaler.transform(test_features),
                                  columns=test_features.columns,
                                  index=test_features.index)

In [None]:
final_training_data.head()

In [None]:
len(final_training_data.columns)

In [None]:
len(final_testing_data.columns)

# Machine Learning Modelling

In [None]:
final_training_data.head()

In [None]:
target_variable = encoded_data['Is_Lead'] # Store the target variable

In [None]:
# Model selection 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

# Split the training data
X_train, X_test, y_train, y_test = train_test_split(final_training_data, target_variable, test_size=0.2, random_state=42)

In [None]:
# Define a function to compute all the metrics
def compute_metrics(y_true, y_pred, y_score, model):
  """Collect accuracy, ROC AUC, precision and recall into a one-row DataFrame.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted class labels (used for accuracy, precision and recall).
    y_score: predicted scores/probabilities (used for ROC AUC).
    model: label used as the index of the returned row.

  Returns:
    DataFrame with one row indexed by ``model`` and the columns
    "Accuracy", "ROC_AUC_Score", "Precision" and "Recall".
  """
  scores = {
      "Accuracy": accuracy_score(y_true=y_true, y_pred=y_pred),
      "ROC_AUC_Score": roc_auc_score(y_true=y_true, y_score=y_score),
      "Precision": precision_score(y_true=y_true, y_pred=y_pred),
      "Recall": recall_score(y_true=y_true, y_pred=y_pred),
  }

  # One row per model so the frames of several models can be concatenated later
  return pd.DataFrame(scores, index=[model])

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV # Logistic Regression Model

# Model object
lg_clf = LogisticRegressionCV(cv=3, verbose=1, random_state=42, n_jobs=-1)

# fit the model
lg_clf.fit(X_train, y_train)

In [None]:
# Make predicitons on the test set and compute the metrics
predictions_1 = lg_clf.predict(X_test)
prediction_prob1 = lg_clf.predict_proba(X_test)
prediction_prob1 = prediction_prob1[ : ,1]

In [None]:
log_reg_results = compute_metrics(y_true = y_test, y_pred = predictions_1, y_score = prediction_prob1, model = "Logistic Regression")

In [None]:
log_reg_results

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Model object
dt_clf_default = DecisionTreeClassifier() # Default Model

# Fit the model
dt_clf_default.fit(X_train, y_train)

In [None]:
# Make predictions and compute metrics
predictions_2 = dt_clf_default.predict(X_test)
prediction_prob2 = dt_clf_default.predict_proba(X_test)
prediction_prob2 = prediction_prob2[ : ,1]

# Compute the metrics
dt_clf_default_metrics = compute_metrics(y_true=y_test, y_pred=predictions_2, y_score=prediction_prob2, model = "Decision Tree Default")
dt_clf_default_metrics

In [None]:
# Hyperparameter Tuning

# set parameters
criterion = ["gini", "entropy"]  # "gini" is the valid spelling; a misspelt value makes every sampled candidate using it fail
splitter = ["best", "random"]
max_depth = [None, 10, 20, 30]
min_samples_split = [2,3,4,5,6,8,9,10]
min_samples_leaf = [1,2,3,4,5]
# scikit-learn only accepts min_weight_fraction_leaf in [0.0, 0.5]
min_weight_fraction_leaf = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
# "auto" was an alias of "sqrt" for classifiers and has been removed from recent scikit-learn
max_features = ["sqrt", "log2", None]
class_weight = [None, "balanced"]


# Create a parameter grid
params = {"criterion" : criterion,
          "splitter" : splitter,
          "max_depth" : max_depth,
          "min_samples_split" : min_samples_split,
          "min_samples_leaf" : min_samples_leaf,
          "min_weight_fraction_leaf" : min_weight_fraction_leaf,
          "max_features" : max_features,
          "class_weight" : class_weight
          }

# Model Object
dt_clf = DecisionTreeClassifier(random_state=42)

# Randomised Search CV: 20 sampled candidates, 3-fold cross-validation
rscv_dt_clf = RandomizedSearchCV(dt_clf, params, n_iter=20, n_jobs=-1, cv=3, verbose=1, random_state=42)

# Fit the model
rscv_dt_clf.fit(X_train, y_train)

In [None]:
dt_best_estimator = rscv_dt_clf.best_estimator_

In [None]:
dt_best_estimator.fit(X_train, y_train)

In [None]:
# Make predicitons and compute metrics
predictions_3 = dt_best_estimator.predict(X_test)
prediction_prob3 = dt_best_estimator.predict_proba(X_test)
prediction_prob3 = prediction_prob3[ : , 1]

# Compute the metrics
dt_clf_best_est_results = compute_metrics(y_test, predictions_3, prediction_prob3, "Decision Tree Best Estimator")
dt_clf_best_est_results

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Model Object
rf_clf_default = RandomForestClassifier() # Default Model

# Fit the model
rf_clf_default.fit(X_train, y_train)

# Make Predicitons and Compute metrics
predictions_4 = rf_clf_default.predict(X_test)
prediction_prob4 = rf_clf_default.predict_proba(X_test)[ : , 1]

# Compute the metrics
rf_clf_default_results = compute_metrics(y_test, predictions_4, prediction_prob4 , model = "Random Forest Default")

In [None]:
rf_clf_default_results

In [None]:
# Hyperparameter Tuninig
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'criterion' : ["gini", "entropy"],
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 500]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the model
grid_search.fit(X_train, y_train)

In [None]:
# Make Predicitons and Compute metrics
predictions_8 = grid_search.predict(X_test)
prediction_prob8 = grid_search.predict_proba(X_test)[ : , 1]

# Compute the metrics
rf_clf_best_results = compute_metrics(y_test, predictions_8, prediction_prob8 , model = "Random Forest Best Estimator")

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Model object
xgb_clf = XGBClassifier()

# Fit the object
xgb_clf.fit(X_train, y_train)

In [None]:
# Make predictions and compute metrics
predictions_5 = xgb_clf.predict(X_test)
prediction_prob5 = xgb_clf.predict_proba(X_test)[ : , 1]

# compute the metrics
xgb_clf_results = compute_metrics(y_test, predictions_5, prediction_prob5, model = "XGB Classifier")
xgb_clf_results

## Adaboost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# default model
adb_clf = AdaBoostClassifier()

# Fit the model
adb_clf.fit(X_train, y_train)

# Make predictions and compute metrics
predictions_5 = adb_clf.predict(X_test)
prediction_prob5 = adb_clf.predict_proba(X_test)[ : ,1]

In [None]:
adb_clf_results = compute_metrics(y_test, predictions_5, prediction_prob5, "AdaBoost Classifier")

In [None]:
adb_clf_results

## SVM

In [None]:
from sklearn.svm import SVC

# Model object
svm_clf = SVC()

# Fit the model
svm_clf.fit(X_train, y_train)

# Make predictions
predictions_6 = svm_clf.predict(X_test)
#prediction_prob6 = svm_clf.predict_proba(X_test)[ : ,1]

In [None]:
# Compute metrics
svm_clf.decision_function(X_test)

## Neural Networks

In [None]:
import tensorflow as tf
import keras
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.models import Sequential


# Build a sequential model
model = Sequential()

# Add layers
model.add(Dense(50, activation='relu', input_shape = (X_train.shape[-1],)))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.50))
model.add(Dense(150, activation='relu'))
model.add(Dense(1, activation = 'sigmoid'))  # outputs probabilities, not logits

# Show the architecture (summary() prints the table itself and returns None)
model.summary()

# Compile the model
# The output layer already applies a sigmoid, so the AUC metric must treat the
# predictions as probabilities; from_logits=True would apply a second sigmoid.
metric = tf.metrics.AUC()
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=[metric])

# Stop once the validation loss has not improved for 10 consecutive epochs
hist = model.fit(X_train, y_train, epochs=100, validation_split=0.2, callbacks=[EarlyStopping(patience=10)])

In [None]:
# make predictions and compute metrics
# Sequential.predict_classes / predict_proba no longer exist in current Keras.
# predict() returns the sigmoid probabilities with shape (n, 1); flatten them and
# threshold at 0.5 to obtain the class labels.
prediction_prob7 = model.predict(X_test).reshape((-1,))
predictions_7 = (prediction_prob7 > 0.5).astype("int32")

# Compute metrics
nn_model_metrics = compute_metrics(y_test, predictions_7, prediction_prob7, "Neural Network")
nn_model_metrics

# Predictions

## Approach 1


*   Logistic Regression
*   Decision Tree
*   Random Forest Classifier
*   XGBoost Classifier
*   AdaBoost Classifier
*   Neural Network







In [None]:
final_testing_data.rename(columns={'Credit_Product_Unknown' : 'Credit_Product_Unkown'}, inplace=True)

In [None]:
# Compare the models
approach_1_result_metrics = pd.concat([log_reg_results, 
                                       dt_clf_default_metrics, 
                                       dt_clf_best_est_results, 
                                       rf_clf_default_results,
                                       xgb_clf_results,
                                       adb_clf_results,
                                       nn_model_metrics]).sort_values(['ROC_AUC_Score'], ascending=False)
approach_1_result_metrics

In [None]:
def make_predictions(model, model_object, data, id_column, path="/content/"):
  """Predict ``Is_Lead`` for ``data`` and write an ID/prediction submission CSV.

  Args:
    model: model name; used in the output file name. The special name
      "NeuralNetwork" marks a model whose ``predict`` returns sigmoid
      probabilities that must be thresholded into class labels.
    model_object: fitted estimator exposing ``predict``.
    data: feature matrix to predict on.
    id_column: sample IDs, aligned row by row with ``data``.
    path: prefix of the output file; the file is written to
      ``path + model + "_submissions.csv"``.

  Returns:
    None; the predictions are written to disk.
  """
  if model != "NeuralNetwork":
    # make predictions
    predictions = model_object.predict(data)
  else:
    # predict() yields probabilities of shape (n, 1): threshold at 0.5 and flatten
    # (replaces Sequential.predict_classes, which no longer exists in current Keras)
    predictions = (model_object.predict(data) > 0.5).astype("int32").reshape((-1,))

  # Pair each prediction with the ID that was passed in (not a module-level global)
  prediction_df = pd.DataFrame({"ID" : id_column, "Is_Lead" : predictions})

  # store into .csv
  dest = path + model + "_" + "submissions.csv"
  prediction_df.to_csv(dest, index=False)

In [None]:
models = {"LogisticRegression" : lg_clf, 
          "DecisionTree" : dt_best_estimator, 
          "RandomForest" : rf_clf_default,
          "XGBoost" : xgb_clf,
          "AdaBoost" : adb_clf,
          "NeuralNetwork" : model
          }

# Iterate over the itemrs and Call the above method
for mod, model_obj in models.items():
  make_predictions(model = mod, model_object=model_obj, data=final_testing_data, id_column = test_ID)

In [None]:
final_testing_data.shape