# TRAIN DATA

## IMPORTING THE LIBRARIES

In [None]:
print('Importing Essential Libraries...')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
import sklearn
%matplotlib inline

print('Done!!!')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

## LOAD THE DATA

In [None]:
# Read the training CSV into `df` and preview its first rows.
train_csv_path = './Data/train_data.csv'

print('Loading the data.....')
df = pd.read_csv(train_csv_path)
print('Done!!!')

print('The first 5 rows are: ')
df.head()

In [None]:
# Print the column summary, then the row/column counts of the training frame.
df.info()
n_rows, n_cols = df.shape
print('The shape of the dataframe is:- Rows: ', n_rows, ' Columns: ', n_cols)

## Data Visualization..

In [None]:
# How many loans in the data fall into each TARGET class?
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, x='TARGET', ax=ax)

In [None]:
# Number of borrowers per CODE_GENDER value.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='CODE_GENDER', ax=ax)

In [None]:
# Distribution of the TARGET labels, coloured per class with the "Set1" palette.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='TARGET', hue='TARGET', palette="Set1", ax=ax)

In [None]:
# TARGET counts split by CODE_GENDER, to compare the classes per gender.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='TARGET', hue='CODE_GENDER', ax=ax)

In [None]:
# Car ownership (FLAG_OWN_CAR) counts for each CODE_GENDER value.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='CODE_GENDER', hue='FLAG_OWN_CAR', palette="Set1", ax=ax)

In [None]:
# Draw one histogram per low-cardinality column (fewer than 100 distinct values).
print('Drawing Histogram!!!')
for col in df.columns:
    if df[col].nunique() >= 100:
        continue  # too many distinct values for a readable histogram

    fig, ax = plt.subplots(1, 1, figsize=(15, 6))
    # No `palette` here: without a `hue` variable seaborn has nothing to map it to.
    sns.histplot(df[col], ax=ax)
    fig.text(0.1, 0.95, f'{col}', fontsize=16, fontweight='bold', fontfamily='serif')
    ax.set_xlabel('value ', fontsize=10)
    ax.set_ylabel('count', fontsize=10)
    ax.tick_params(axis='y', labelsize=13)
    ax.set_frame_on(False)
    # Render each figure as it is built (as the boxplot cells do) so that
    # figures do not pile up until the end of the cell.
    plt.show()

# PRE-PROCESSING

## DROPPING THE COLUMNS WITH MORE THAN 60% NULL VALUES

In [None]:
## Drop the columns that have > 60% NULL values.
# Null statistics are computed once, and the frame is modified only after the
# scan, so the loop never iterates over a frame it is changing.
null_counts = df.isna().sum()
null_percentages = (null_counts / df.shape[0]) * 100

# Names of the columns having more than 60% nulls.
to_drop = [col for col in df.columns if null_percentages[col] > 60.0]
for col in to_drop:
    print(col, '    ', null_counts[col], null_percentages[col])

df.drop(columns=to_drop, inplace=True)


print('The shape of the dataframe is:- Rows: ',df.shape[0],' Columns: ',df.shape[1])

In [None]:
## Drop the id column, then report how many rows are duplicated.
# keep=False marks every member of a duplicate group, so the count includes
# the first occurrence as well. Duplicates are only counted here, not removed.

df.drop(columns=['SK_ID_CURR'], inplace=True)

duplicate_mask = df.duplicated(subset=None, keep=False)
countDuplicateRows = df[duplicate_mask].shape[0]
print('The number of Duplicate Rows present here are: ', countDuplicateRows)

### REPLACE THE ABSURD VALUES BY NAN VALUES

In [None]:
## List the distinct values of every object column to spot placeholder entries.
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    print("Feature name: ", col, " Unique values are: ", df[col].unique())

## Turn the placeholder categories into NaN so they are imputed later on.
placeholder_values = {
    'CODE_GENDER': ['XNA'],
    'NAME_TYPE_SUITE': ['Other_A', 'Other_B'],
    'NAME_FAMILY_STATUS': ['Unknown'],
    'ORGANIZATION_TYPE': ['XNA'],
}
for col, placeholders in placeholder_values.items():
    df[col] = df[col].replace(placeholders, np.nan)

In [None]:
## Report every column that still contains NULLs: name, dtype and null count.
for col in df.columns:
    n_missing = df[col].isna().sum()
    if n_missing > 0:
        print(col, df[col].dtype, n_missing)

In [None]:
## Fill NULLs in the categorical (object) columns with the column's most frequent value.

for col in df.columns:
    is_categorical = df[col].dtype == 'object'
    if not (is_categorical and df[col].isna().sum() > 0):
        continue
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)


In [None]:
## Fill NULLs in the non-categorical columns with the column median
## (median rather than mean, as the columns may contain outliers).
# The median is used as-is. Truncating it with int() would impute 0 into any
# feature whose values lie between 0 and 1, and would raise on a column that
# is entirely NaN (its median is NaN).

for col in df.columns:
    if df[col].dtype != 'object':
        df[col] = df[col].fillna(df[col].median())


### CHANGING THE -VE VALUES TO +VE FOR BETTER READABILITY

In [None]:
## Replace the DAYS_* columns with their absolute values.
days_columns = ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH')

for col in days_columns:
    df[col] = df[col].abs()

### DIVIDE INTO X AND Y

In [None]:
## Separate the target column (y) from the feature columns (x).
y = df['TARGET']
x = df.drop(columns=['TARGET'])

## Checking for the Outliers

In [None]:
# Plot a boxplot of every numeric feature to check for outliers.
# Object (string) columns are skipped: they are not label-encoded at this
# point and a boxplot cannot be computed from strings.
print('Before Removing Outliers')
numeric_cols = x.select_dtypes(include='number').columns
for col in numeric_cols:
    fig = plt.figure(figsize=(7, 4))
    plt.boxplot(x[col])
    plt.suptitle(col)
    plt.show()
count = len(numeric_cols)

print('Total Boxplots printed are: ', count)


### CAP THE OUTLIERS AT THE LOWER AND UPPER IQR FENCES (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

In [None]:
## Cap outliers with the IQR rule: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
## are replaced by the nearest fence. For each numeric column one line is printed:
## name, Q1, Q3, IQR, both fences, and the outlier counts (below, above, total)
## before and after capping.
# NOTE(review): when Q1 == Q3 the IQR is 0 and both fences equal Q1, so every
# value differing from Q1 is overwritten and the column becomes constant.
# Confirm this is intended for flag-like columns.

for i in x.columns:
    if x[i].dtype == 'object':
        continue

    Q1 = x[i].quantile(0.25)
    Q3 = x[i].quantile(0.75)
    IQR = Q3 - Q1
    ll = Q1 - (IQR * 1.5)
    ul = Q3 + (IQR * 1.5)

    # Outlier counts before capping.
    below_before = int((x[i] < ll).sum())
    above_before = int((x[i] > ul).sum())

    # Clamp to the fences; the upper check takes precedence, as in a nested where.
    capped_low = np.where(x[i] < ll, ll, x[i])
    x[i] = np.where(x[i] > ul, ul, capped_low)

    # Outlier counts after capping.
    below_after = int((x[i] < ll).sum())
    above_after = int((x[i] > ul).sum())

    print(i, Q1, Q3, IQR, ll, ul,
          below_before, above_before, below_before + above_before,
          below_after, above_after, below_after + above_after)

In [None]:
# Plot a boxplot of every numeric feature after the outliers were capped.
# Object (string) columns are skipped: they are not label-encoded at this
# point and a boxplot cannot be computed from strings.
print('After Removing Outliers')
numeric_cols = x.select_dtypes(include='number').columns
for col in numeric_cols:
    fig = plt.figure(figsize=(7, 4))
    plt.boxplot(x[col])
    plt.suptitle(col)
    plt.show()
count = len(numeric_cols)

print('Total Boxplots printed are: ', count)

### LABEL ENCODING THE DATA

In [None]:
## Label-encode every object column of x in place.
# A single LabelEncoder instance is re-fitted for each column, so after the
# loop it only remembers the classes of the last encoded column.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

print('Applying Label Encoding....')
object_columns = [col for col in x.columns if x[col].dtype == 'object']
for col in object_columns:
    x[col] = label_encoder.fit_transform(x[col])
print('Done!!!')

### OVERSAMPLING USING THE ADASYN TO CREATE SYNTHETIC DATA

In [None]:
## Oversample the minority class with ADASYN.
# sampling_strategy=0.75 requests a minority/majority ratio of 0.75 after
# resampling. It is passed by keyword because recent imbalanced-learn releases
# no longer accept it positionally.
# NOTE(review): resampling happens before the train/test split, so the held-out
# set will contain synthetic samples. Consider splitting first and resampling
# only the training part.
from collections import Counter
import imblearn
from imblearn.over_sampling import KMeansSMOTE,SMOTE,ADASYN,SVMSMOTE
adasyn = ADASYN(sampling_strategy=0.75, random_state=30)
X_res, Y_res = adasyn.fit_resample(x, y)


# str.format is called on the template; the original passed the template and
# the formatted counter as two separate print arguments, leaving a literal "{}".
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(Y_res)))

print('Shape before sampling', x.shape, y.shape)
print('Shape after sampling', X_res.shape, Y_res.shape)


### TRAIN TEST SPLIT THE ENTIRE DATASET

In [None]:
## Split the resampled data: 70% for training, the rest held out for evaluation.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X_res,
    Y_res,
    train_size=0.7,
    shuffle=True,
    random_state=30,
)
X_Train, X_Test, Y_Train, Y_Test = split_parts

print("Training Data shape:  ", X_Train.shape, Y_Train.shape)
print("Testing  Data shape:  ", X_Test.shape, Y_Test.shape)

### FEATURE SELECTION USING RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


# Select features by importance using a Random Forest. The forest is seeded
# with the same random_state as the rest of the notebook, so the selected
# feature set (which is reused for the test data) is reproducible.
sel = SelectFromModel(
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=30)
)
sel.fit(X_Train, Y_Train)


# Alternative selector based on Extra Trees:
# sel = SelectFromModel(ExtraTreesClassifier(n_estimators= 100,criterion= 'gini'))
# sel.fit(X_Train,Y_Train)


selected_features = X_Train.columns[sel.get_support()]
print("The Number of features selected are: ", len(selected_features))
print("The features selected are: ", selected_features)

# Keep only the selected columns. Selecting into fresh copies avoids in-place
# drops on the frames returned by train_test_split and keeps the column order
# identical in both sets.
X_Train = X_Train.loc[:, selected_features].copy()
X_Test = X_Test.loc[:, selected_features].copy()


### STORE THE VALUES FOR REPRODUCING THE SAME IN TEST DATASET

In [None]:
## Remember the retained column names (as a plain list) for the test dataset.
selected_features = list(X_Train.columns)

### APPLYING SCALING

In [None]:
## Standardise the features with a StandardScaler.
# The scaler is fitted on the training split ONLY and then applied to both
# splits. A second fit on X_Test would discard the training statistics and
# scale both sets with statistics of the held-out data.
from sklearn.preprocessing import StandardScaler

print('Applying Scaling on the training data only for the features...')
scaler = StandardScaler()
X_Train = scaler.fit_transform(X_Train)
X_Test = scaler.transform(X_Test)
print('Done!!')
# Pass this scaled data as input to the models below.

### IMPORTING METRICS FOR COMPARISON

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix,classification_report


# IMPLEMENTING ML CLASSIFICATION MODELS

### LOGISTIC REGRESSION

In [None]:
# Logistic Regression: fit on the training split, evaluate on the held-out split.
from sklearn.linear_model import LogisticRegression
logistic_Regression = LogisticRegression(random_state=30, max_iter=10000)

logistic_Regression.fit(X_Train, Y_Train)
Y_Pred = logistic_Regression.predict(X_Test)
# Probability of the positive class; ROC-AUC is a ranking metric and needs
# scores rather than hard labels.
Y_Score = logistic_Regression.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
lacc = accuracy_score(Y_Test, Y_Pred)
lf1 = f1_score(Y_Test, Y_Pred)
lauc_score = roc_auc_score(Y_Test, Y_Score)
print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',lacc*100,'%')
print('The value of f1_score is: ',lf1*100,'%')
print('The value of Roc AUC Score is: ',lauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## k NEAREST NEIGHBOURS

### FIND THE BEST VALUE OF K HERE.

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# for k in range(1, 100, 5):
#     k = k + 1
#     knn = KNeighborsClassifier(n_neighbors = k).fit(X_Train,Y_Train)
#     acc = knn.score(X_Test,Y_Test)
#     print("Accuracy for k = ",k," is: ",acc)

# Here we are selecting which is the best n value for the KNN algo

### USE THE BEST VALUE TO FIND THE ACCURACY

In [None]:
# k-Nearest Neighbours with k = 2: fit on the training split, evaluate on the held-out split.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2).fit(X_Train, Y_Train)
Y_Pred = knn.predict(X_Test)
# Positive-class score for ROC-AUC (the fraction of neighbours voting for class 1).
Y_Score = knn.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
kacc = accuracy_score(Y_Test, Y_Pred)
kf1 = f1_score(Y_Test, Y_Pred)
kauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',kacc*100,'%')
print('The value of f1_score is: ',kf1*100,'%')
print('The value of Roc AUC Score is: ',kauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## SVM

In [None]:
## Tried this but this took a huge amount of time. Like 6 hours.

# from sklearn.svm import SVC


# svc = SVC()
# svc.fit(X_Train,Y_Train)

# Y_Pred = svc.predict(X_Test)
# acc = accuracy_score(Y_Pred,Y_Test)
# f1 = f1_score(Y_Pred,Y_Test)
# auc_score = roc_auc_score(Y_Pred,Y_Test)


# print('The accuracy of the model on Data is: ')

# print('The accuracy  is: ',acc*100,'%')
# print('The value of f1_score is: ',f1*100,'%')
# print('The value of Roc AUC Score is: ',auc_score*100,'%')

# print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
# print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))

## XG BOOST

### HYPERPARAMETER TUNING USING GRIDSEARCHCV TO GET THE BEST PARAMETERS FOR XG BOOST

In [None]:
# from xgboost import XGBClassifier
# xgb_model = XGBClassifier(random_state = 30)
# search_space = {
#     "n_estimators" :   [100,200],
#     "max_depth" :      [3,6,7],
#     "gamma" :          [0.01,0.1],
#     "learning_rate" :  [0.001,0.01,0.1,1]
# }

# from sklearn.model_selection import GridSearchCV
# GS = GridSearchCV(
#     estimator= xgb_model,
#     param_grid= search_space,
#     scoring= ["roc_auc","roc_auc_ovr","roc_auc_ovo","f1","f1_micro","f1_macro","accuracy"],
#     refit= "roc_auc",
#     cv= 5,
#     verbose= 4
# )

# GS.fit(X_Train,Y_Train)

# print("The best estimator is: ",GS.best_estimator_)
# print("The best parameter is: ",GS.best_params_)
# print("The best AUC_ROC score is: ",GS.best_score_)
# df_XGBoost = pd.DataFrame(GS.cv_results_)
# df_XGBoost = df_XGBoost.sort_values("rank_test_roc_auc")
# df_XGBoost.to_csv('./Test_Output/XGBoost_GridSearchCV.csv')

### APPLYING THE BEST PARAMETERS

In [None]:
# XGBoost with fixed hyper-parameters: fit on the training split, evaluate on
# the held-out split.
# NOTE(review): the argument list looks like a pasted estimator repr; several
# entries (e.g. gpu_id, predictor) may be deprecated in newer xgboost releases
# — confirm against the installed version.
from xgboost import XGBClassifier

model_XGB = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.1, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=7, max_leaves=0, min_child_weight=1,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=30)

model_XGB.fit(X_Train,Y_Train)

Y_Pred = model_XGB.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = model_XGB.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
xacc = accuracy_score(Y_Test, Y_Pred)
xf1 = f1_score(Y_Test, Y_Pred)
xauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',xacc*100,'%')
print('The value of f1_score is: ',xf1*100,'%')
print('The value of Roc AUC Score is: ',xauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## DECISION TREES

In [None]:
# Decision Tree: fit on the training split, evaluate on the held-out split.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=30)
dt.fit(X_Train, Y_Train)
Y_Pred = dt.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = dt.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
dacc = accuracy_score(Y_Test, Y_Pred)
df1 = f1_score(Y_Test, Y_Pred)
dauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',dacc*100,'%')
print('The value of f1_score is: ',df1*100,'%')
print('The value of Roc AUC Score is: ',dauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## RANDOM FOREST

In [None]:
# Random Forest: fit on the training split, evaluate on the held-out split.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=30).fit(X_Train, Y_Train)
Y_Pred = rf.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = rf.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
racc = accuracy_score(Y_Test, Y_Pred)
rf1 = f1_score(Y_Test, Y_Pred)
rauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',racc*100,'%')
print('The value of f1_score is: ',rf1*100,'%')
print('The value of Roc AUC Score is: ',rauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## GAUSSIAN NAIVE BAYES

In [None]:
## Gaussian Naive Bayes: fit on the training split, evaluate on the held-out split.

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_Train, Y_Train)
Y_Pred = model.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = model.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
gnbacc = accuracy_score(Y_Test, Y_Pred)
gnbf1 = f1_score(Y_Test, Y_Pred)
gnbauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',gnbacc*100,'%')
print('The value of f1_score is: ',gnbf1*100,'%')
print('The value of Roc AUC Score is: ',gnbauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## BERNOULLI NAIVE BAYES

In [None]:
## Bernoulli Naive Bayes: fit on the training split, evaluate on the held-out split.

from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_Train, Y_Train)
Y_Pred = model.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = model.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
bnbacc = accuracy_score(Y_Test, Y_Pred)
bnbf1 = f1_score(Y_Test, Y_Pred)
bnbauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',bnbacc*100,'%')
print('The value of f1_score is: ',bnbf1*100,'%')
print('The value of Roc AUC Score is: ',bnbauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))


## LightGBM Classifier

In [None]:
# LightGBM with default parameters: fit on the training split, evaluate on the
# held-out split.
import lightgbm as ltb
model = ltb.LGBMClassifier()
model.fit(X_Train, Y_Train)
Y_Pred = model.predict(X_Test)
# Probability of the positive class; ROC-AUC needs scores rather than hard labels.
Y_Score = model.predict_proba(X_Test)[:, 1]

# scikit-learn metrics take (y_true, y_pred / y_score), in that order.
lgmacc = accuracy_score(Y_Test, Y_Pred)
lgmf1 = f1_score(Y_Test, Y_Pred)
lgmauc_score = roc_auc_score(Y_Test, Y_Score)


print('The accuracy of the model on Data is: ')

print('The accuracy  is: ',lgmacc*100,'%')
print('The value of f1_score is: ',lgmf1*100,'%')
print('The value of Roc AUC Score is: ',lgmauc_score*100,'%')

print("The confusion matrix is: \n\n",confusion_matrix(Y_Test,Y_Pred))
print("The classification report is: \n\n",classification_report(Y_Test,Y_Pred))

## MODEL COMPARISON AND PLOTTING THEM 

In [None]:
# Summarise accuracy, F1 and ROC-AUC of all models in a table, then draw a bar
# chart of the ROC-AUC scores.
from prettytable import PrettyTable

# (table label, accuracy, F1, ROC-AUC) for every model trained above.
model_scores = [
    ('Logistic Regression', lacc, lf1, lauc_score),
    ('KNN', kacc, kf1, kauc_score),
    ('XG Boost', xacc, xf1, xauc_score),
    ('Decision Trees', dacc, df1, dauc_score),
    ('Random Forest', racc, rf1, rauc_score),
    ('Gaussian NB', gnbacc, gnbf1, gnbauc_score),
    ('Bernoulli NB', bnbacc, bnbf1, bnbauc_score),
    ('Light GBM', lgmacc, lgmf1, lgmauc_score),
]

t = PrettyTable(['ALGO','ACCURACY SCORE','F1 SCORE','ROC-AUC SCORE'])
for algo_name, acc_value, f1_value, auc_value in model_scores:
    t.add_row([algo_name, round(acc_value, 2), round(f1_value, 2), round(auc_value, 2)])

print(t)

# The original author concluded from these values that XGBoost performs best.
# Every second tick label starts with a newline so neighbouring labels do not overlap.
x_axis = np.array(["Logistic Regression","\nKNN","XG Boost","\nDecision Trees","Random Forest","\nGaussian NB","Bernoulli NB","\nLight GBM"])
y_axis = np.array([scores[3] for scores in model_scores])
plt.bar(x_axis, y_axis, align='center', width=0.8)
plt.show()

## Hence use XG Boost!!

# TEST DATA

## LOADING THE TEST DATA

In [None]:
## Read the test CSV into `df_test`.
test_csv_path = './Data/test_data.csv'

print('Loading the test data.....')
df_test = pd.read_csv(test_csv_path)
print('Done!!!')

In [None]:
# Inspect the test frame, count duplicated rows and keep the id column for the
# submission file.
df_test.head()
df_test.info()

# keep=False marks every member of a duplicate group.
duplicate_mask = df_test.duplicated(subset=None, keep=False)
countDuplicateRows = df_test[duplicate_mask].shape[0]
print('The number of Duplicate Rows present here are: ', countDuplicateRows)

id_column = df_test['SK_ID_CURR']

# PRE-PROCESSING IN TEST DATA SAME AS THAT OF TRAIN DATA

In [None]:
## Turn the same placeholder categories as in the training data into NaN.
placeholder_values = {
    'CODE_GENDER': ['XNA'],
    'NAME_TYPE_SUITE': ['Other_A', 'Other_B'],
    'NAME_FAMILY_STATUS': ['Unknown'],
    'ORGANIZATION_TYPE': ['XNA'],
}
for col, placeholders in placeholder_values.items():
    df_test[col] = df_test[col].replace(placeholders, np.nan)

In [None]:
## Replace the DAYS_* columns of the test data with their absolute values.
days_columns = ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH')

for col in days_columns:
    df_test[col] = df_test[col].abs()

In [None]:
# Keep only the features selected during training, in the training order.
# Indexing by the stored names (instead of dropping the others one by one)
# guarantees the column order the scaler and the model were fitted with, and
# fails with a KeyError if a selected feature is missing from the test file.
df_test = df_test[list(selected_features)].copy()

print(df_test.shape)


In [None]:
## Fill NULLs in the test data: mode for object columns, median for the others.
# The median is used as-is. Truncating it with int() would impute 0 into any
# feature whose values lie between 0 and 1, and would raise on an all-NaN column.
# NOTE(review): the fill values are computed from the test data itself;
# consider reusing the statistics of the training data instead.
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        if df_test[col].isna().sum() > 0:
            df_test[col] = df_test[col].fillna(df_test[col].mode()[0])
    else:
        df_test[col] = df_test[col].fillna(df_test[col].median())

In [None]:
## Cap outliers in the test data with the IQR rule.
# The quartiles are taken from the training features `x`, so the test data is
# clamped to the fences derived from the training data.

for i in df_test.columns:
    if df_test[i].dtype == 'object':
        continue

    Q1 = x[i].quantile(0.25)
    Q3 = x[i].quantile(0.75)
    IQR = Q3 - Q1
    ll = Q1 - IQR * 1.5
    ul = Q3 + IQR * 1.5

    # Clamp to the fences; the upper check takes precedence, as in a nested where.
    capped_low = np.where(df_test[i] < ll, ll, df_test[i])
    df_test[i] = np.where(df_test[i] > ul, ul, capped_low)

In [None]:
## Label-encode the object columns with the SAME codes as the training data.
# LabelEncoder numbers the classes in sorted order of the values it is fitted
# on, so fitting on the test column could give a category a different code
# than in training. The encoder is therefore fitted on the training column of
# `df` (which still holds the imputed string categories, since only the copy
# `x` was encoded) and merely applied to the test column.


print('Applying Label Encoding....')
for col in df_test.columns:
    if df_test[col].dtype != 'object':
        continue
    label_encoder.fit(df[col])
    # Categories never seen in training cannot be encoded; fall back to the
    # most frequent training category for those rows.
    unseen = ~df_test[col].isin(label_encoder.classes_)
    if unseen.any():
        df_test.loc[unseen, col] = df[col].mode()[0]
    df_test[col] = label_encoder.transform(df_test[col])
print('Done!!!')

In [None]:
## Scale the test features with the scaler fitted in the training section.
# No new StandardScaler is created or fitted here: the model was trained on
# features standardised with the training statistics, so the test features
# must go through the same transformation. Re-fitting on the test data would
# also overwrite the `scaler` object from the training section.
from sklearn.preprocessing import StandardScaler

print('Applying the training-data scaling to the test features...')

df_Test = scaler.transform(df_test)
print('Done!!')
# `df_Test` (capital T) is the scaled array to feed to the model.

## Apply the XG boost algo to get Y_hat

In [None]:
## Apply the chosen model to the test data.
# Predict on `df_Test`, the SCALED test features. `df_test` (lower-case t) is
# the unscaled frame and does not match the scaled inputs the model was trained on.
Y_Pred_final = model_XGB.predict(df_Test)

# Y_Pred_final holds the final predictions that are uploaded to Kaggle.


print(type(Y_Pred_final))
print(type(id_column))


## CREATING CSV FILE FOR THE KAGGLE SUBMISSION

In [None]:
## Create the CSV for the Kaggle submission: the id column plus the predicted TARGET.

# Make sure the output directory exists; to_csv does not create it.
os.makedirs('./Test_Output', exist_ok=True)

df_kaggle = id_column.to_frame()

df_kaggle['TARGET'] = Y_Pred_final.tolist()

# index=False is the documented way to omit the row index from the file.
df_kaggle.to_csv('./Test_Output/Submission_new_XGB.csv', index=False)

In [None]:
                # ---------------------------------------------- THE END ----------------------------------------------#  