**<div style="background-color:#B0EADC"><center><span style="color:gray;">Default Loans Prediction</span></center></div>**

<center><img src="https://i.gifer.com/8ESB.gif"></center>

# <center><span style="color:#A8AD10;">Calling the libraries and the data</span></center>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Location of the Bondora loan-data CSV, relative to the notebook's working directory.
path= "../input/bondora-peer-to-peer-lending-loan-data/LoanData_Bondora.csv"
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
data= pd.read_csv(path, low_memory=False)

# <center><span style="color:#A8AD10;">Data Exploration and Preprocessing</span></center>

In [None]:
data.shape

*We have 179235 rows of data and 112 columns*

In [None]:
#for i in range(112):
#    print(data.columns[i])

### The features we have :
* ReportAsOfEOD: EOD stands for End of Day. It refers to the end of a trading day in financial markets, the point in time when trading ceases for the day. It's also referred to as end of business, close of business and close of play.

- LoanId

- LoanNumber

- ListedOnUTC: Date when the loan application appeared on Primary Market

- BiddingStartedOn: A bid is an offer made by an investor, trader, or dealer in an effort to buy an asset or to compete for a contract.

- BidsPortfolioManager.

- BidsApi.

- BidsManual.

- UserName.

- NewCreditCustomer.

- LoanApplicationStartedDate.

- LoanDate.

- ContractEndDate.

- FirstPaymentDate.

- MaturityDate_Original: The time between the issue and the maturity date for a particular bond.

- MaturityDate_Last.

- ApplicationSignedHour.

- ApplicationSignedWeekday.

- VerificationType.

- LanguageCode.

- Age.

- DateOfBirth.

- Gender.

- Country.

- AppliedAmount.

- Amount.

- Interest.

- LoanDuration.

- MonthlyPayment.

- County.

- City.

- UseOfLoan.

- Education.

- MaritalStatus.

- NrOfDependants.

- etc...

In [None]:
#for i in data.columns:
#    print(i,data[i].isnull().sum())

### <span style="color:red"> A lot of features are more than 90% null. Let's drop them</span>

In [None]:
# Collect every column whose number of missing values is at least 90% of the
# rows. The row count is read from the frame itself (not hard-coded), so the
# filter stays correct if the dataset is refreshed or sub-sampled.
# NOTE: the list name says "50 percent" but the threshold applied is 90%;
# the name is kept because the following cells reference it.
null_counts = data.isnull().sum()
null_threshold = (90 * len(data)) / 100
list_of_50_percent_null = list(null_counts[null_counts >= null_threshold].index)

In [None]:
list_of_50_percent_null

In [None]:
# Remove the columns collected in list_of_50_percent_null from the frame.
data = data.drop(columns=list_of_50_percent_null)

In [None]:
# Summary statistics with one row per numeric column. Left as the cell's last
# expression so Jupyter renders it as a table instead of truncated plain text.
data.describe().transpose()

In [None]:
data.dtypes

In [None]:
# Column labels of the frame after the sparse columns were dropped.
# NOTE(review): `names` is not referenced again in the cells shown here.
names= data.columns

# <center><span style="color:#A8AD10;"> Exploration for Object and Bool data types</span></center>

In [None]:
# Pull the text (object) and boolean columns out into their own frames ...
cat_data = data.select_dtypes(include='object')
bool_data = data.select_dtypes(include='bool')

# ... and keep only the remaining columns in `data`.
data = data.drop(columns=list(cat_data.columns) + list(bool_data.columns))

In [None]:
cat_data

In [None]:
bool_data

### <span style="color:red"> Fill the object data that have less than 90% null</span>

In [None]:
# Replace every missing value in the object columns with the literal label
# "unknown", so missing entries become a category of their own.
cat_data =cat_data.fillna("unknown")

In [None]:
# Convert the bidding start timestamps to datetime64 and keep them in their
# own series, then remove that column from the categorical features.
# NOTE(review): `date_type` is not referenced again in the cells shown here.
date_type = cat_data['BiddingStartedOn'].astype('datetime64[ns]')
cat_data = cat_data.drop(columns=['BiddingStartedOn'])

In [None]:
# Names of all remaining categorical columns (reused later to label the
# encoded frame).
features_cat_data = cat_data.columns.tolist()

# Subset of categorical features worth visualising.
features_cat_data_viz = [
    "Country",
    "EmploymentDurationCurrentEmployer",
    "Rating",
    "WorseLateCategory",
    "CreditScoreEsMicroL",
]

# One pie chart per selected feature, showing the share of each category.
for feature in features_cat_data_viz:
    category_counts = cat_data[feature].value_counts()
    category_counts.plot.pie(figsize=(6, 6), autopct="%1.2f%%")
    plt.title(feature)
    plt.show()


In [None]:
# Names of all boolean columns (reused later to label the encoded frame).
features_bool_data = bool_data.columns.tolist()

# Subset of boolean features worth visualising.
features_bool_data_viz = [
    "NewCreditCustomer",
    "ActiveScheduleFirstPaymentReached",
    "Restructured",
]

# One bar chart per selected flag, showing how often each value occurs.
for feature in features_bool_data_viz:
    flag_counts = bool_data[feature].value_counts()
    flag_counts.plot.bar(figsize=(5, 5))
    plt.title(feature)
    plt.show()

# <center><span style="color:#A8AD10;"> Preprocessing for Object and Bool data types</span></center>

In [None]:
# Flatten each frame row by row into a single 1-D array so that one label
# encoder can be fitted across all of its columns at once.
cat_array = cat_data.to_numpy().ravel()
bool_array = bool_data.to_numpy().ravel()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Two independent encoders: encoder_1 for the categorical values,
# encoder_2 for the boolean values.
encoder_1, encoder_2 = LabelEncoder(), LabelEncoder()

In [None]:
# Fit each encoder on its flattened array and get back a 1-D array of integer
# codes. Because every column shares one encoder, a given string maps to the
# same code regardless of which column it came from.
cat_enc= encoder_1.fit_transform(cat_array)
bool_enc= encoder_2.fit_transform(bool_array)

In [None]:
# Fold the flat code arrays back into the (rows, columns) layout of the frames
# they were flattened from. The shape is taken from the source frames instead
# of a hard-coded row count, and the original row index is preserved so later
# column-wise concatenation aligns correctly.
# np.asarray makes the cell safe to re-run after cat_enc/bool_enc have already
# been turned into DataFrames.
cat_enc = pd.DataFrame(np.asarray(cat_enc).reshape(cat_data.shape), index=cat_data.index)
bool_enc = pd.DataFrame(np.asarray(bool_enc).reshape(bool_data.shape), index=bool_data.index)

In [None]:
# Put the original column names back on the encoded frames. The assignment is
# positional, so it relies on the reshape having kept the column order.
cat_enc.columns= features_cat_data
bool_enc.columns= features_bool_data

In [None]:
# Place the encoded categorical and boolean features side by side
# (column-wise concatenation).
object_data = pd.concat([cat_enc,bool_enc], axis=1)


In [None]:
object_data

# <center><span style="color:#A8AD10;"> Exploration for Numerical data </span></center>

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
data

### <span style="color:red"> Fill the numerical data that have less than 90% null</span>

In [None]:
# Median-impute every remaining missing value in the numeric frame.
names_num = data.columns
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Fit and transform on the same DataFrame: fitting on a bare array and then
# transforming the frame makes scikit-learn warn about mismatched feature names.
imp_data = imp_median.fit_transform(data)
# Rebuild the frame with its original column labels and row index so that
# later column-wise concatenation aligns on the same rows.
data = pd.DataFrame(imp_data, columns=names_num, index=data.index)

In [None]:
data

In [None]:
#data.hist(figsize=(20,20))

In [None]:
data.describe().transpose()

# <center><span style="color:#A8AD10;"> Preprocessing for Numerical data </span></center>
<center>We will use Z-Score for normalization</center>

In [None]:
def z_score_normalizer(X):
    """Standardise each column of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Numeric data with observations along axis 0.

    Returns
    -------
    Same container type as ``X``
        ``(X - mean) / std`` computed column-wise. Columns whose standard
        deviation is exactly zero (constant columns) come back as all zeros
        instead of NaN.

    Notes
    -----
    The standard deviation is whatever ``X.std(axis=0)`` returns, so pandas
    inputs use the sample estimate (ddof=1) and NumPy inputs the population
    estimate (ddof=0).
    """
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0; dividing by it would produce NaN, and a
    # later dropna() on the combined frame would then discard every row.
    # Dividing by 1 instead leaves such a column at 0 after centring.
    scale = np.where(std == 0, 1.0, std)
    return (X - mean) / scale

In [None]:
# Replace the imputed numeric frame with its column-wise standardised version.
data = z_score_normalizer(data)

# <center><span style="color:#A8AD10;"> Target Selection </span></center>

In [None]:
# Decode the integer-encoded Status column back to its original text labels
# (encoder_1 is the encoder that was fitted on the categorical values).
status_labels = encoder_1.inverse_transform(object_data['Status'])

# Status is the target, so take it out of the feature frame.
object_data = object_data.drop(columns=['Status'])

# Keep the target as a one-column frame and show which labels occur.
Y = pd.DataFrame({'Status': status_labels})
Y.Status.unique()

### We have 3 types in Status
- Late
- Repaid
- Current
##### We have to drop Current 

In [None]:
# Keep only the rows whose Status is something other than 'Current'.
Y = Y[Y['Status'] != 'Current']

In [None]:
# Binary target: 'Late' -> 0, 'Repaid' -> 1.
Y = Y.replace({'Late': 0, 'Repaid': 1})

In [None]:
Y

# <center><span style="color:#A8AD10;"> Combine all the data together </span></center>


In [None]:
# Column-wise concatenation aligns on the row index. Y only contains the rows
# that survived the 'Current' filter, so every other row gets NaN in Status.
all_data = pd.concat([object_data,data,Y], axis=1)

In [None]:
all_data

In [None]:
# Drop every row that contains any NaN. This removes the rows that have no
# Status label, along with any row that has a NaN in a feature column.
all_data= all_data.dropna()

# <center><span style="color:#A8AD10;"> We are ready for Machine Learning </span></center>
<center><img src="https://media.giphy.com/media/c7PcKQlOqZ8Ws/giphy.gif"></center>


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Feature matrix: every column except the target.
X = all_data.drop(columns=["Status"])
# Target vector: the binary Status label.
Y = all_data["Status"]

**80% for training**

In [None]:
# Hold out 20% of the rows (X2/Y2); the remaining 80% form the training set.
# random_state fixes the shuffle so the split is reproducible.
X_train, X2, Y_train, Y2= train_test_split(X, Y, test_size=0.2, random_state=24)

In [None]:
# Split the 20% hold-out in half: validation and test each receive 10% of
# the full dataset.
X_val, X_test, Y_val, Y_test= train_test_split(X2, Y2, test_size=0.5, random_state=4)

In [None]:
# Base estimator for the grid search; random_state is fixed for reproducibility.
model = DecisionTreeClassifier(random_state=4)

In [None]:
# Hyper-parameter grid: 2 criteria x 2 depths x 2 split sizes = 8 candidates.
parameters = {"criterion" : ["gini", "entropy"], 
              'max_depth': [20,21],
              'min_samples_split': [50,51]}


In [None]:
# Exhaustive search over `parameters` with 2-fold cross-validation.
# refit=True retrains the best configuration on the whole training set; a
# string value for `refit` is only meaningful as a scorer name in multi-metric
# searches, so the boolean is used here.
# n_jobs=-1 uses all available cores instead of requesting a fixed number of
# workers that the machine may not have.
grid_search = GridSearchCV(model, parameters, n_jobs=-1, verbose=100, cv=2, refit=True)

In [None]:
# Run the search on the training split only; validation and test stay unseen.
grid_search.fit(X_train, Y_train)

In [None]:
# Show the hyper-parameter combination selected by the search.
print("Best parameters for DT Clasiifier",grid_search.best_params_)

In [None]:
# Hard class predictions on the validation set (kept for inspection).
preds = grid_search.predict(X_val)
# A ROC curve needs a continuous score, not 0/1 labels: with hard labels the
# curve has a single operating point and the "AUC" is just balanced accuracy.
# Use the predicted probability of the positive class (label 1) instead.
val_scores = grid_search.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_val, val_scores)
print("AUC Score :",metrics.auc(fpr, tpr))

In [None]:
# Plot the validation ROC curve. metrics.plot_roc_curve was removed in
# scikit-learn 1.2, so prefer RocCurveDisplay.from_estimator when it exists
# and fall back to the old helper on older installations.
if hasattr(metrics, "RocCurveDisplay") and hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(grid_search, X_val, Y_val)
else:
    metrics.plot_roc_curve(grid_search, X_val, Y_val)

In [None]:
# Hard class predictions on the test set (kept for inspection).
preds2 = grid_search.predict(X_test)
# ROC AUC must be computed from continuous scores; feeding it 0/1 predictions
# reduces it to balanced accuracy. Use the probability of the positive class.
test_scores = grid_search.predict_proba(X_test)[:, 1]
roc_score2 = metrics.roc_auc_score(Y_test, test_scores)
print("Roc Score :\n",roc_score2)

# <center><span style="color:#A8AD10;"> Roc Score on the Validation set = 99.99% </span></center>
# <center><span style="color:#A8AD10;"> Roc Score on the Test set = 99.99% </span></center>

