 # Loan default prediction

In [None]:
# importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder

# Data checking

In [None]:
# Load the labelled training data into a DataFrame and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='../input/loan-prediction-based-on-customer-behavior/Training Data.csv'
)
data.head(n=5)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

The dataset contains no missing values.

In [None]:
# List the column names of the training frame.
data.columns

 # Preprocessing

In [None]:
# Show the distinct values taken by the target column.
data['Risk_Flag'].unique()

In [None]:
def encoding(df):
    """Return a copy of ``df`` with its categorical columns encoded as numbers.

    The caller's frame is never modified.

    * ``Married/Single``, ``House_Ownership`` and ``Car_Ownership`` are
      replaced by fixed integer codes (see ``value_maps`` below). Values that
      are not listed in a mapping are left unchanged.
    * ``Profession``, ``CITY`` and ``STATE`` are ordinal-encoded: the distinct
      values found in the column are sorted and each value is replaced by its
      position (as a float, ``0.0`` first). Missing values stay ``NaN``.

    NOTE: the ordinal codes are derived from the values present in ``df``
    itself, so two frames that do not contain exactly the same set of
    professions/cities/states can receive different codes for the same value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns named above.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    df = df.copy()

    # Fixed integer codes for the low-cardinality columns.
    value_maps = {
        "Married/Single": {"single": 1, "married": 2},
        "House_Ownership": {"rented": 0, "norent_noown": 1, "owned": 2},
        # Bug fix: "no" and "yes" used to share the code 1, which turned the
        # column into a constant and discarded the feature entirely.
        "Car_Ownership": {"no": 0, "yes": 1},
    }
    for column, mapping in value_maps.items():
        # Assign the result back to the frame rather than calling
        # ``replace(..., inplace=True)`` on a column selection, which is
        # chained assignment and does not reliably update ``df``.
        df[column] = df[column].map(
            lambda value, mapping=mapping: mapping.get(value, value)
        )

    # Ordinal-encode the high-cardinality text columns: categories are the
    # sorted distinct values, codes are their positions.
    for column in ("Profession", "CITY", "STATE"):
        codes = df[column].astype("category").cat.codes.astype("float64")
        # Categorical codes use -1 for missing values; keep those as NaN.
        df[column] = codes.where(codes >= 0)

    return df

In [None]:
# Turn the raw training frame into scaled features X and target y
def preprocess_inputs(df):
    """Build the model inputs from the raw training frame.

    Steps: drop the ``Id`` column, encode the categorical columns with
    ``encoding``, separate the ``Risk_Flag`` target from the features, and
    standardise every feature column with a ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data containing ``Id``, ``Risk_Flag`` and the feature
        columns expected by ``encoding``. It is not modified.

    Returns
    -------
    X : pandas.DataFrame
        Standardised features, with the same column names and row index as
        the input.
    y : pandas.Series
        The ``Risk_Flag`` column.
    """
    # ``drop`` returns a new frame, so the caller's data is left untouched.
    df = df.drop('Id', axis=1)

    df = encoding(df)

    # Split into target and features.
    y = df['Risk_Flag'].copy()
    X = df.drop('Risk_Flag', axis=1)

    # NOTE(review): the scaler is fitted on every row passed in. If the
    # result is split into train/test afterwards, statistics of the held-out
    # rows have already influenced the scaling - consider fitting on the
    # training part only.
    scaler = StandardScaler()
    # Keep the original row index so X stays aligned with y even when the
    # input frame does not use the default 0..n-1 index.
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, y
     
        


In [None]:
# Run the training preprocessing to obtain the scaled features X and target y.
X, y = preprocess_inputs(data)

In [None]:
# Display the preprocessed feature frame.
X

# Training

In [None]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)

In [None]:
# Instantiate one estimator of each candidate type (default hyper-parameters)
# and fit every one of them on the training split.
models = [
    estimator_cls()
    for estimator_cls in (
        LogisticRegression,
        DecisionTreeClassifier,
        RandomForestClassifier,
        XGBClassifier,
    )
]

for model in models:
    model.fit(X_train, y_train)
    print('{} trained'.format(model))


In [None]:
# Display names, left-padded so the printed scores line up; the order matches
# the order of ``models``.
model_names = [
    "   Logistic Regression",
    "         Decision Tree",
    "         Random Forest",
    "               XGBoost",
]

# Report each fitted model's accuracy on the held-out split as a percentage.
for model, name in zip(models, model_names):
    accuracy_pct = model.score(X_test, y_test) * 100
    print(name + ": {:.4f}%".format(accuracy_pct))

In [None]:
# Fit a fresh random forest as the final model and report its accuracy on the
# held-out split.
# NOTE: no random_state is set, so this score can differ slightly between runs.
final_model = RandomForestClassifier()
final_model.fit(X_train, y_train)
# Label fixed: the model is a random forest, not a decision tree.
print('Random Forest' + ": {:.4f}%".format(final_model.score(X_test, y_test) * 100))

In [None]:
# Load the unlabelled test data into a DataFrame and preview the first rows.
test = pd.read_csv(
    filepath_or_buffer='../input/loan-prediction-based-on-customer-behavior/Test Data.csv'
)
test.head(n=5)

In [None]:
# Preprocessing of the unlabelled test data
def test_Preprocess(df):
    """Prepare the raw test frame for prediction.

    Steps: drop the ``ID`` column (note the capitalisation differs from the
    ``Id`` column dropped in ``preprocess_inputs``), encode the categorical
    columns with ``encoding``, and standardise every column with a
    ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw test data containing ``ID`` and the feature columns expected by
        ``encoding``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Standardised features, with the same column names and row index as
        the input.
    """
    # ``drop`` returns a new frame, so the caller's data is left untouched.
    test_df = df.drop('ID', axis=1)

    # Encode the categorical columns.
    test_df = encoding(test_df)

    # NOTE(review): a new scaler is fitted on the test rows here, so the
    # features are standardised with the test set's own mean/std rather than
    # the statistics the model was trained with. Reusing the scaler fitted on
    # the training data would keep both on the same scale.
    scaler = StandardScaler()
    # Keep the original row index so rows can still be matched to the input.
    test_df = pd.DataFrame(
        scaler.fit_transform(test_df),
        columns=test_df.columns,
        index=test_df.index,
    )

    return test_df
     
        

In [None]:
# Apply the test preprocessing (drop ID, encode, scale) to the test frame.
test_preprocessed = test_Preprocess(test)

In [None]:
# Display the preprocessed test frame.
test_preprocessed

In [None]:
# Predict a label for every preprocessed test row with the fitted final model.
predictions = final_model.predict(test_preprocessed)
print(predictions)

In [None]:
# Write the predictions to submission.csv. Ids are the 1-based row positions
# of the test frame (its index shifted by one).
output = pd.DataFrame(
    {
        'id': test.index + 1,
        'risk_flag': predictions,
    }
)
output.to_csv('submission.csv', index=False)