# Predict the loan sanction Amount

Buying a house requires a lot of careful planning. Once you have finalized your budget and the house that you want to buy, you must ensure that you have sufficient funds to pay the seller.

With rising property rates, most people avail home loans to buy their dream houses. The bank only lends up to 80%  of the total amount based on a person's finances (salary, outgoing expenses, existing loans, etc.). You will need to make the rest of the payment yourself after the bank tells you how much they can lend.

## Task

You work for XYZ bank. Predict the loan amount that can be sanctioned to customers who have applied for a home loan using the features provided in the dataset.

## Loading Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

In [None]:
df = pd.read_csv('/kaggle/input/sanction-loan/train.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()*100/len(df)

In [None]:
# Print how many distinct values each column holds
for position, name in enumerate(df.columns):
    distinct = df[name].nunique()
    print(f"{position}. {name.title()}: {distinct}")

We will drop the rows where the loan sanction amount is null, since those rows carry no information about our target variable.

In [None]:
# Keep only the rows that have a value for the target
df = df[df["Loan Sanction Amount (USD)"].notnull()]

In [None]:
df = df.reset_index(drop=True)

In [None]:
# We will drop the unique and identity columns 
df = df.drop(['Customer ID', 'Name'], axis = 1)

In [None]:
var_target = ['Loan Sanction Amount (USD)']

In [None]:
# Columns treated as categorical features
var_categorical = ['Gender', 'Income Stability', 'Profession', 'Type of Employment', 'Location',
                  'Expense Type 1', 'Expense Type 2', 'Dependents', 'No. of Defaults', 'Has Active Credit Card',
                  'Property Type', 'Property Location', 'Co-Applicant']
# Every remaining column except the target is numerical. Built in df column order
# (instead of through set arithmetic) so the list order is the same on every run.
var_numerical = [column for column in df.columns
                 if column not in var_categorical and column not in var_target]

In [None]:
# Function to label the count on top of each bar in graph
def label_values(ax, spacing=5):
    """Annotate every bar of ``ax`` with its height and its share of the total.

    Each label has the form ``"<height>, <percent>"`` (both with two decimals),
    where ``percent`` is the bar height relative to the sum of all bar heights.

    Parameters
    ----------
    ax : object exposing ``patches`` and ``annotate`` (a matplotlib Axes)
        The axes whose bars are labelled.
    spacing : int, default 5
        Vertical offset, in points, between the bar end and its label.

    Returns
    -------
    None
    """
    # Only patches that expose get_height() can be labelled; anything else is skipped.
    # NOTE(review): presumably box plots put non-rectangular patches in ax.patches on
    # some matplotlib versions - confirm against the installed version.
    bars = [rect for rect in ax.patches if hasattr(rect, "get_height")]
    total = sum(rect.get_height() for rect in bars)

    for rect in bars:
        y_value = rect.get_height()
        x_value = rect.get_x() + rect.get_width() / 2

        # Negative bars get their label below the bar end instead of above it
        if y_value < 0:
            space, va = -spacing, 'top'
        else:
            space, va = spacing, 'bottom'

        # A zero total would make the percentage undefined; report 0 instead of failing
        share = y_value / total * 100 if total else 0.0
        label = "{:.2f}, {:.2f}".format(y_value, share)
        ax.annotate(
            label,
            (x_value, y_value),
            xytext=(0, space),
            textcoords="offset points",
            ha='center',
            va=va)

# Exploratory Data Analysis

## Target Analysis

In [None]:
sns.boxplot(x = df["Loan Sanction Amount (USD)"])
plt.show()

In [None]:
len(df[df["Loan Sanction Amount (USD)"] == 0])

In [None]:
# Box plot of the sanctioned loan amounts, leaving out the 0 USD entries
nonzero_rows = df["Loan Sanction Amount (USD)"] != 0
target_variable_without_zero = df.loc[nonzero_rows, "Loan Sanction Amount (USD)"]
sns.boxplot(x = target_variable_without_zero)
plt.show()

Many values of the Loan Sanction Amount are zero. That means no loan amount was sanctioned to those customers.

## Categorical Variable

In [None]:
for column in var_categorical:
    plt.figure(figsize=(15, 6))
    print(column.title())
    ax = sns.countplot(x = df[column])
    label_values(ax)
    plt.show()

In [None]:
df["Type of Employment"].value_counts()

In [None]:
for column in var_categorical:
    plt.figure(figsize=(15, 6))
    print(column.title())
    ax = sns.boxplot(x = df[column], y = df["Loan Sanction Amount (USD)"])
    label_values(ax)
    plt.show()

In [None]:
for column in var_categorical:
    plt.figure(figsize=(15, 6))
    print(column.title())
    ax = sns.boxplot(x = df[column], y = target_variable_without_zero)
    label_values(ax)
    plt.show()

## Numerical Variables

In [None]:
# For every numerical column draw four panels: box plot, distribution,
# scatter against the target, and scatter against the non-zero target.
# `i` is the running subplot position in a (len(var_numerical) + 1) x 4 grid; it is
# not reset per column, so each column's panels land one grid row lower than the last.
i = 1
for column in var_numerical:
    print(column.title())
    # Opens a new 16x35 figure for this column
    plt.subplots(figsize=(16, 35))
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.boxplot(y = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.distplot(x = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.scatterplot(y = df["Loan Sanction Amount (USD)"], x = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    # target_variable_without_zero holds only the rows with a non-zero target
    sns.scatterplot(y = target_variable_without_zero, x = df[column])
    i += 1
    plt.show()

In [None]:
# Pairplot between all the variables
sns.pairplot(df[var_numerical + var_target])
plt.show()

In [None]:
# Heatmap linear correlation between numerical variables
plt.figure(figsize=(10, 10))
sns.heatmap(df[var_numerical+var_target].corr(), annot=True)
plt.show()

Looking at the column values and the heatmap, 'Property Age' appears to contain income values rather than an actual property age, so it adds no new information. We will therefore drop the 'Property Age' column.

In [None]:
df = df.drop(["Property Age"], axis = 1)

In [None]:
# Remove the dropped column while keeping the remaining names in their existing order
# (a set round-trip would reorder the list differently from run to run)
var_numerical = [column for column in var_numerical if column != 'Property Age']

We can see from the heatmap that 'Loan Amount Request (USD)' and 'Property Price' have a correlation of 0.95. <br>
We know that the bank only lends up to 80%  of the total amount based on a person's finances (salary, outgoing expenses, existing loans, etc.). <br>
**So we will create a new column which store the minimum of request amount and 80% of property price.**

In [None]:
# Requested amount capped at 80% of the property price, computed for all rows at once.
# Taking the cap only where it is strictly lower than the request gives the same result
# as min(request, cap) row by row, including when either value is NaN.
requested_amount = df["Loan Amount Request (USD)"]
price_cap = 0.80 * df["Property Price"]
loan_to_be_requested = list(np.where(price_cap < requested_amount, price_cap, requested_amount))

In [None]:
df["Loan_To_Be_Requested"] = loan_to_be_requested

In [None]:
len(df[df["Loan_To_Be_Requested"] <0])

In [None]:
len(df[df["Property Price"] <0])

In [None]:
len(df[df["Loan Amount Request (USD)"]<0])

In [None]:
df[df["Loan_To_Be_Requested"]==0]["Loan Sanction Amount (USD)"].head()

In [None]:
len(df[df["Current Loan Expenses (USD)"]<0])

In [None]:
df = df[df["Property Price"] >=0]

In [None]:
df = df[df["Current Loan Expenses (USD)"]>=0]

In [None]:
df = df.reset_index(drop=True)

In [None]:
# Rows were filtered and the index was reset after target_variable_without_zero was
# built, so its index labels no longer match df. Rebuild the non-zero target from the
# current df so the last scatter plot pairs each target with its own row.
nonzero_target = df.loc[df["Loan Sanction Amount (USD)"] != 0, "Loan Sanction Amount (USD)"]

# Four panels per column: box plot, distribution, scatter vs target, scatter vs non-zero target
i = 1
for column in ["Loan_To_Be_Requested", "Current Loan Expenses (USD)"]:
    print(column.title())
    plt.subplots(figsize=(16, 35))
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.boxplot(y = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.distplot(x = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.scatterplot(y = df["Loan Sanction Amount (USD)"], x = df[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.scatterplot(y = nonzero_target, x = df[column])
    i += 1
    plt.show()

In [None]:
df = df.drop(["Loan Amount Request (USD)", "Property Price"], axis = 1)

In [None]:
# Remove the two merged columns while keeping the remaining names in their existing order
# (a set round-trip would reorder the list differently from run to run)
var_numerical = [column for column in var_numerical
                 if column not in ("Loan Amount Request (USD)", "Property Price")]

In [None]:
var_numerical = var_numerical + ["Loan_To_Be_Requested"]

In [None]:
# Four panels for the engineered feature: box plot, distribution,
# scatter vs target, scatter vs non-zero target.
column = "Loan_To_Be_Requested"

# Rebuild the non-zero target from the current df: the index was reset after
# target_variable_without_zero was built, so its labels no longer match df.
nonzero_target = df.loc[df["Loan Sanction Amount (USD)"] != 0, "Loan Sanction Amount (USD)"]

i = 1
plt.subplots(figsize=(16, 35))
plt.subplot(len(var_numerical) + 1, 4, i)
sns.boxplot(y = df[column])
i += 1
plt.subplot(len(var_numerical) + 1, 4, i)
sns.distplot(x = df[column])
i += 1
plt.subplot(len(var_numerical) + 1, 4, i)
sns.scatterplot(y = df["Loan Sanction Amount (USD)"], x = df[column])
i += 1
plt.subplot(len(var_numerical) + 1, 4, i)
# Feature on x and target on y, matching the other scatter plots in this notebook
sns.scatterplot(y = nonzero_target, x = df[column])
i += 1
plt.show()

In [None]:
df["Income (USD)"].sort_values(ascending=False)

In [None]:
df[df["Income (USD)"] > 40000]

In [None]:
sns.boxplot(df[df["Income (USD)"] < 40000]["Income (USD)"])
plt.show()

In [None]:
plt.figure(figsize=(16, 16))
sns.boxplot(x=df["Type of Employment"], y = df[df["Income (USD)"] < 40000]["Income (USD)"])
plt.show()

In [None]:
# Index labels of the first two rows whose income exceeds 100000
high_income_index = df.index[df["Income (USD)"] > 100000]
index0, index1 = high_income_index[0], high_income_index[1]

In [None]:
# Replace each outlier income with the median income of rows that share its employment type.
# Written with .loc because chained indexing (df[col][row] = ...) may assign to a
# temporary copy and leave df unchanged.
for outlier_index in (index0, index1):
    employment = df.loc[outlier_index, "Type of Employment"]
    same_employment = df["Type of Employment"] == employment
    df.loc[outlier_index, "Income (USD)"] = df.loc[same_employment, "Income (USD)"].median()

In [None]:
sns.boxplot(df["Income (USD)"])
plt.show()

# Handling Missing Values

In [None]:
missing_df = pd.DataFrame({
    "Columns": df.columns[df.isnull().sum()>0],
    "Values": df[df.columns[df.isnull().sum()>0]].isnull().sum()/len(df)*100
})
missing_df = missing_df.reset_index(drop=True)
missing_df

In [None]:
df["Type of Employment"] = df["Type of Employment"].fillna("Other")

In [None]:
df["Type of Employment"].value_counts()

In [None]:
df["Dependents"] = df["Dependents"].fillna(df["Dependents"].mode().values[0])

In [None]:
df["Dependents"].value_counts()

In [None]:
df["Gender"].mode().values[0]

In [None]:
df["Gender"] = df["Gender"].fillna(df["Gender"].mode().values[0])

In [None]:
df["Income (USD)"] = df.groupby(by=["Type of Employment"])["Income (USD)"].transform(lambda x: x.fillna(x.median()))

In [None]:
df["Income Stability"] = df["Income Stability"].fillna(df["Income Stability"].mode().values[0])

In [None]:
sns.boxplot(x = df["Current Loan Expenses (USD)"])

Here we can see that some rows have a current loan expense of -999, which is not a valid amount.

In [None]:
len(df[df["Current Loan Expenses (USD)"]== -999]["Current Loan Expenses (USD)"])

In [None]:
# Turn the -999 sentinel into NaN so that it is imputed like any other missing value.
# A single column-level replace avoids chained indexing (df[col][i] = ...), which may
# assign to a temporary copy, and does not depend on the index being 0..n-1.
df["Current Loan Expenses (USD)"] = df["Current Loan Expenses (USD)"].replace(-999, np.nan)

In [None]:
df["Current Loan Expenses (USD)"] = df["Current Loan Expenses (USD)"].fillna(df["Current Loan Expenses (USD)"].median())

In [None]:
sns.distplot(x = df["Credit Score"])
plt.show()

We will impute mean values in credit score null places

In [None]:
df["Credit Score"] = df["Credit Score"].fillna(df["Credit Score"].mean())

In [None]:
df["Has Active Credit Card"].value_counts()

In [None]:
df["Has Active Credit Card"] = df["Has Active Credit Card"].fillna(df["Has Active Credit Card"].mode().values[0])

In [None]:
df["Property Location"].value_counts()

In [None]:
df["Property Location"] = df["Property Location"].fillna(df["Property Location"].mode().values[0])

In [None]:
df["Co-Applicant"].value_counts()

In [None]:
# Map the -999 sentinel to 1. A single column-level replace avoids chained indexing
# (df[col][i] = ...), which may assign to a temporary copy and leave df unchanged.
df["Co-Applicant"] = df["Co-Applicant"].replace(-999, 1)

In [None]:
df.isnull().sum()/len(df)*100

In [None]:
len(df)

In [None]:
df[var_categorical].nunique()

# One hot multiclass encoding

For the columns 'Type of Employment', 'Dependents' and 'Profession', we will use one-hot encoding for multiclass variables. <br/>
Following the winning solution of the KDD Cup 2009, we limit the number of categories in each of these 3 variables to its most frequent labels (the top 12, 8 and 4 labels respectively).

In [None]:
def top_labels(df, col, max_col):
    """One-hot encode the ``max_col`` most frequent labels of ``df[col]`` in place.

    For every selected label a 0/1 column named ``"<col>_<label>"`` is added to
    ``df``. The selected labels and the new column names are printed.

    Returns the list of selected labels, most frequent first.
    """
    frequencies = df[col].value_counts().sort_values(ascending=False)
    top = list(frequencies.head(max_col).index)
    print("Top Labels: ", top)
    for label in top:
        indicator_name = col + "_" + str(label)
        df[indicator_name] = np.where(df[col] == label, 1, 0)
        print(indicator_name)
    return top

In [None]:
df["Type of Employment"].value_counts()

We will take top 12 columns for Type of Employment.

In [None]:
top_labels_emp = top_labels(df, 'Type of Employment', 12)
df = df.drop(['Type of Employment'], axis = 1)

In [None]:
df["Dependents"].value_counts()

Here we will take only top 8.

In [None]:
top_labels_dep = top_labels(df, 'Dependents', 8)
df = df.drop(['Dependents'], axis = 1)

In [None]:
df["Profession"].value_counts()

In [None]:
top_labels_prof = top_labels(df, 'Profession', 4)
df = df.drop(['Profession'], axis = 1)

In [None]:
# These three columns were already encoded above. Filter them out while keeping the
# remaining names in their existing order: a set round-trip would reorder the list
# differently from run to run, and with it the order of the dummy columns built next.
already_encoded = {'Type of Employment', 'Dependents', 'Profession'}
var_categorical = [column for column in var_categorical if column not in already_encoded]

For the Profession column we keep only the top 4 labels.

In [None]:
# Make dummy variables for the nominal columns
df = pd.get_dummies(df, columns=var_categorical, drop_first=True)

In [None]:
df.head()

# Split the dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Rows with a strictly positive sanctioned amount.
# NOTE(review): df_regression is not referenced by any later cell in this notebook;
# the train/test split below is taken from df - confirm which one is intended.
df_regression = df[df["Loan Sanction Amount (USD)"]>0]

In [None]:
# We will divide the training and testing set in 70% and 30% respectively
# We used random_state = 100 so that everytime we run it we will have same set of training and testing set
df_train, df_test = train_test_split(df, train_size = 0.7, random_state = 100)

In [None]:
print("Train Dataset: "+ str(len(df_train)) + " Test Dataset: " + str(len(df_test)))

# Dividing into X and Y sets for the model building

In [None]:
# Divide the train data into X and y

y_train = df_train.pop('Loan Sanction Amount (USD)')
X_train = df_train

In [None]:
# Divide the test data into X and y

y_test = df_test.pop('Loan Sanction Amount (USD)')
X_test = df_test

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Scaling the numerical variables

In [None]:
min_max_scaler = ['Age', 'Current Loan Expenses (USD)']
robust_scaler = ['Loan_To_Be_Requested','Income (USD)']
standard_scaler = ['Property ID', 'Credit Score']

In [None]:
# Feature scaling
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [None]:
minMaxScaler = MinMaxScaler()
robustScaler = RobustScaler()
standardScaler = StandardScaler()

In [None]:
X_train[min_max_scaler] = minMaxScaler.fit_transform(X_train[min_max_scaler])
X_test[min_max_scaler] = minMaxScaler.transform(X_test[min_max_scaler])

In [None]:
X_train[robust_scaler] = robustScaler.fit_transform(X_train[robust_scaler])
X_test[robust_scaler] = robustScaler.transform(X_test[robust_scaler])

In [None]:
X_train[standard_scaler] = standardScaler.fit_transform(X_train[standard_scaler])
X_test[standard_scaler] = standardScaler.transform(X_test[standard_scaler])

In [None]:
X_train.head()

# Model Building

# 1. Classification

We will run classification to identify whether the loan sanction amount will be zero or not.

In [None]:
# Binary target: 1 when some amount was sanctioned, 0 otherwise
y_train_c = (y_train > 0).astype("int64")
y_test_c = (y_test > 0).astype("int64")

In [None]:
y_train_c.value_counts()*100/len(y_train_c)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [None]:
# Model evaluation function
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
# Grid Search CV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# K Fold cross validation
from sklearn.model_selection import KFold

In [None]:
# Draw ROC curve from training and test data probability
def draw_roc( train_actual, train_probs, test_actual, test_probs ):
    """Plot the train and test ROC curves on one figure.

    Parameters
    ----------
    train_actual, test_actual : array-like
        True binary labels for the train and test sets.
    train_probs, test_probs : array-like
        Scores for the positive class on the train and test sets.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # The thresholds returned by roc_curve are not needed for the plot
    train_fpr, train_tpr, _ = roc_curve( train_actual, train_probs,
                                         drop_intermediate = False )
    test_fpr, test_tpr, _ = roc_curve( test_actual, test_probs,
                                       drop_intermediate = False )
    train_auc_score = roc_auc_score( train_actual, train_probs )
    test_auc_score = roc_auc_score( test_actual, test_probs )

    plt.figure(figsize=(5, 5))
    # Distinct labels so the legend tells the two curves apart
    plt.plot( train_fpr, train_tpr, label='Train ROC curve (area = %0.2f)' % train_auc_score )
    plt.plot( test_fpr, test_tpr, label='Test ROC curve (area = %0.2f)' % test_auc_score )
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return None

# 1.1  Logistic Regression

In [None]:
logisticRegression = LogisticRegression()

In [None]:
logisticRegression.fit(X_train, y_train_c)

In [None]:
y_pred_train_c = logisticRegression.predict(X_train)

In [None]:
y_pred_test_c = logisticRegression.predict(X_test)

In [None]:
print("Accuracy train: ", accuracy_score(y_train_c, y_pred_train_c))
print("Accuracy test: ", accuracy_score(y_test_c, y_pred_test_c))

In [None]:
# ROC AUC needs a continuous score, so use the predicted probability of class 1;
# hard 0/1 predictions collapse the curve to a single operating point.
print("ROC AUC train: ", roc_auc_score(y_train_c, logisticRegression.predict_proba(X_train)[:, 1]))
print("ROC AUC test: ", roc_auc_score(y_test_c, logisticRegression.predict_proba(X_test)[:, 1]))

In [None]:
draw_roc(y_train_c, logisticRegression.predict_proba(X_train)[:, 1], y_test_c, logisticRegression.predict_proba(X_test)[:, 1])

# 1.2 Train Random Forest Classifier model with hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create the param grid for random forest
param_grid_rf = [{
               'max_depth': [5, 6, 7, 8, 9],
               'max_features': [10, 15, 20, 25, 30]}]
print(param_grid_rf)

In [None]:
rf_model = RandomForestClassifier()

In [None]:
folds = KFold(n_splits = 5, shuffle = True, random_state=100)

In [None]:
grid_rf = GridSearchCV(estimator = rf_model, scoring='roc_auc', param_grid = param_grid_rf, cv = folds, 
                           verbose=0, return_train_score=True, n_jobs=3)
grid_rf.fit(X_train, y_train_c)

In [None]:
grid_rf.best_params_

In [None]:
cv_results = pd.DataFrame(grid_rf.cv_results_)
cv_results[["param_max_depth","param_max_features","mean_train_score","mean_test_score"]]

In [None]:
rf_model = RandomForestClassifier(max_depth = grid_rf.best_params_["max_depth"], 
                                  max_features= grid_rf.best_params_["max_features"])

In [None]:
rf_model.fit(X_train, y_train_c)

In [None]:
y_train_pred_rf_c = rf_model.predict(X_train)
y_test_pred_rf_c = rf_model.predict(X_test)

In [None]:
print("Accuracy train: ", accuracy_score(y_train_c, y_train_pred_rf_c))
print("Accuracy test: ", accuracy_score(y_test_c, y_test_pred_rf_c))

In [None]:
# ROC AUC needs a continuous score, so use the predicted probability of class 1;
# hard 0/1 predictions collapse the curve to a single operating point.
print("ROC AUC train: ", roc_auc_score(y_train_c, rf_model.predict_proba(X_train)[:, 1]))
print("ROC AUC test: ", roc_auc_score(y_test_c, rf_model.predict_proba(X_test)[:, 1]))

In [None]:
act_prob_pred = pd.DataFrame({
    "Actual": y_train_c,
    "Prediction Prob 1" : rf_model.predict_proba(X_train)[:,1],
    "Prediction Prob 2" : rf_model.predict_proba(X_train)[:,0],
    "Prediction": y_train_pred_rf_c
    })

In [None]:
act_prob_pred[act_prob_pred["Actual"]!=act_prob_pred["Prediction"]].to_csv("looks_at_threshold.csv")

In [None]:
draw_roc(y_train_c, rf_model.predict_proba(X_train)[:, 1], y_test_c, rf_model.predict_proba(X_test)[:, 1])

In [None]:
print("Confusion Matrix train: \n", confusion_matrix(y_train_c, y_train_pred_rf_c))
print("Confusion Matrix test: \n", confusion_matrix(y_test_c, y_test_pred_rf_c))

# 2. Linear Regression (statsmodels)

Looking at this confusion matrix, we can see false positives: cases where we predict that a loan will be sanctioned although in reality it is not. When we apply regression on top of the classifier, these cases should receive a lower predicted amount, so we can proceed with this random forest model.

We can see that there is linear relationship between some of the columns with the target variable.

In [None]:
import statsmodels.api as sm

In [None]:
X_train_const = sm.add_constant(X_train)

In [None]:
lm = sm.OLS(y_train, X_train_const).fit()  

In [None]:
lm.summary()

There are too many variables with high p-values so we will remove some variables using RFE.

# 2.2 Features selection using RFE

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Running RFE with the output number of the variable equal to 10
lr = LinearRegression()
lr.fit(X_train, y_train)

# Keep the 10 features that recursive feature elimination ranks highest.
# n_features_to_select is passed by keyword: current scikit-learn releases
# reject it as a positional argument.
rfe = RFE(lr, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# Columns with RFE Support as True
col = X_train.columns[rfe.support_]
len(col), col

#### Building model using statsmodel, for the detailed statistics

In [None]:
# Creating X_train_rfe dataframe with RFE selected variables

X_train_rfe = X_train[col]

In [None]:
# Adding a constant to X_train_rfe as statsmodel does not include it. We have to explicitly define it.

X_train_rfe_const = sm.add_constant(X_train_rfe)

In [None]:
# Running the linear model

lm1 = sm.OLS(y_train, X_train_rfe_const).fit()   

In [None]:
lm1.summary()

In [None]:
# Calculate the VIFs for the new model
vif = pd.DataFrame()

X = X_train_rfe
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

# 2.3 Ridge Regression

In [None]:
# Initialisation of ridge linear regression model
ridge_lr = Ridge(random_state = 42)

In [None]:
# Create the param grid for ridge regression: candidate values of the
# regularisation strength alpha to try in the grid search
param_ridge_lr = {
    'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
     0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
     4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]
}
print(param_ridge_lr)

In [None]:
folds = KFold(n_splits = 5, shuffle = True, random_state=100)

In [None]:
grid_ridge = GridSearchCV(estimator = ridge_lr, scoring= 'r2', param_grid = param_ridge_lr, cv = folds, 
                           verbose=0, return_train_score=True, n_jobs=3)
grid_ridge.fit(X_train, y_train)

In [None]:
# Show the selected alpha as is; int() would display every alpha below 1 as 0
grid_ridge.best_estimator_.alpha

In [None]:
pd.DataFrame(grid_ridge.cv_results_)[['param_alpha', 'mean_test_score', 'mean_train_score']]

In [None]:
# Build the ridge model with the alpha selected by the grid search.
# The value is used as is: int() would truncate every alpha below 1 to 0,
# which removes the regularisation altogether.
alpha = grid_ridge.best_estimator_.alpha
ridge = Ridge(alpha=alpha)

In [None]:
ridge = ridge.fit(X_train, y_train)
ridge

In [None]:
# Evaluate the ridge model on both splits
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# metric2 collects [train R2, test R2] for the ridge model
metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print("R2 Train Score: ", r2_train_lr)
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
# Print the test score here (the train score was printed above)
print("R2 Test Score: ", r2_test_lr)
metric2.append(r2_test_lr)

In [None]:
sns.scatterplot(x = y_train, y=y_pred_train)
plt.show()

# 2.4 Lasso Regression

In [None]:
# Initialise the lasso model
lasso_lr = Lasso(random_state = 42)

In [None]:
# Create the param grid for lasso regression: candidate values of the
# regularisation strength alpha to try in the grid search
param_lasso_lr = {
    'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]
}
print(param_lasso_lr)

In [None]:
grid_lasso = GridSearchCV(estimator = lasso_lr, scoring= 'r2', param_grid = param_lasso_lr, cv = folds, 
                           verbose=0, return_train_score=True, n_jobs=3)
grid_lasso.fit(X_train, y_train)

In [None]:
# Show the selected alpha as is; int() would display every alpha below 1 as 0
grid_lasso.best_estimator_.alpha

In [None]:
pd.DataFrame(grid_lasso.cv_results_)[['param_alpha', 'mean_test_score', 'mean_train_score']]

In [None]:
# Build and fit the lasso model with the alpha selected by the grid search.
# The value is used as is: int() would truncate every alpha below 1 to 0,
# which removes the regularisation altogether.
alpha = grid_lasso.best_estimator_.alpha
lasso = Lasso(alpha=alpha)

lasso = lasso.fit(X_train, y_train)
lasso

In [None]:
# Evaluate the lasso model on both splits
y_pred_train = lasso.predict(X_train)
y_pred_test = lasso.predict(X_test)

# metric3 collects [train R2, test R2] for the lasso model
metric3 = []
r2_train_lr = r2_score(y_train, y_pred_train)
print("R2 Train Score: ", r2_train_lr)
metric3.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)
# Print the test score here (the train score was printed above)
print("R2 Test Score: ", r2_test_lr)
metric3.append(r2_test_lr)

In [None]:
sns.scatterplot(x = y_train, y=y_pred_train)
plt.show()

In [None]:
# Empty frame with one row per feature; the coefficient columns are added next.
# (The former `betas.rows = ...` line only set a stray attribute on the frame
# and had no effect on its contents, so it is dropped.)
betas = pd.DataFrame(index=df_train.columns)

In [None]:
betas['Ridge'] = ridge.coef_
betas['Lasso'] = lasso.coef_
betas['Linear Regression'] = lm.params
betas['Linear Regression with RFE'] = lm1.params

In [None]:
pd.set_option('display.max_rows', None)
betas

# Creating Submission File

In [None]:
df_test = pd.read_csv('/kaggle/input/sanction-loan/test.csv')

In [None]:
df_test.head()

In [None]:
df_test.info()

In [None]:
# Treat '?' placeholders as missing values.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df_test.replace('?', np.nan, inplace=True)

In [None]:
df_test.isnull().sum()

In [None]:
for column in var_categorical:
    plt.figure(figsize=(15, 6))
    print(column.title())
    ax = sns.countplot(x = df_test[column])
    label_values(ax)
    plt.show()

In [None]:
# Requested amount capped at 80% of the property price, as done for the training data.
# Both columns are cast to float because 'Property Price' may hold numeric strings.
request_test = df_test["Loan Amount Request (USD)"].astype(float)
price_cap_test = 0.80 * df_test["Property Price"].astype(float)
# A missing price cannot be detected with `== np.nan` (NaN never equals anything).
# Here a NaN cap simply compares False, so those rows keep the requested amount.
loan_to_be_requested_test = list(np.where(price_cap_test < request_test, price_cap_test, request_test))

In [None]:
df_test = df_test.drop(['Loan Amount Request (USD)', 'Property Price', 'Property Age'], axis = 1)

In [None]:
df_test["Loan_To_Be_Requested"] = loan_to_be_requested_test

In [None]:
i = 1
for column in var_numerical:
    print(column.title())
    plt.subplots(figsize=(16, 35))
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.boxplot(y = df_test[column])
    i += 1
    plt.subplot(len(var_numerical) + 1, 4, i)
    sns.distplot(x = df_test[column])
    i += 1
    plt.show()

In [None]:
df_test["Type of Employment"] = df_test["Type of Employment"].fillna("Other")

In [None]:
df_test["Dependents"] = df_test["Dependents"].fillna(0)

In [None]:
df_test["Gender"] = df_test["Gender"].fillna(df_test["Gender"].mode().values[0])

In [None]:
df_test["Income (USD)"] = df_test["Income (USD)"].fillna(df_test["Income (USD)"].median())

In [None]:
df_test["Income Stability"] = df_test["Income Stability"].fillna(df_test["Income Stability"].mode().values[0])

In [None]:
df_test["Current Loan Expenses (USD)"] = df_test["Current Loan Expenses (USD)"].fillna(df_test["Current Loan Expenses (USD)"].median())

We will impute mean values in credit score null places

In [None]:
df_test["Credit Score"] = df_test["Credit Score"].fillna(df_test["Credit Score"].mean())

In [None]:
df_test["Has Active Credit Card"] = df_test["Has Active Credit Card"].fillna(df_test["Has Active Credit Card"].mode().values[0])

In [None]:
df_test["Property Location"] = df_test["Property Location"].fillna(df_test["Property Location"].mode().values[0])

In [None]:
df_test["Co-Applicant"] = df_test["Co-Applicant"].fillna(1)

For the columns 'Type of Employment', 'Dependents' and 'Profession', we will use one-hot encoding for multiclass variables. <br/>
Following the winning solution of the KDD Cup 2009, we limit the number of categories in each of these 3 variables to its most frequent labels, reusing the top labels selected on the training data.

In [None]:
def top_labels(df_test, col, max_col, top):
    """Add 0/1 indicator columns to ``df_test`` for the given ``top`` labels of ``col``.

    One column named ``"<col>_<label>"`` is created in place per label, and the
    labels and new column names are printed. ``max_col`` is accepted but not used.
    """
    print("Top Labels: ", top)
    for label in top:
        indicator_name = col + "_" + str(label)
        df_test[indicator_name] = np.where(df_test[col] == label, 1, 0)
        print(indicator_name)

We will take top 12 columns for Type of Employment.

In [None]:
top_labels(df_test, 'Type of Employment', 12, top_labels_emp)
df_test = df_test.drop(['Type of Employment'], axis = 1)

Here we will take only top 8.

In [None]:
top_labels(df_test, 'Dependents', 8, top_labels_dep)
df_test = df_test.drop(['Dependents'], axis = 1)

In [None]:
top_labels(df_test, 'Profession', 4, top_labels_prof)
df_test = df_test.drop(['Profession'], axis = 1)

For profession column we will take only top 4 columns

In [None]:
# Make dummy variables for the nominal columns.
# Every level is kept (no drop_first): which level counts as "first" depends on the
# levels present in this file, so dropping it here could remove a different level
# than the one dropped for the training data. Later cells select the model's columns
# by name, so the extra dummy columns are harmless.
df_test = pd.get_dummies(df_test, columns=var_categorical, drop_first=False)

# A level seen in training but absent here produces no column; add it as all zeros
# so that selecting the training feature columns does not raise a KeyError.
for feature in X_train.columns:
    if feature not in df_test.columns:
        df_test[feature] = 0

In [None]:
df_test[min_max_scaler] = minMaxScaler.transform(df_test[min_max_scaler])

In [None]:
df_test[robust_scaler] = robustScaler.transform(df_test[robust_scaler])

In [None]:
df_test[standard_scaler] = standardScaler.transform(df_test[standard_scaler])

#### First use classification

In [None]:
class_pred = rf_model.predict(df_test[list(X_train.columns)])

In [None]:
col = list(col)

In [None]:
col

In [None]:
X_df_test = df_test[col]

In [None]:
y_pred_test_sub = lm1.predict(sm.add_constant(X_df_test))

In [None]:
df_test["Loan Sanction Amount (USD)"] = y_pred_test_sub

In [None]:
df_sub = df_test[["Customer ID", "Loan Sanction Amount (USD)"]]

In [None]:
df_sub.head()

In [None]:
# df_sub was sliced out of df_test; work on an independent copy so the
# assignment below is guaranteed to modify df_sub itself.
df_sub = df_sub.copy()
# No loan when the classifier predicts class 0 or the regression output is negative.
# One .loc assignment replaces the row loop, whose chained indexing
# (df[col][i] = ...) may write to a temporary copy.
no_loan = (df_sub["Loan Sanction Amount (USD)"] < 0) | (class_pred == 0)
df_sub.loc[no_loan, "Loan Sanction Amount (USD)"] = 0

In [None]:
df_sub.to_csv("submission.csv", index=False)

In [None]:
df_sub_lasso = pd.DataFrame({
    "Loan Sanction Amount (USD)": lasso.predict(df_test[list(X_train.columns)]),
    "Customer ID" : df_test["Customer ID"]
})

In [None]:
# No loan when the classifier predicts class 0 or the regression output is negative.
# One .loc assignment replaces the row loop, whose chained indexing
# (df[col][i] = ...) may write to a temporary copy.
no_loan = (df_sub_lasso["Loan Sanction Amount (USD)"] < 0) | (class_pred == 0)
df_sub_lasso.loc[no_loan, "Loan Sanction Amount (USD)"] = 0

In [None]:
df_sub_lasso.to_csv("submission_lasso.csv", index=False)

In [None]:
df_sub_ridge = pd.DataFrame({
    "Loan Sanction Amount (USD)": ridge.predict(df_test[list(X_train.columns)]),
    "Customer ID" : df_test["Customer ID"]
})

In [None]:
# No loan when the classifier predicts class 0 or the regression output is negative.
# One .loc assignment replaces the row loop, whose chained indexing
# (df[col][i] = ...) may write to a temporary copy.
no_loan = (df_sub_ridge["Loan Sanction Amount (USD)"] < 0) | (class_pred == 0)
df_sub_ridge.loc[no_loan, "Loan Sanction Amount (USD)"] = 0

In [None]:
df_sub_ridge.to_csv("submission_ridge.csv", index=False)

In [None]:
df_sub_without_rfe = pd.DataFrame({
    "Loan Sanction Amount (USD)": lm.predict(sm.add_constant(df_test[list(X_train.columns)])),
    "Customer ID" : df_test["Customer ID"]
})

In [None]:
# No loan when the classifier predicts class 0 or the regression output is negative.
# One .loc assignment replaces the row loop, whose chained indexing
# (df[col][i] = ...) may write to a temporary copy.
no_loan = (df_sub_without_rfe["Loan Sanction Amount (USD)"] < 0) | (class_pred == 0)
df_sub_without_rfe.loc[no_loan, "Loan Sanction Amount (USD)"] = 0

In [None]:
df_sub_without_rfe.to_csv("submission_without_rfe.csv", index=False)