# Reading the input files

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the training set (`data`) and the competition test set (`t`) from the
# Kaggle input directory.
data = pd.read_csv("/kaggle/input/loan-default-prediction/train_v2.csv.zip")
t = pd.read_csv("/kaggle/input/loan-default-prediction/test_v2.csv.zip")
# (rows, columns) of the training frame.
data.shape

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Print a concise summary of the training frame (index, dtypes, memory usage).
data.info()

In [None]:
# Show only the columns pandas parsed as `object` (i.e. not as numbers).
data.select_dtypes(include=['object']).head()

These columns were parsed as `object` (non-numeric) dtype, which suggests they contain invalid values, so we drop them.

In [None]:
# Columns of the training frame that pandas parsed as `object` (non-numeric).
# They are removed from both frames so the two keep the same feature columns.
invalid = data.select_dtypes(include=['object']).columns
data = data.drop(columns=invalid)
t = t.drop(columns=invalid)

# Keep the ids of the competition test rows before removing the column.
# The call parentheses matter: `t['id'].copy` without them stores the bound
# method object rather than a copy of the id values.
t_id = t['id'].copy()
t = t.drop(columns='id')

# Exploratory Data Analysis

## Describe the numeric columns

In [None]:
# Summary statistics for the remaining columns of the training frame.
data.describe()

In [None]:
# Same summary for the competition test frame, for comparison with the training data.
t.describe()

## Missing values

In [None]:
# Per-column NaN counts, restricted to the columns that actually have gaps.
null_counts = data.isnull().sum()
missing = null_counts[null_counts != 0].to_frame(name='No. of missing values')
# Express each count as a percentage of the number of (non-null) ids.
missing['Percentage'] = 100*missing['No. of missing values']/data.id.count()
# Worst-affected columns first.
missing.sort_values(by="Percentage", ascending=False)

## Correlations

In [None]:
# Pairwise correlations over positional columns 1..751 (column 0 is skipped).
# NOTE(review): the train/test split cell later uses columns 1:751 as features and
# column 751 as the target, so this slice appears to be features + target —
# confirm against data.columns.
correlations = data.iloc[:,1:752].corr()
correlations.head()

As we can see in the above output, many features are very highly correlated with one another. This is the motivation for performing Principal Component Analysis (PCA) in a later step to reduce the number of dimensions.

# Train test split
Before we move on to data transformation and model building, it is necessary to split the data into training and test sets.

In [None]:
# Positional split: columns 1..750 become the feature matrix, column 751 the target.
# Both are copies, so later edits to x or y do not touch `data`.
# NOTE(review): presumably column 0 is `id` and column 751 is `loss` — confirm
# against data.columns, since these indices depend on which columns were dropped.
x = data.iloc[:,1:751].copy()
y = data.iloc[:,751].copy()
# Distribution of the raw target values.
y.value_counts()

We first convert y to binary

In [None]:
# Collapse the target to a flag: every positive value becomes 1, all other
# values are left as they are.
y.loc[y > 0] = 1
# Class balance after the conversion.
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed; stratifying on y keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=0,
)

In [None]:
# Sanity check: shapes of the four pieces produced by the split.
[X_train.shape, X_test.shape, y_train.shape, y_test.shape]

# Missing value treatment
Since the percentage of missing values is small, we impute them by the mean of the column.

In [None]:
# Column means are computed once, from the raw (not yet imputed) training
# split, and reused for all three frames. This keeps hold-out and competition
# data out of the statistics and avoids recomputing the mean of a wide frame
# three times.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
t = t.fillna(train_means)
# Remaining NaN count per frame; each is expected to be 0.
[X_train.isnull().sum().sum(), X_test.isnull().sum().sum(), t.isnull().sum().sum()]

# Standardization of Variables
PCA is affected by scale, so we need to scale the features before applying it. We transform the data to unit scale (mean = 0 and variance = 1), which is also a requirement for the optimal performance of many machine learning algorithms. StandardScaler standardizes the dataset’s features.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling from the training split only, then apply that
# same transform to the training, hold-out and competition test frames.
scalar = StandardScaler().fit(X_train)
X_train, X_test, X_t = (scalar.transform(frame) for frame in (X_train, X_test, t))

# Principal Component Analysis
* Given a collection of points in two, three, or higher dimensional space, a "best fitting" line can be defined as one that minimizes the average squared distance from a point to the line. The next best-fitting line can be similarly chosen from directions perpendicular to the first. Repeating this process yields an orthogonal basis in which different individual dimensions of the data are uncorrelated. These basis vectors are called principal components, and the procedure that finds them is called principal component analysis (PCA).
* PCA is a method used to reduce the number of variables in the data by extracting the important ones from a large pool. It reduces the dimension of the data while aiming to retain as much information as possible. In other words, this method combines highly correlated variables to form a smaller set of artificial variables, called “principal components”, that account for most of the variance in the data.

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA without limiting the number of components, then plot how the
# explained variance accumulates as components are added.
pca = PCA().fit(X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
# Cumulative variance explained by the first 200 components. Entry k-1 of the
# cumulative sum is the total for k components, so 200 components sit at index
# 199 (index 200 would report the total for 201 components).
np.cumsum(pca.explained_variance_ratio_)[199]

About 98.27% of the variation is explained by 200 components.

In [None]:
# Keep 200 principal components. The projection is fitted on the training
# split and then reused, unchanged, for the hold-out and competition test data.
final_pca = PCA(n_components=200).fit(X_train)
X_train = pd.DataFrame(final_pca.transform(X_train))
X_test = pd.DataFrame(final_pca.transform(X_test))
X_t = pd.DataFrame(final_pca.transform(X_t))

Now we can use these variables to fit the model with 200 independent variables to predict loss.

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver= 'saga', class_weight='balanced',max_iter=500, random_state=1).fit(X_train, y_train)
model.coef_[0]

# Validation on test data

In [None]:
# Hard class predictions for the hold-out split; shown as the cell output.
y_pred = model.predict(X_test)
y_pred

# Model Evaluation
## Confusion Matrix

In [None]:
import sklearn.metrics as sm
c = pd.DataFrame(sm.confusion_matrix(y_test, y_pred), index=['Actual non defaulter','Actual defaulter'])
c.columns = ['Predicted non defaulter','Predicted defaulter']
c['Actual Total'] = c.sum(axis=1)
c.loc['Predicted Total',:] = c.sum(axis = 0)
c

## Accuracy

In [None]:
# Print the message itself. Wrapping it in a list printed the list repr
# (square brackets and quotes) instead of the plain sentence.
print("The accuracy on the validation data is " + str(round(sm.accuracy_score(y_test, y_pred)*100, ndigits=2)) + "%")

## Sensitivity

In [None]:
# Sensitivity = true positives / all actual positives, both read from the
# 'Actual defaulter' row of the confusion-matrix table `c`.
true_positives = c.iloc[1, 1]
actual_positives = c.iloc[1, 2]
sensitivity_pct = round(100*true_positives/actual_positives, ndigits=2)
print("The sensitivity (true positive rate) is " + str(sensitivity_pct) + "%")

## AUC

In [None]:
# "No skill" baseline: the same constant score for every hold-out row.
ns_fpr, ns_tpr, _ = sm.roc_curve(y_test, np.zeros(len(y_test)))
# Model scores: predicted probability of the positive class (second column).
lr_probs = model.predict_proba(X_test)[:, 1]
lr_fpr, lr_tpr, _ = sm.roc_curve(y_test, lr_probs)

# Draw both ROC curves on the same axes, then label and show the figure.
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# ROC AUC has to be computed from continuous scores (here the predicted
# probability of the positive class). Passing hard 0/1 predictions reduces the
# curve to a single operating point, so the number would not match the ROC
# curve plotted from the probabilities.
positive_scores = model.predict_proba(X_test)[:, 1]
print("The Area under ROC curve is " + str(round(100 * sm.roc_auc_score(y_test, positive_scores), ndigits=2)) + "%")

## Classification Report

In [None]:
# Text summary of the per-class metrics on the hold-out split.
print(sm.classification_report(y_test, y_pred))

# Prediction on given test data

In [None]:
# Predicted class for every row of the competition test set.
pred = model.predict(X_t)
# Pass the labels as `x=`. Current seaborn binds the first positional argument
# to `data`, so a bare array passed positionally is not counted per class.
sns.countplot(x=pred);

In [None]:
# Read the sample submission from the same absolute input directory as the
# train/test files, so the cell does not depend on the current working directory.
submission = pd.read_csv("/kaggle/input/loan-default-prediction/sampleSubmission.csv")
# Overwrite the placeholder column with the model's predictions.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the rows of the test file — confirm.
submission['loss'] = pred

In [None]:
# Preview the first rows of the submission before writing it out.
submission.head()

In [None]:
# Write the submission file to the current working directory, without the index column.
submission.to_csv("submit.csv", index=False)