In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the raw loan-application CSV, then show its shape and first rows.
raw_path = '/kaggle/input/loan-application-data/df1_loan.csv'
df = pd.read_csv(raw_path)

print(df.shape)
df.head()

In [None]:
# Remove the two columns that are not used in the analysis (in place).
df.drop(columns = ['Unnamed: 0', 'Loan_ID'], inplace = True)

# EDA

In [None]:
# Summary statistics of the numeric columns.
df.describe()

## Target Variable

In [None]:
# Name of the label column; reused by every cell below.
target = 'Loan_Status'

In [None]:
# Bar chart of the number of rows per target class.
# Count the target column itself: counting 'LoanAmount' would silently
# leave out every row whose LoanAmount is missing.
df.groupby(target)[target].count().plot.bar()
plt.ylabel('count')
plt.show()

In [None]:
# Share of each target class, as a percentage of all rows.
# Look the classes up by label ('Y' / 'N'): positional access on
# value_counts() depends on which class happens to be more frequent.
class_counts = df[target].value_counts()

print('Y: {}%'.format(class_counts.get('Y', 0) / len(df) * 100))
print('N: {}%'.format(class_counts.get('N', 0) / len(df) * 100))

## Missing values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Columns that contain at least one missing value, in column order,
# followed by the percentage of rows missing for each of them.
missing_counts = df.isnull().sum()
nan_features = [column for column in df.columns if missing_counts[column] > 0]

for feature in nan_features:
    missing_pct = (missing_counts[feature] / len(df)) * 100
    print('{}: {}% values missing'.format(feature, missing_pct))

In [None]:
# For each column with missing values, compare the approval rate of rows
# where the value is missing (1) against rows where it is present (0).
for feature in nan_features:
    data = df.copy()

    # Replace the column by a 0/1 "is missing" flag and encode the target as 0/1.
    is_missing = data[feature].isnull()
    data[feature] = np.where(is_missing, 1, 0)
    is_approved = data[target] == 'Y'
    data[target] = np.where(is_approved, 1, 0)

    ax = data.groupby(feature)[target].mean().plot.bar()
    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    plt.show()

No significant relationship can be observed between the target variable and the missing values. The number of missing values is not large enough to affect the analysis significantly.

In [None]:
# Split the columns with missing values by dtype: object columns are
# treated as categorical, everything else as numeric.
nan_numeric = [column for column in nan_features if df[column].dtype != 'O']
nan_categoric = [column for column in nan_features if df[column].dtype == 'O']

print('Numeric missing:', nan_numeric)
print('Categorical missing:', nan_categoric)

In [None]:
# Work on a copy so the exploratory transformations below leave df untouched.
data = df.copy()

In [None]:
# Impute missing values: most frequent value for categorical columns,
# integer-truncated median for numeric columns.
for feature in nan_categoric:
    # Series.mode() returns a Series (one entry per tied mode). Take its first
    # entry so np.where receives a scalar; passing the whole Series raises a
    # broadcasting error as soon as a column has more than one mode.
    most_frequent = data[feature].mode()[0]
    data[feature] = np.where(data[feature].isnull(), most_frequent, data[feature])

for feature in nan_numeric:
    median_value = int(data[feature].median())
    data[feature] = np.where(data[feature].isnull(), median_value, data[feature])

In [None]:
# Encode the target as 1 for 'Y' and 0 for anything else.
is_approved = data[target] == 'Y'
data[target] = np.where(is_approved, 1, 0)

## Numeric Features

In [None]:
# Combined income of applicant and co-applicant (overwrites any existing 'Total_Income' column).
data['Total_Income'] = data['ApplicantIncome'] + data['CoapplicantIncome']

In [None]:
# Every non-object column except the target counts as a numeric feature.
num_features = [
    column
    for column, dtype in data.dtypes.items()
    if dtype != 'O' and column != target
]

data[num_features].head()

### Discrete Variables

In [None]:
# Numeric features with fewer than 20 distinct values (NaN counted as a
# value) are treated as discrete.
dis_features = [
    column
    for column in num_features
    if data[column].nunique(dropna = False) < 20
]

dis_features

#### Distribution

In [None]:
# Row count per value of each discrete feature.
for feature in dis_features:
    value_counts = data.groupby(feature)[target].count()
    ax = value_counts.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

#### vs Target Variable

In [None]:
# Mean of the 0/1 target (i.e. approval rate) per value of each discrete feature.
for feature in dis_features:
    approval_rate = data.groupby(feature)[target].mean()
    ax = approval_rate.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    plt.show()

#### Observation
Credit history strongly affects the target variable.

### Continuous Features

In [None]:
# Numeric features that were not classified as discrete are continuous.
con_features = [column for column in num_features if column not in dis_features]

con_features

#### Distribution

In [None]:
# Box plot of each continuous feature to expose outliers.
for feature in con_features:
    data.boxplot(column = feature)

    plt.xlabel(feature)
    # The y-axis of a box plot shows the feature's values, not a count.
    plt.ylabel('value')
    plt.show()

In [None]:
# 25-bin histogram of each continuous feature.
for feature in con_features:
    ax = data[feature].hist(bins = 25)

    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

#### Outliers and Transformation

In [None]:
# Percentage of rows above the cap (median + 3 standard deviations)
# for each continuous feature.
for feature in con_features:
    column = data[feature]
    extreme = column.median() + 3 * column.std()
    n_above = int((column > extreme).sum())

    print(feature)
    print('Values to be replaced: {}%'.format((n_above / len(data)) * 100))

In [None]:
# Cap each continuous feature at median + 3 standard deviations, apply a
# square-root transform, then plot the resulting distribution.
# NOTE: this overwrites the columns of `data`, so re-running the cell
# transforms them a second time.
for feature in con_features:
    column = data[feature]
    extreme = column.median() + 3 * column.std()

    above_cap = column > extreme
    data[feature] = np.where(above_cap, extreme, column)
    data[feature] = data[feature] ** 0.5

    ax = data[feature].hist(bins = 25)
    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

## Categorical Features

In [None]:
# Everything that is neither a numeric feature nor the target is categorical.
cat_features = [
    column
    for column in data.columns
    if column != target and column not in num_features
]

data[cat_features].head()

In [None]:
# Number of distinct values (NaN included) in each categorical feature.
for feature in cat_features:
    n_categories = data[feature].nunique(dropna = False)
    print('{}: {} categories'.format(feature, n_categories))

### vs Target variable

In [None]:
# Mean of the 0/1 target (approval rate) per category of each categorical feature.
for feature in cat_features:
    approval_rate = data.groupby(feature)[target].mean()
    ax = approval_rate.plot.bar()

    ax.set_xlabel(feature)
    ax.set_ylabel(target)
    plt.show()

No significant relationship can be determined

# Feature Engineering

In [None]:
# Reload the raw CSV so feature engineering starts from untouched data.
raw_path = '/kaggle/input/loan-application-data/df1_loan.csv'
df = pd.read_csv(raw_path)

print(df.shape)
df.head()

## Dropping unwanted columns

In [None]:
# Remove the two columns that are not used as features (in place).
df.drop(columns = ['Unnamed: 0', 'Loan_ID'], inplace = True)

## Target Variable transformation

In [None]:
# Encode the target as 1 for 'Y' and 0 for anything else.
is_approved = df[target] == 'Y'
df[target] = np.where(is_approved, 1, 0)

## Dealing with missing values

In [None]:
# Impute missing values: most frequent value for categorical columns,
# integer-truncated median for numeric columns.
for feature in nan_categoric:
    # Series.mode() returns a Series (one entry per tied mode). Take its first
    # entry so np.where receives a scalar; passing the whole Series raises a
    # broadcasting error as soon as a column has more than one mode.
    most_frequent = df[feature].mode()[0]
    df[feature] = np.where(df[feature].isnull(), most_frequent, df[feature])

for feature in nan_numeric:
    median_value = int(df[feature].median())
    df[feature] = np.where(df[feature].isnull(), median_value, df[feature])

## Dealing with outliers and transformation

In [None]:
# Combined income of applicant and co-applicant (overwrites any existing 'Total_Income' column).
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']

In [None]:
# Cap each continuous feature at median + 3 standard deviations, then apply
# a square-root transform. The columns of df are overwritten, so re-running
# the cell transforms them a second time.
for feature in con_features:
    column = df[feature]
    extreme = column.median() + 3 * column.std()

    above_cap = column > extreme
    df[feature] = np.where(above_cap, extreme, column)
    df[feature] = df[feature] ** 0.5

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the indicator columns are not perfectly collinear.
dummy_df = pd.get_dummies(df, drop_first = True)
dummy_df.head()

# Feature Selection

In [None]:
# Separate the label column (y) from the feature matrix (X).
y = dummy_df['Loan_Status']
X = dummy_df.drop(columns = ['Loan_Status'])

## Correlation

In [None]:
# Pairwise correlation between the numeric features, drawn as an annotated heatmap.
numeric_part = X[num_features]
cor = numeric_part.corr()

sns.heatmap(data = cor, annot = True, cmap = plt.cm.CMRmap_r)
plt.show()

No significant correlation present

# Models

## Scaling

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Fit the min-max scaler on the training rows only, then apply the same
# scaling to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

## Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the scaled features.
model = LogisticRegression()

model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test set and print the confusion matrix followed by
# the per-class precision / recall / F1 report.
y_pred_lr = model.predict(X_test_scaled)

print(
    confusion_matrix(y_test, y_pred_lr),
    classification_report(y_test, y_pred_lr, digits = 4),
    sep = '\n',
)

In [None]:
# Mean 10-fold cross-validation score of a fresh logistic regression on the
# scaled training set.
model = LogisticRegression()
scores = cross_val_score(estimator = model, X = X_train_scaled, y = y_train, cv = 10)

print(scores.mean())

## KNN

In [None]:
# Choose the number of neighbours (5..11) for KNN.
# Each candidate is scored with 10-fold cross-validation on the training set.
# Selecting k by test-set accuracy would leak the test data into model
# selection and make the test metrics reported afterwards optimistic.
scores = []

for i in range(5, 12):
    model = KNeighborsClassifier(n_neighbors = i)
    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv = 10)
    scores.append(fold_scores.mean())

plt.plot(np.arange(5, 12), scores)
plt.xlabel('No of Neighbors')
plt.ylabel('Score')
plt.show()

# First candidate that reaches the highest mean cross-validation score.
best_neighbors = np.arange(5, 12)[scores.index(max(scores))]
print('Best score = {}\n Neighbors = {}'.format(max(scores), best_neighbors))

In [None]:
# Train KNN with the selected neighbour count, then print the confusion
# matrix and classification report for the scaled test set.
model = KNeighborsClassifier(n_neighbors = best_neighbors).fit(X_train_scaled, y_train)

y_pred_knn = model.predict(X_test_scaled)

print(
    confusion_matrix(y_test, y_pred_knn),
    classification_report(y_test, y_pred_knn, digits = 4),
    sep = '\n',
)

In [None]:
# Mean 10-fold cross-validation score of KNN with the selected neighbour
# count on the scaled training set.
model = KNeighborsClassifier(n_neighbors = best_neighbors)
scores = cross_val_score(estimator = model, X = X_train_scaled, y = y_train, cv = 10)

print(scores.mean())

## SVM

In [None]:
# Support vector classifier with default settings, trained on the scaled features.
model = SVC()

model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test set and print the confusion matrix followed by
# the classification report.
y_pred_svc = model.predict(X_test_scaled)

print(
    confusion_matrix(y_test, y_pred_svc),
    classification_report(y_test, y_pred_svc, digits = 4),
    sep = '\n',
)

In [None]:
# Mean 10-fold cross-validation score of a fresh SVC on the scaled training set.
model = SVC()
scores = cross_val_score(estimator = model, X = X_train_scaled, y = y_train, cv = 10)

print(scores.mean())

In [None]:
# Same arguments and random_state as the earlier split, so the partition is
# identical; the tree-based models below are fitted on these unscaled features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Random Forest

In [None]:
# Random forest trained on the unscaled features.
# random_state is fixed so the fitted forest, and the metrics reported for
# it, are the same on every run of the notebook.
model = RandomForestClassifier(random_state = 0)

model.fit(X_train, y_train)

In [None]:
# Predict on the test set and print the confusion matrix followed by the
# classification report.
y_pred_rf = model.predict(X_test)

print(
    confusion_matrix(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf, digits = 4),
    sep = '\n',
)

In [None]:
# Mean 10-fold cross-validation score of a random forest on the training set.
# random_state is fixed so the score is reproducible between runs.
model = RandomForestClassifier(random_state = 0)

scores = cross_val_score(model, X_train, y_train, cv = 10)
print(np.mean(scores))

## XGBoost

In [None]:
# Gradient-boosted trees trained on the unscaled features.
# NOTE(review): use_label_encoder appears to be deprecated in recent xgboost
# releases — confirm against the installed version.
model = XGBClassifier(use_label_encoder = False)

model.fit(X_train, y_train)

In [None]:
# Predict on the test set and print the confusion matrix followed by the
# classification report.
y_pred_xgb = model.predict(X_test)

print(
    confusion_matrix(y_test, y_pred_xgb),
    classification_report(y_test, y_pred_xgb, digits = 4),
    sep = '\n',
)

In [None]:
# Mean 10-fold cross-validation score of a fresh XGBoost classifier on the
# training set.
model = XGBClassifier(use_label_encoder = False)
scores = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10)

print(scores.mean())