In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Reading Credit_Risk dataset
df = pd.read_csv('../input/credit-risk-analysis/Credit_Risk_Analysis/Credit_Risk.csv')
df.head()

In [None]:
# Shape of our dataset
df.shape

In [None]:
df.info()

In [None]:
# Some statistics of dataset
df.describe().T

In [None]:
# Count of duplicated values
df.duplicated().sum()

In [None]:
# Count of missing values
df.isnull().sum()

In [None]:
# Fill the missing values of each column.
# Categorical features (and the binary Credit_History flag) are filled with their mode (most frequent class).
# Numerical features are filled with their median, because they are not normally distributed.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)` is chained assignment
# on a column selection, which is deprecated and leaves `df` unchanged under pandas copy-on-write.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    df[col] = df[col].fillna(df[col].value_counts().index[0])
for col in ['LoanAmount', 'Loan_Amount_Term']:
    df[col] = df[col].fillna(df[col].median())

In [None]:
df.isnull().sum()


### Visualization

In [None]:
# Class counts of every categorical variable, split by Loan_Status.
# Columns are passed by name through `data=`/`x=`/`hue=`: seaborn >= 0.12 treats the first
# positional argument as `data`, so passing a Series positionally no longer means `x`.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Credit_History', 'Property_Area']
fig, ax = plt.subplots(3, 3, figsize = (15,15))
panels = ax.ravel()  # row-major order, same panel positions as before
for axis, col in zip(panels, categorical_cols):
    sns.countplot(data = df, x = col, hue = 'Loan_Status', ax = axis)
# Only 7 of the 9 panels are used; hide the empty ones.
for axis in panels[len(categorical_cols):]:
    axis.set_visible(False)
plt.show()

In [None]:
# Class counts of Loan_Status. Y (loan granted) is almost twice as frequent as N (loan not granted).
# The column is passed by name: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(data = df, x = 'Loan_Status');

In [None]:
# The figures below show the distributions of ApplicantIncome, CoapplicantIncome, LoanAmount and Loan_Amount_Term, respectively.
# None of these features is normally distributed.
sns.displot(data = df, x = 'ApplicantIncome', kde = True, color = 'skyblue');

In [None]:
sns.displot(data = df, x = 'CoapplicantIncome', kde = True, color = 'olive');

In [None]:
sns.displot(data = df, x = 'LoanAmount', kde = True, color = 'gold');

In [None]:
sns.displot(data = df, x = 'Loan_Amount_Term', kde = True, color = 'teal');

### Outlier detection and treatment with the box-plot (IQR) method

In [None]:
# For ApplicantIncome
# Pass the column as `x=`: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x = df['ApplicantIncome']);

In [None]:
# Quartiles and inter-quartile range of ApplicantIncome (inputs to the box-plot fences).
applicant_income = df['ApplicantIncome']
Q1, Q3 = applicant_income.quantile(0.25), applicant_income.quantile(0.75)
IQR = Q3 - Q1
for label, value in (("Q1 ", Q1), ("Q3 ", Q3), ("IQR ", IQR)):
    print(label, value)

In [None]:
# Lower and upper fences: 1.5 * IQR beyond the first and third quartile.
fence = 1.5 * IQR
down, up = Q1 - fence, Q3 + fence
print("Down: ", down)
print("Up: ", up)

In [None]:
# Boolean mask of the rows whose ApplicantIncome lies above the upper fence.
outliers = df['ApplicantIncome'].gt(up)
outliers

In [None]:
df[outliers].index

In [None]:
# Cap every ApplicantIncome value above the upper fence at the fence itself.
df.loc[outliers, 'ApplicantIncome'] = up

In [None]:
# For CoapplicantIncome
# Pass the column as `x=`: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x = df['CoapplicantIncome']);

In [None]:
# Quartiles and inter-quartile range of CoapplicantIncome.
coapplicant_income = df['CoapplicantIncome']
Q1_CI, Q3_CI = coapplicant_income.quantile(0.25), coapplicant_income.quantile(0.75)
IQR_CI = Q3_CI - Q1_CI
for label, value in (("Q1_CI ", Q1_CI), ("Q3_CI ", Q3_CI), ("IQR ", IQR_CI)):
    print(label, value)

In [None]:
# Lower and upper fences for CoapplicantIncome: 1.5 * IQR beyond each quartile.
fence_CI = 1.5 * IQR_CI
down_CI, up_CI = Q1_CI - fence_CI, Q3_CI + fence_CI
print("Down: ", down_CI)
print("Up: ", up_CI)

In [None]:
# Boolean mask of the rows whose CoapplicantIncome lies above the upper fence.
outliers_CI = df['CoapplicantIncome'].gt(up_CI)
outliers_CI

In [None]:
df[outliers_CI].index

In [None]:
# Cap every CoapplicantIncome value above the upper fence at the fence itself.
df.loc[outliers_CI, 'CoapplicantIncome'] = up_CI

In [None]:
# For LoanAmount
# Pass the column as `x=`: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x = df['LoanAmount']);

In [None]:
# Quartiles and inter-quartile range of LoanAmount.
loan_amount = df['LoanAmount']
Q1_LA, Q3_LA = loan_amount.quantile(0.25), loan_amount.quantile(0.75)
IQR_LA = Q3_LA - Q1_LA
for label, value in (("Q1_LA ", Q1_LA), ("Q3_LA ", Q3_LA), ("IQR_LA ", IQR_LA)):
    print(label, value)

In [None]:
# Lower and upper fences for LoanAmount: 1.5 * IQR beyond each quartile.
fence_LA = 1.5 * IQR_LA
down_LA, up_LA = Q1_LA - fence_LA, Q3_LA + fence_LA
print("Down: ", down_LA)
print("Up: ", up_LA)

In [None]:
# Boolean mask of the rows whose LoanAmount lies above the upper fence.
outliers_LA = df['LoanAmount'].gt(up_LA)
outliers_LA

In [None]:
df[outliers_LA].index

In [None]:
# Cap every LoanAmount value above the upper fence at the fence itself.
df.loc[outliers_LA, 'LoanAmount'] = up_LA

In [None]:
# For Loan_Amount_Term
# Pass the column as `x=`: seaborn >= 0.12 treats the first positional argument as `data`.
sns.boxplot(x = df['Loan_Amount_Term']);

In [None]:
# Quartiles and inter-quartile range of Loan_Amount_Term.
loan_term = df['Loan_Amount_Term']
Q1_LAT, Q3_LAT = loan_term.quantile(0.25), loan_term.quantile(0.75)
IQR_LAT = Q3_LAT - Q1_LAT
for label, value in (("Q1_LAT ", Q1_LAT), ("Q3_LAT ", Q3_LAT), ("IQR_LAT ", IQR_LAT)):
    print(label, value)

In [None]:
# Lower and upper fences for Loan_Amount_Term: 1.5 * IQR beyond each quartile.
fence_LAT = 1.5 * IQR_LAT
down_LAT, up_LAT = Q1_LAT - fence_LAT, Q3_LAT + fence_LAT
print("Down: ", down_LAT)
print("Up: ", up_LAT)

In [None]:
# Boolean mask of the rows whose Loan_Amount_Term falls outside the fences on either side.
term = df['Loan_Amount_Term']
outliers_LAT = term.lt(down_LAT) | term.gt(up_LAT)
outliers_LAT

In [None]:
df[outliers_LAT].index

In [None]:
# Cap Loan_Amount_Term on both sides of the fences: values below `down_LAT` are raised to
# `down_LAT` and values above `up_LAT` are lowered to `up_LAT`. Previously the low outliers
# were also overwritten with the upper fence.
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].clip(lower = down_LAT, upper = up_LAT)

### One-Hot Encoding

In [None]:
# One-hot encode the categorical feature columns.
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
df = pd.get_dummies(df, columns = categorical_features)

In [None]:
df.head()

In [None]:
# After encoding, drop the indicator columns that carry the same information as their pair.
redundant_dummies = ['Gender_Female', 'Married_No', 'Education_Not Graduate', 'Self_Employed_No']
df = df.drop(columns = redundant_dummies)

In [None]:
# Credit_History is stored as float (1.0 / 0.0); recode it to 'Yes' / 'No' labels
# so that it can be one-hot encoded like the other categorical columns.
df['Credit_History'] = df['Credit_History'].replace({1.0: 'Yes', 0.0: 'No'})

In [None]:
# One-hot encode Credit_History and keep only the 'Yes' indicator (the 'No' column is dropped).
df = pd.get_dummies(df, columns = ['Credit_History']).drop(columns = 'Credit_History_No')

In [None]:
# Prepare the target 'Loan_Status': Y is replaced with 1 and N with 0.
# The explicit cast keeps the column integer-typed. Relying on `replace` to downcast an
# object column is deprecated in pandas, and an object-dtype target is rejected by scikit-learn.
# Any label other than Y/N makes the cast fail loudly instead of leaking into the target.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 1, 'N': 0}).astype(int)

In [None]:
df.reset_index(inplace = True)
df.head()

In [None]:
# Drop the 'index' column and Loan_ID: Loan_ID is only an identifier of the loan and is not informative for the models.
df = df.drop(columns = ['index', 'Loan_ID'])
df.head()

In [None]:
df.head()

In [None]:
# This heatmap shows the correlation between the features. Loan_Amount_Term shows no correlation with the other features
# because all of its values became 360 after the missing-value filling and outlier treatment steps.

plt.figure(figsize = (15,15))
sns.heatmap(df.corr(), annot = True, cbar = True, vmin = -1, vmax= 1);

In [None]:
df.head()

In [None]:
# Define the independent features (X) and the dependent feature (target, y).
# y is selected as a 1-D Series: scikit-learn estimators expect a 1-D target and emit a
# DataConversionWarning when given a single-column DataFrame.
X = df.drop('Loan_Status', axis = 1)
y = df['Loan_Status']
print("X shape: ", X.shape)
print("y shape: ", y.shape)

In [None]:
# StandardScaler rescales every feature so that its mean is 0 and its standard deviation is 1.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the train/test split,
# so the rows that later form the test set contribute to the scaling statistics.
sc = StandardScaler()
scaled_X = sc.fit_transform(X)
scaled_X

In [None]:
# Train/test split. A test size of 0.30 is common, but this dataset is small,
# so only 20% of the rows are held out for testing.
# The raw features are split first and the scaler is fitted on the training rows only.
# Fitting it on all rows before splitting (as `scaled_X` was) leaks test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of y_test: ", y_test.shape)

## Prediction Step

In [None]:
# Estimator classes (not instances) to benchmark; each one is instantiated
# without arguments, i.e. with its default hyperparameters, by MakePrediction.
models = [ LogisticRegression,
          GradientBoostingClassifier,
          RandomForestClassifier,
          DecisionTreeClassifier,
          KNeighborsClassifier,
          SVC,
          LGBMClassifier]

In [None]:
def MakePrediction(algorithm):
    """Fit a default-configured estimator and print its train and test scores.

    Parameters
    ----------
    algorithm : class
        Estimator class (not an instance). It is instantiated without arguments,
        fitted on the notebook-level ``X_train`` / ``y_train`` and scored with
        ``.score`` on both the training and the test split.

    Returns
    -------
    None
        The scores are printed, followed by a separator line.
    """
    fitted = algorithm().fit(X_train, y_train)
    splits = ((" Train Score: ", X_train, y_train),
              (" Test Score: ", X_test, y_test))
    for split_label, features, target in splits:
        print(algorithm.__name__, split_label, fitted.score(features, target))
    print("_______________________________________________________________")

In [None]:

for i in models:
    MakePrediction(i)

### Tuning of Models (Hyperparameter tuning)

### LogisticRegression

In [None]:
lr_model = LogisticRegression().fit(X_train, y_train)

In [None]:
cross_val_score(lr_model, X_train, y_train, cv = 5).mean()

In [None]:
# Score the model fitted on the training split against the held-out test split.
# Cross-validating on X_test would refit fresh models on folds of the test set: it never
# evaluates `lr_model` itself and is not comparable to the `.score(X_test, y_test)` of the other models.
val_lr = lr_model.score(X_test, y_test)
val_lr

### Gradient Boosting

In [None]:
# Hyperparameter grid for Gradient Boosting: 4 x 3 x 3 x 3 = 108 combinations.
gbm_model = GradientBoostingClassifier()
gbm_params = {"learning_rate": [0.1, 0.01, 0.001, 0.05],
              "n_estimators": [100,500,1000],
              "max_depth": [3,5,10],
              "min_samples_split": [2, 5, 10]}

# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores (n_jobs = -1).
gmb_cv_model = GridSearchCV(gbm_model, gbm_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
gmb_cv_model.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of hard-coded literals,
# so it cannot disagree with `gmb_cv_model.best_params_` when the search is re-run.
tuned_gbm = GradientBoostingClassifier(**gmb_cv_model.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", tuned_gbm.score(X_train, y_train))
print("Test Score: ", tuned_gbm.score(X_test, y_test))

### Random Forest 

In [None]:
# Hyperparameter grid for Random Forest: 4 x 3 x 3 x 3 = 108 combinations.
rf_model = RandomForestClassifier()
rf_params = {"max_depth": [2, 5, 8, 10],
             "n_estimators": [100, 500, 1000],
             "max_features": [2,5,8],
             "min_samples_split": [2,5,10]}

# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores (n_jobs = -1).
rf_cv_model = GridSearchCV(rf_model, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
rf_cv_model.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of hard-coded literals,
# so it cannot disagree with `rf_cv_model.best_params_` when the search is re-run.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", rf_tuned.score(X_train, y_train))
print("Test Score: ", rf_tuned.score(X_test, y_test))

### DecisionTree

In [None]:
# Hyperparameter grid for the Decision Tree: 9 x 48 x 4 = 1728 combinations.
dt_model = DecisionTreeClassifier()
dt_params = {"max_depth": list(range(1,10)),
             "min_samples_split": list(range(2,50)),
             "max_features": [2, 5, 7, 10]}

# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores (n_jobs = -1).
dt_cv_model = GridSearchCV(dt_model, dt_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
dt_cv_model.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of hard-coded literals,
# so it cannot disagree with `dt_cv_model.best_params_` when the search is re-run.
tuned_dt = DecisionTreeClassifier(**dt_cv_model.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", tuned_dt.score(X_train, y_train))
print("Test Score: ", tuned_dt.score(X_test, y_test))

### KNeighbors

In [None]:
# Candidate neighbourhood sizes for KNN: n_neighbors = 1 .. 49.
knn_model = KNeighborsClassifier()
knn_params = {"n_neighbors": np.arange(1,50)}

# Exhaustive search with 5-fold cross-validation on the training split.
knn_cv = GridSearchCV(knn_model, knn_params, cv = 5).fit(X_train, y_train)

In [None]:
knn_cv.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of a hard-coded literal,
# so it cannot disagree with `knn_cv.best_params_` when the search is re-run.
tuned_knn = KNeighborsClassifier(**knn_cv.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", tuned_knn.score(X_train, y_train))
print("Test Score: ", tuned_knn.score(X_test, y_test))

### SVC 

In [None]:
# Hyperparameter grid for the SVM: C = 1 .. 9 combined with two kernels (18 combinations).
svm_model = SVC()
svm_params = {"C": np.arange(1,10), "kernel": ["linear", "rbf"]}
# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores (n_jobs = -1).
svm_cv_model = GridSearchCV(svm_model, svm_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
svm_cv_model.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of hard-coded literals,
# so it cannot disagree with `svm_cv_model.best_params_` when the search is re-run.
tuned_svm = SVC(**svm_cv_model.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", tuned_svm.score(X_train, y_train))
print("Test Score: ", tuned_svm.score(X_test, y_test))

###  LGBMClassifier

In [None]:
# Hyperparameter grid for LightGBM: 5 x 3 x 4 x 5 x 3 = 900 combinations.
lgbm_model =  LGBMClassifier()
lgbm_params = {"n_estimators": [100,300, 500, 1000, 2000],
              "subsample": [0.6, 0.8, 1],
              "max_depth": [3, 4, 5, 6],
              "learning_rate": [0.1,0.001, 0.01, 0.02, 0.05],
              "min_child_samples": [5, 10, 20]}

# Exhaustive search with 5-fold cross-validation on the training split, using all CPU cores (n_jobs = -1).
lgbm_cv = GridSearchCV(lgbm_model, lgbm_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
lgbm_cv.best_params_

In [None]:
# Build the tuned model from the grid-search winner instead of hard-coded literals,
# so it cannot disagree with `lgbm_cv.best_params_` when the search is re-run.
tuned_lgbm = LGBMClassifier(**lgbm_cv.best_params_).fit(X_train, y_train)

In [None]:
print("Train Score: ", tuned_lgbm.score(X_train, y_train))
print("Test Score: ", tuned_lgbm.score(X_test, y_test))

### Comparison of Tuned Models

In [None]:
# Display names and test-set scores of the tuned models, kept in matching order.
# Logistic Regression contributes the previously computed `val_lr`; the others are scored here.
scored_estimators = [("Tuned Gradient Boosting", tuned_gbm),
                     ("Tuned Random Forest", rf_tuned),
                     ("Tuned Decision Tree", tuned_dt),
                     ("Tuned KNN", tuned_knn),
                     ("Tuned SVM", tuned_svm),
                     ("Tuned LightGBM", tuned_lgbm)]

tuned_models = ["Validated Logistic Regression"] + [label for label, _ in scored_estimators]

test_scores = [val_lr] + [estimator.score(X_test, y_test) for _, estimator in scored_estimators]

In [None]:
# Collect the test scores in a table indexed by model name and display it sorted best-first.
# According to the author's run, the most accurate model for this dataset is the Decision Tree.
# Only the displayed copy is sorted; `tuned_results` itself keeps the original order.
tuned_results = pd.DataFrame(test_scores, columns = ["Test Score"], index = tuned_models)
tuned_results.sort_values(by = 'Test Score', ascending = False)

In [None]:
# Bar chart with one bar per tuned model: test score on the x-axis, model name on the y-axis.
sns.barplot(x = tuned_results["Test Score"], y = tuned_results.index);

plt.xlabel("Test Scores of Each Models")
plt.ylabel("Model Names")
plt.title("Performance of Tuned Models ")
plt.show()

In [None]:
# According to the author's run, the best result after hyperparameter tuning belongs to the Decision Tree (accuracy 0.796748).
# The results could probably be improved by expanding the dataset.
# Loan_Status contains comparatively few 'N' (loan not granted) rows, so the models have little data to learn that class from
# and are therefore unlikely to classify 'N' cases with high accuracy.

# Confusion matrix of the tuned Decision Tree on the test split (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, tuned_dt.predict(X_test))
ax = sns.heatmap(cm, annot = True, cbar = False, fmt = 'g');
ax.set_xlabel('Predicted Labels',fontsize = 15)
ax.set_ylabel('True Labels',fontsize = 15)
ax.set_title('Decision Tree Classifier')
plt.show()

In [None]:
print(classification_report(y_test, tuned_dt.predict(X_test)))

In [None]:
# Feature importances of the tuned Decision Tree, labelled with the column names of X and sorted in descending order.
# According to the author's run, the most informative feature is Credit_History_Yes.
importances = pd.Series(tuned_dt.feature_importances_,
                            index = X.columns).sort_values(ascending = False)

sns.barplot(x = importances, y = importances.index)
plt.xlabel("Feature Importance Values")
plt.ylabel("Features")
plt.title("Feature Importances")
plt.show()