In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Data Upload

In [None]:
Data=pd.read_csv('../input/lending-club-loan-data-analysis/loan_data.csv')

In [None]:
Data.head()

In [None]:
Data.info()

In [None]:
Data.isna().sum()

## Data Analysis

In [None]:
# `Data` still holds the text column `purpose`; restrict to numeric columns so the
# correlation matrix can be computed regardless of the pandas default for non-numeric data.
Data.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(13,8))
# Pass the column by keyword: in newer seaborn releases the first positional
# argument is `data`, not `x`.
sns.countplot(x=Data['credit.policy'])
plt.show()

In [None]:
plt.figure(figsize=(13,8))
# Pass the column by keyword: in newer seaborn releases the first positional
# argument is `data`, not `x`.
sns.countplot(x=Data['purpose'])
plt.show()

In [None]:
# One-hot encode `purpose`. The dtype is pinned to an integer type so the indicator
# columns stay numeric (0/1) and mix cleanly with the other numeric features
# when the frame is later converted to an array.
dummy_purpose=pd.get_dummies(Data['purpose'],dtype='uint8')

In [None]:
dummy_purpose.head()

In [None]:
# Assemble the modelling frame: first column of `Data`, then the one-hot
# `purpose` columns, then every column of `Data` from position 2 onward
# (position 1 is skipped).
New_Data = pd.concat(
    [Data.iloc[:, 0], dummy_purpose, Data.iloc[:, 2:]],
    axis=1,
)

In [None]:
New_Data.head()

In [None]:
import statsmodels.api as sm

# Regress the first column of `New_Data` on all remaining columns and print the summary.
# The regressors are cast to float so the design matrix is a plain numeric array
# even if some columns (e.g. the dummy indicators) are stored as bool.
print(sm.OLS(endog=New_Data.iloc[:,0],exog=New_Data.iloc[:,1:].astype(float)).fit().summary())

In [None]:
Data.describe().transpose()

In [None]:
def show_data(data_name, ncols=3):
  """Scatter-plot `credit.policy` against each requested column of `Data`.

  One subplot is drawn per column name, filled row by row. The grid is sized
  to the number of names passed in, so any number of columns can be shown;
  panels left over in the last row are hidden.

  Parameters
  ----------
  data_name : iterable of str
      Column names of the global `Data` frame to put on the x-axis.
  ncols : int, default 3
      Number of subplots per grid row.

  Raises
  ------
  ValueError
      If `ncols` is smaller than 1.
  """
  if ncols < 1:
    raise ValueError('ncols must be at least 1')

  features = list(data_name)
  if not features:
    # Nothing to draw; a grid with zero rows cannot be created.
    return

  # Ceiling division: just enough rows to hold every feature.
  nrows = (len(features) + ncols - 1) // ncols
  # 5 x 3.75 inches per panel gives a 15 x 15 figure for a 4 x 3 grid.
  fig, axs = plt.subplots(nrows=nrows, ncols=ncols,
                          figsize=(5 * ncols, 3.75 * nrows), squeeze=False)
  # squeeze=False always yields a 2-D array, so flattening is safe for any grid size.
  panels = axs.ravel()

  for panel, feature in zip(panels, features):
    panel.scatter(y=Data['credit.policy'], x=Data[f'{feature}'])
    panel.set_ylabel('credit.policy')
    panel.set_xlabel(f'{feature}', color='red')

  # Hide the panels of the last row that received no feature.
  for panel in panels[len(features):]:
    panel.set_visible(False)

  plt.show()

In [None]:
show_data(Data.columns[2:]) 

## Train-Validation-Test

In [None]:
# Features: every column except the target. `drop` already returns a new frame.
x = New_Data.drop(columns=['credit.policy'])
# Target: an independent copy of the `credit.policy` column.
y = New_Data['credit.policy'].copy()

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# Convert to NumPy for the estimators. The feature matrix is forced to float so it is
# a plain numeric array even when some columns (e.g. dummy indicators) are not
# stored as numbers; a mixed frame would otherwise come out with dtype=object.
x_array=x.to_numpy(dtype=float)
y_array=y.to_numpy()

In [None]:
x.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 5% of all rows as the validation split.
x_train, x_val, y_train, y_val = train_test_split(
    x_array, y_array, test_size=0.05, random_state=42
)
# Step 2: hold out 20% of what remains as the test split; the rest is used for training.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.20, random_state=42
)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_val.shape

In [None]:
y_val.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

## Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

obje_ss=StandardScaler()

# Fit the scaler on the training split only, then apply those same statistics to the
# validation and test splits. Re-fitting on each split would scale every split
# differently and leak information from the held-out data.
x_train_ss=obje_ss.fit_transform(x_train)
x_val_ss=obje_ss.transform(x_val)
x_test_ss=obje_ss.transform(x_test)

In [None]:
x_train

In [None]:
x_train_ss

## Models

In [None]:
from sklearn.metrics import r2_score,classification_report,accuracy_score,confusion_matrix

### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train on the unscaled training features, then predict the validation split.
model_log = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
model_log.fit(x_train, y_train)
y_pred = model_log.predict(x_val)
print(model_log)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the validation split, so it is labelled as such.
# Scoring through the model itself keeps this cell independent of the shared `y_pred`
# variable, which every model cell overwrites.
print('Train success rate: %',model_log.score(x_train,y_train)*100)
print('Validation success rate: %',model_log.score(x_val,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

### SVC

In [None]:
from sklearn.svm import SVC

# Train on the standardized training features, then predict the standardized validation split.
model_svc = SVC(C=1, kernel='rbf', degree=3, random_state=42)
model_svc.fit(x_train_ss, y_train)
y_pred = model_svc.predict(x_val_ss)
print(model_svc)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the (standardized) validation split, so it is labelled
# as such. Scoring through the model itself keeps this cell independent of the shared
# `y_pred` variable, which every model cell overwrites.
print('Train success rate: %',model_svc.score(x_train_ss,y_train)*100)
print('Validation success rate: %',model_svc.score(x_val_ss,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train on the unscaled training features, then predict the validation split.
model_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
model_knn.fit(x_train, y_train)
y_pred = model_knn.predict(x_val)
print(model_knn)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the validation split, so it is labelled as such.
# Scoring through the model itself keeps this cell independent of the shared `y_pred`
# variable, which every model cell overwrites.
print('Train success rate: %',model_knn.score(x_train,y_train)*100)
print('Validation success rate: %',model_knn.score(x_val,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import  BernoulliNB

# Train on the unscaled training features, then predict the validation split.
model_bnb = BernoulliNB()
model_bnb.fit(x_train, y_train)
y_pred = model_bnb.predict(x_val)
print(model_bnb)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the validation split, so it is labelled as such.
# Scoring through the model itself keeps this cell independent of the shared `y_pred`
# variable, which every model cell overwrites.
print('Train success rate: %',model_bnb.score(x_train,y_train)*100)
print('Validation success rate: %',model_bnb.score(x_val,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

### Tree Models

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train on the unscaled training features, then predict the validation split.
model_rfr = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=42,
    n_jobs=-1,
)
model_rfr.fit(x_train, y_train)
y_pred = model_rfr.predict(x_val)
print(model_rfr)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the validation split, so it is labelled as such.
# Scoring through the model itself keeps this cell independent of the shared `y_pred`
# variable, which every model cell overwrites.
print('Train success rate: %',model_rfr.score(x_train,y_train)*100)
print('Validation success rate: %',model_rfr.score(x_val,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

### Boosting

In [None]:
from xgboost import XGBClassifier

# Train on the unscaled training features, then predict the validation split.
model_xgb = XGBClassifier(
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    objective='binary:logistic',
    n_jobs=-1,
    random_state=42,
)
model_xgb.fit(x_train, y_train)
y_pred = model_xgb.predict(x_val)
print(model_xgb)

In [None]:
print(confusion_matrix(y_val,y_pred))

In [None]:
# The second figure is measured on the validation split, so it is labelled as such.
# Scoring through the model itself keeps this cell independent of the shared `y_pred`
# variable, which every model cell overwrites.
print('Train success rate: %',model_xgb.score(x_train,y_train)*100)
print('Validation success rate: %',model_xgb.score(x_val,y_val)*100)

In [None]:
print(classification_report(y_val,y_pred))

## Test


In [None]:
def test_score(model_name):

  for i in model_name:
    print(f'{i.__class__} \n{classification_report(y_test,i.predict(x_test))}')


In [None]:
model_names=[model_log,model_knn,model_bnb,model_rfr,model_xgb]

In [None]:
test_score(model_names)

## Cross validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def cross_score(model_name):
  """Print the mean 5-fold `cross_val_score` result, times 100, for each given model.

  Every model is cross-validated on the global `x_train` / `y_train` with
  `cv=5` and `n_jobs=-1`; its class and the mean score are printed on one line.

  Parameters
  ----------
  model_name : iterable
      Estimators accepted by `cross_val_score`.
  """
  for estimator in model_name:
    fold_scores = cross_val_score(estimator, X=x_train, y=y_train, cv=5, n_jobs=-1)
    mean_pct = fold_scores.mean() * 100
    print(f'{estimator.__class__}  | Cross val score: %{mean_pct}')

In [None]:
cross_score(model_names)