# Importing Libraries

In [None]:
'''IMPORTING Libraries'''
import pandas as pd
import os,time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.combine import SMOTEENN
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold,KFold
from sklearn.metrics import f1_score,ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from category_encoders import TargetEncoder
import imblearn

In [None]:
# Load the raw CSV files into data frames.
# NOTE(review): both frames are read from train.csv. Later cells read
# test['Loan_Status'], so the "test" frame must contain the target column;
# confirm that scoring on the training file is intended.
train=pd.read_csv('../input/analytics-vidhya-loan-prediction/train.csv')
test=pd.read_csv('../input/analytics-vidhya-loan-prediction/train.csv')


# Preview the first rows of each frame (train first, then test).
print(train.head(), test.head(), sep='\n')



In [None]:
'''setting random seed for reproducibility'''
seed=7
def set_seed(seed):
    """Seed NumPy's global RNG and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to ``np.random.seed`` and stored (as a string) in the
        ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    # The `os` module has no PYTHONHASHSEED attribute; the variable lives in
    # os.environ. Python reads it at interpreter start-up, so this affects
    # processes launched afterwards, not the hashing of the running one.
    os.environ['PYTHONHASHSEED'] = str(seed)

# Global plotting defaults: apply the style sheet first, then override the font size.
plt.style.use('Solarize_Light2')
plt.rcParams.update({'font.size': 7})



In [None]:
'''Checking for null values and dtypes'''
# print(train.isnull().sum())
# print(test.isnull().sum())

#info()
# print(train.info())
# print(test.info())



In [None]:
# Impute missing values and encode the target.
# Target variable 'Loan_Status': 'Y' -> 1, 'N' -> 0. The explicit astype(int)
# keeps the column numeric even if replace() leaves it with object dtype.
train['Loan_Status']=train['Loan_Status'].replace({'Y':1,'N':0}).astype(int)

# 1) Missing numerical values.
# Every numeric column except Credit_History (imputed with the mode below)
# and the target itself.
numerical_cols=[col for col in train.select_dtypes(include='number').columns
                if col not in ('Credit_History','Loan_Status')]

# Mean imputation: the statistics are learned on the training frame only and
# then re-used for the test frame, so both frames are filled with the same values.
imp_mn=SimpleImputer(strategy='mean')
train[numerical_cols]=imp_mn.fit_transform(train[numerical_cols])
test[numerical_cols]=imp_mn.transform(test[numerical_cols])

# Most-frequent imputation for Credit_History, again fitted on train only.
# The imputer returns an (n, 1) array; ravel() flattens it to a plain column.
imp_mf=SimpleImputer(strategy='most_frequent')
train['Credit_History']=imp_mf.fit_transform(train[['Credit_History']]).ravel()
test['Credit_History']=imp_mf.transform(test[['Credit_History']]).ravel()



In [None]:
# 2) missing categorical values:
# All categorical imputers below are fitted on the training frame and only
# applied (transform) to the test frame, so both frames get the same fill values.
# 2,1) Ordinal Variables: 'Dependents','Education','Property_Area'
# NOTE(review): the name `ord` shadows the builtin; it is kept because other
# cells may refer to it.
ord=['Dependents','Education','Property_Area']
train[ord]=imp_mf.fit_transform(train[ord])
test[ord]=imp_mf.transform(test[ord])

#encoding Education
# train['Education']=train['Education'].replace({'Graduate':1,'Not Graduate':0})
# test['Education']=test['Education'].replace({'Graduate':1,'Not Graduate':0})

#
# train['Property_Area']=train['Property_Area'].replace({'Urban':3,'Semiurban':2,'Rural':1})
# test['Property_Area']=test['Property_Area'].replace({'Urban':3,'Semiurban':2,'Rural':1})

#2,2)Nominal Variables: 'Gender' ,'Married','Self_Employed'
nom=['Gender' ,'Married','Self_Employed']
train[nom]=imp_mf.fit_transform(train[nom])
test[nom]=imp_mf.transform(test[nom])

# train['Married']=train['Married'].replace({'Yes':1 ,'No':0})
# test['Married']=test['Married'].replace({'Yes':1 ,'No':0})
#
# train['Self_Employed']=train['Self_Employed'].replace({'Yes':1,'No':0})
# test['Self_Employed']=test['Self_Employed'].replace({'Yes':1,'No':0})

# Creating new features.
def _income_to_loan_ratio(df):
    """Return (ApplicantIncome + CoapplicantIncome) / (LoanAmount * 10e+3) for *df*."""
    total_income=df['ApplicantIncome']+df['CoapplicantIncome']
    # 10e+3 equals 10000; the factor is kept from the original training formula.
    # NOTE(review): 1e+3 may have been intended -- confirm the LoanAmount unit.
    return total_income/(df['LoanAmount']*10e+3)

# 'DtIR' is total income divided by the scaled loan amount. Both frames use
# the same formula and their own columns, so the feature is comparable.
train['DtIR']=_income_to_loan_ratio(train)
test['DtIR']=_income_to_loan_ratio(test)



In [None]:
# Drop unused features and split both frames into inputs and targets.
# Re-number the rows of both frames from zero, discarding the old index.
test.reset_index(drop=True,inplace=True)
train.reset_index(drop=True,inplace=True)

# Training side: target vector, then the feature frame without the ID,
# the target and Gender.
y=train['Loan_Status']
X1=train.drop(columns=['Loan_ID','Loan_Status','Gender'])

# Evaluation side: keep the IDs and the labels, then drop the same columns.
X_test_ID=test.Loan_ID
y_true=test['Loan_Status']
X_test=test.drop(columns=['Loan_ID','Gender','Loan_Status'])



In [None]:
# Target encoding of the feature frames.
# The encoder is fitted on the training features X1 together with the target y;
# the fitted encoder is then applied to X_test without refitting.
tr=TargetEncoder()
X=tr.fit_transform(X1,y)
X_test=tr.transform(X_test)

#one hot encoding
# X=pd.get_dummies(X,drop_first=True)
# X_test=pd.get_dummies(X_test,drop_first=True)


In [None]:
# Sanity check: both encoded frames must expose the same features in the same
# order. Row counts are deliberately not compared, because a train and a test
# set generally differ in size.
assert list(X.columns)==list(X_test.columns)

# EDA

In [None]:
# Heatmap of the pairwise correlations between the encoded features and the target.
plt.rcParams['font.size']=10
plt.figure(figsize=(12,6))
# assign() returns a copy of X with the target appended; X itself is untouched.
X_c=X.assign(Loan_Status=y)
correl=X_c.corr()
sns.heatmap(correl)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Check the class balance of the target.
plt.figure(figsize=(14,7))
# The vector is passed by keyword instead of positionally.
sns.countplot(x=y)
plt.title('class_frequency')
plt.show()


In [None]:
def plot_dist(df,col):
    """Plot the density of *col* separately for accepted and declined loans.

    Parameters
    ----------
    df : DataFrame
        Must contain *col* and a 'Loan_Status' column coded 1 (accepted) / 0 (declined).
    col : str
        Name of the numeric column to plot.
    """
    df1=df[df['Loan_Status']==1]
    df0=df[df['Loan_Status']==0]
    plt.figure(figsize=(14,7))
    plt.title(f'{col} distribution')
    # sns.distplot is deprecated; with hist=False it drew only the KDE curve,
    # which kdeplot produces directly.
    sns.kdeplot(x=df1[col],color='g',label='Loan Accepted')
    sns.kdeplot(x=df0[col],color='r',label='Loan Declined')
    plt.legend()
    plt.show()

#ApplicantIncome:
plot_dist(train,'ApplicantIncome')


In [None]:

# #CoapplicantIncome:
plot_dist(train,'CoapplicantIncome')

In [None]:
# #LoanAmount:
plot_dist(train,'LoanAmount')


In [None]:
#Term:
plot_dist(train,'Loan_Amount_Term')


In [None]:
cat_cols=['Gender','Married','Self_Employed','Dependents','Education','Property_Area','Credit_History']

def plot_countplot(df,cols):
    """Draw one count plot per column in *cols*, split by 'Loan_Status'.

    Parameters
    ----------
    df : DataFrame
        Data containing every column in *cols* plus 'Loan_Status'.
    cols : iterable of str
        Columns to plot, laid out three per row.
    """
    cols=list(cols)
    n_grid_cols=3
    # Keep the 3x3 layout for up to nine columns and add rows beyond that,
    # so a longer column list no longer overflows the subplot grid.
    n_grid_rows=max(3,-(-len(cols)//n_grid_cols))
    plt.figure(figsize=(20,20))
    for position,col in enumerate(cols,start=1):
        plt.subplot(n_grid_rows,n_grid_cols,position)
        sns.countplot(x=col,data=df,hue='Loan_Status')
        plt.title(f'{col}',size=10,loc='right')
    plt.tight_layout()
    plt.show()

plot_countplot(train,cat_cols)


In [None]:
def plot_probs(df,cols):
    """Bar-plot the mean of 'Loan_Status' for each category of every column in *cols*.

    Parameters
    ----------
    df : DataFrame
        Data containing every column in *cols* plus a numeric 'Loan_Status'.
    cols : iterable of str
        Columns to group by, laid out three per row.
    """
    cols=list(cols)
    n_grid_cols=3
    # Keep the 3x3 layout for up to nine columns and add rows beyond that,
    # so a longer column list no longer overflows the subplot grid.
    n_grid_rows=max(3,-(-len(cols)//n_grid_cols))
    plt.figure(figsize=(20,15))
    for position,col in enumerate(cols,start=1):
        grp_mean=df.groupby(col)['Loan_Status'].mean()
        plt.subplot(n_grid_rows,n_grid_cols,position)
        plt.bar(grp_mean.index,grp_mean)
        plt.title(f'{col}',size=10,loc='right')
    plt.tight_layout()
    plt.show()

plot_probs(train,cat_cols)




# Training:

In [None]:
# Scaling
# The min/max range is learned from the training features only, so the test
# set does not influence preprocessing. Test values outside the training range
# are mapped outside [0, 1]. Fitting directly on X also avoids fitting on a
# stacked array and then transforming a frame.
scaler=MinMaxScaler()
X=scaler.fit_transform(X)
X_test=scaler.transform(X_test)
print(X.shape,X_test.shape)


In [None]:
# Cross validation on different models.

# Stratified splitter: each of the 7 folds keeps the class proportions of y.
# The variable was already named `skf`, but a plain KFold was constructed.
skf=StratifiedKFold(n_splits=7,random_state=seed,shuffle=True)

#logistic:
log_reg=LogisticRegression(random_state=seed ,C= 0.01, penalty= 'l1',solver='liblinear')

#ensembles:
Random_forest=RandomForestClassifier(n_estimators= 500,min_samples_split=8,
bootstrap= True, max_depth= 80, max_features= 3, min_samples_leaf= 5,random_state=seed)
AdaBoost=AdaBoostClassifier(n_estimators=300,random_state=seed)
GBMC=GradientBoostingClassifier(n_estimators=300,random_state=seed)

#K nearest neighbours:
KNNC=KNeighborsClassifier()


#CROSS VAL SCORE:
# Each model is scored with the estimator's default scorer on the same folds.

models=[log_reg,KNNC,Random_forest,GBMC,AdaBoost]

for model in models:
    score=cross_val_score(model,X,y,cv=skf)
    print(f'Mean Cross val Score with {model} is {score.mean()} +/- {score.std()}')




# HyperParameter Optimization 

In [None]:
'''hyper parameter tuning for random forest '''
# params= {'bootstrap': [True],
#         'max_depth': [80, 90, 100, 110],
#         'max_features': [2, 3],
#         'min_samples_leaf': [3, 4, 5],
#         'min_samples_split': [8, 10, 12],
#         'n_estimators': [300, 600, 1000]}

# grid_search = GridSearchCV(estimator = Random_forest, param_grid = params,
#                           cv = 5, n_jobs = -1, verbose = 2)
#
# grid_search.fit(X,y)
#
# best_params=grid_search.best_params_
#
# print(best_params)

#  Hyperparameter tuning for logistic regression

# params={'penalty':['l1','l2'] , 'C':[1e-3,1e-2,1e-1, 1,1e+1,1e+2,1e+3] }

# grid_search=GridSearchCV(estimator=log_reg,param_grid=params,cv=5,n_jobs=-1,verbose=2)

# grid_search.fit(X,y)

# best_params=grid_search.best_params_
# best_params



# Predictions

In [None]:
'''Predictions'''




# Train the three models on every fold produced by `skf` and average their
# test-set predictions over models and folds.
preds=[]
score=[]

for train_index, test_index in skf.split(X,y):
    X_train, x_test = X[train_index], X[test_index]
    # split() yields positional indices, so the target is sliced with .iloc
    # rather than by label.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    Random_forest.fit(X_train,y_train)
    KNNC.fit(X_train,y_train)
    log_reg.fit(X_train,y_train)
    # Mean held-out score of the three models on this fold.
    score.append((Random_forest.score(x_test,y_test)+log_reg.score(x_test,y_test)+KNNC.score(x_test,y_test))/3)

    #prediction
    rf=Random_forest.predict(X_test)
    lr=log_reg.predict(X_test)
    knn=KNNC.predict(X_test)

    # Element-wise mean of the three hard 0/1 predictions (fraction of "yes" votes).
    mean_pred=(rf+lr+knn)/3
    preds.append(mean_pred)
print(np.mean(score))
preds=np.array(preds)
# Average the per-fold vote fractions for every test row.
mean_preds=np.mean(a=preds,axis=0)

print(mean_preds)



In [None]:
# Label a row 'Y' when its averaged vote is strictly above 0.80, otherwise 'N'.
y_preds=['Y' if vote >.80 else 'N' for vote in mean_preds]


#SUBMISSION:
# Pair every Loan_ID with its predicted label, report the label counts and the
# first ten rows, then write the table to disk without the index column.
sub=pd.DataFrame({'Loan_ID':X_test_ID , 'Loan_Status':y_preds})
print(sub['Loan_Status'].value_counts())
print(sub.head(10))
sub.to_csv('loan_predictions.csv',index=False)


In [None]:
# Convert both label vectors to 0/1 integers. astype(int) guarantees a numeric
# dtype even when replace() leaves the column as object.
label_map={'Y':1,'N':0}
y_preds=sub.Loan_Status.replace(label_map).astype(int)
y_true=y_true.replace(label_map).astype(int)

assert y_preds.shape==y_true.shape

# The metric computed here is F1, not accuracy; f1_score expects (y_true, y_pred).
# NOTE(review): the "test" frame is loaded from train.csv earlier in the
# notebook, so this is not a held-out score -- confirm.
print(f'F1 score on test set is {f1_score(y_true,y_preds)}')

In [None]:
from sklearn.metrics import confusion_matrix


# Confusion matrix of true vs. predicted labels, rendered with the display helper.
cm=confusion_matrix(y_true,y_preds)
disp=ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()