### Importing necessary libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Problem Statement:

Have you ever wondered how lenders use factors such as credit score, annual income, approved loan amount, tenure, and debt-to-income ratio to set your interest rate?

The process, known as ‘risk-based pricing’, uses a sophisticated algorithm that leverages different determining factors of a loan applicant. Selecting the significant factors will help develop a prediction algorithm that can estimate loan interest rates based on clients’ information. On one hand, knowing the factors will help consumers and borrowers increase their creditworthiness and place themselves in a better position to negotiate a lower interest rate. On the other hand, it will help lending companies get an immediate fixed interest rate estimate based on clients’ information. Here, your goal is to use a training dataset to predict the loan rate category (1 / 2 / 3) that will be assigned to each loan in the test set.

### Reading train and test dataset:

In [None]:
train_set=pd.read_csv('../input/banking-modelclassification/train_loan.csv')
train_set

In [None]:
test_set=pd.read_csv('../input/banking-modelclassification/test_loan.csv')

In [None]:
test_set

In [None]:
train_set.shape,test_set.shape

In [None]:
# Label every row with the split it came from so the two frames can be stacked
# and separated again later; the test frame also receives an all-NaN
# Interest_Rate column.
for frame,origin in ((train_set,'train'),(test_set,'test')):
    frame['Data']=origin
test_set['Interest_Rate']=np.nan

In [None]:
combined=pd.concat([train_set,test_set],ignore_index=True,sort=False)

In [None]:
combined

In [None]:
combined.shape

In [None]:
combined.info()

### Checking null values:

In [None]:
# Count missing values per column once, then show only the columns that have any.
null_counts=combined.isnull().sum()
null_counts[null_counts!=0]

### Data Cleaning & Imputing null values:

In [None]:
combined.head()

In [None]:
combined['Loan_Amount_Requested'].value_counts()

In [None]:
combined['Loan_Amount_Requested']=combined['Loan_Amount_Requested'].str.replace(',','')

In [None]:
combined.info()

In [None]:
combined['Loan_Amount_Requested'].unique()

In [None]:
combined['Loan_Amount_Requested']=pd.to_numeric(combined['Loan_Amount_Requested'],errors='coerce')

In [None]:
combined.isnull().sum()[combined.isnull().sum()!=0]

In [None]:
combined.info()

In [None]:
combined.head()

In [None]:
# Remove the '<' marker from the employment-length strings.
# NOTE(review): presumably the raw values look like '< 1 year'; once the marker
# is stripped such rows can no longer be told apart from '1 year' — confirm that
# this conflation is intended.
combined['Length_Employed']=combined['Length_Employed'].str.replace('<','')

In [None]:
combined['Length_Employed'].value_counts()

In [None]:
# Remove the '+' marker as a literal character.
# regex=False is passed explicitly: '+' is a regex quantifier and the default of
# the `regex` argument differs between pandas releases, so without it the
# outcome depends on the installed version.
combined['Length_Employed']=combined['Length_Employed'].str.replace('+','',regex=False)

In [None]:
combined['Length_Employed']=combined['Length_Employed'].str.replace('years','')

In [None]:
combined['Length_Employed'].value_counts()

In [None]:
combined['Length_Employed']=combined['Length_Employed'].str.replace('year','')

In [None]:
combined['Length_Employed'].value_counts()

In [None]:
combined['Length_Employed']=pd.to_numeric(combined['Length_Employed'],errors='coerce')

In [None]:
combined.isnull().sum()[combined.isnull().sum()!=0]

In [None]:
combined.info()

In [None]:
combined['Home_Owner'].value_counts()

In [None]:
combined.head()

In [None]:
combined['Purpose_Of_Loan'].value_counts()

In [None]:
a=combined[combined['Home_Owner'].isnull()]

In [None]:
a

In [None]:
a.loc[a.Purpose_Of_Loan=='home_improvement']

In [None]:
# Impute Home_Owner='Own' for home-improvement loans, but ONLY where the value
# is missing. Without the isnull() mask every home-improvement row would be
# overwritten, discarding the ownership values that were actually recorded.
missing_owner=combined['Home_Owner'].isnull()
combined.loc[missing_owner&(combined.Purpose_Of_Loan=='home_improvement'),'Home_Owner']='Own'

In [None]:
combined.isnull().sum()[combined.isnull().sum()!=0]

In [None]:
# Impute Home_Owner='Rent' for loans whose purpose is 'house', but ONLY where
# the value is missing. Without the isnull() mask every such row would be
# overwritten, discarding the ownership values that were actually recorded.
missing_owner=combined['Home_Owner'].isnull()
combined.loc[missing_owner&(combined.Purpose_Of_Loan=='house'),'Home_Owner']='Rent'

In [None]:
combined.isnull().sum()[combined.isnull().sum()!=0]

In [None]:
combined.head()

In [None]:
pd.DataFrame(combined.groupby('Length_Employed')['Annual_Income'].mean())

In [None]:
# Fill missing employment lengths with the column mean (taken over the combined
# train+test frame). fillna is called directly instead of routing a
# whole-Series operation through Series.transform.
combined['Length_Employed']=combined['Length_Employed'].fillna(combined['Length_Employed'].mean())

In [None]:
combined.Annual_Income=combined.groupby('Length_Employed')['Annual_Income'].transform(lambda x:x.fillna(x.mean()))

In [None]:
combined.isnull().sum()[combined.isnull().sum()!=0]

In [None]:
combined.Home_Owner.value_counts()

In [None]:
combined.Home_Owner=combined.Home_Owner.fillna(combined.Home_Owner.mode()[0])

In [None]:
combined.Home_Owner.value_counts()

In [None]:
combined.Home_Owner=combined.Home_Owner.replace(['None','Other'],['Mortgage']*2)

In [None]:
combined.Home_Owner.value_counts()

In [None]:
combined.info()

In [None]:
combined.Income_Verified.value_counts()

In [None]:
combined.Income_Verified=combined.Income_Verified.replace(['VERIFIED - income source','VERIFIED - income'],['Verified']*2)

In [None]:
combined.Income_Verified=combined.Income_Verified.replace('not verified','Not_Verified')

In [None]:
combined.Income_Verified.value_counts()

In [None]:
combined.Purpose_Of_Loan.value_counts()

In [None]:
combined.Gender.value_counts()

In [None]:
combined.Months_Since_Deliquency.value_counts()

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (e.g. the 'Data' split tag and the categorical features)
# are filtered out explicitly, because DataFrame.corr raises on string columns
# in recent pandas releases instead of silently skipping them.
plt.figure(figsize=(10,8))
numeric_cols=combined.select_dtypes(include='number')
ax=sns.heatmap(numeric_cols.corr(),annot=True,linewidths=.5,fmt='.1f')
plt.show()

In [None]:
combined.head()

### Dropping unnecessary columns:

In [None]:
combined.drop(['Months_Since_Deliquency','Number_Open_Accounts'],axis=1,inplace=True)

In [None]:
combined.columns

In [None]:
df=combined.drop('Loan_ID',axis=1)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
from sklearn.feature_extraction import FeatureHasher
# Hash the loan purpose into 6 numeric columns.
# With input_type='string' each sample must be an ITERABLE of string tokens.
# A bare string would be iterated character by character (or rejected outright
# by newer scikit-learn releases), so every purpose is wrapped in a one-element
# list to hash the category as a single token.
fh=FeatureHasher(n_features=6,input_type='string')
purpose_tokens=[[purpose] for purpose in df['Purpose_Of_Loan'].astype(str)]
hashed_feature=fh.fit_transform(purpose_tokens)
hashed_feature=hashed_feature.toarray()

In [None]:
# Wrap the hashed purpose features in a DataFrame with STRING column names.
# The default integer labels (0, 1, ...) would end up next to string-named
# columns after the later concat, and scikit-learn estimators do not accept
# frames whose column labels mix types.
fh=pd.DataFrame(hashed_feature,
                columns=['purpose_hash_{}'.format(k) for k in range(hashed_feature.shape[1])])

In [None]:
df=pd.get_dummies(df,columns=['Home_Owner','Income_Verified','Gender'],drop_first=True)

In [None]:
dff=pd.concat([df,fh],axis=1,sort=False)
dff.shape

In [None]:
dff.drop('Purpose_Of_Loan',axis=1,inplace=True)

In [None]:
dff.head()

In [None]:
dff.shape

In [None]:
dff.head(50)

In [None]:
# Skewness of the numeric columns. numeric_only=True is required because dff
# still carries the string 'Data' column, which cannot be reduced numerically.
dff.skew(numeric_only=True)

In [None]:
dff.columns

### Checking distribution of numerical values:

In [None]:
l=['Inquiries_Last_6Mo','Annual_Income']

In [None]:
# Draw the distribution of each column listed in `l`, one figure per column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# sns.histplot(..., kde=True) when upgrading; confirm against the installed version.
for i in l:
    sns.distplot(dff[i])
    plt.show()

### Checking outliers of numerical columns:

In [None]:
# One box plot per column listed in `l` to expose outliers.
# The Series is passed as x= because positional data vectors are deprecated in
# newer seaborn releases; the keyword form works on old and new versions alike.
for col in l:
    sns.boxplot(x=dff[col])
    plt.show()

In [None]:
import scipy.stats as st
# Box-Cox transform the columns listed in `l` to reduce skew.
# The source is `combined` (untransformed), so re-running this cell does not
# apply the transform twice. The +1 shift is presumably there so zero values
# stay strictly positive, as Box-Cox requires.
for col in l:
    transformed,_=st.boxcox(combined[col]+1)
    dff[col]=transformed
# numeric_only=True: dff still carries the string 'Data' column.
dff.skew(numeric_only=True)

In [None]:
# Re-draw the box plots for the columns listed in `l` after the Box-Cox step.
# The Series is passed as x= because positional data vectors are deprecated in
# newer seaborn releases; the keyword form works on old and new versions alike.
for col in l:
    sns.boxplot(x=dff[col])
    plt.show()

In [None]:
train=dff.loc[dff['Data']=='train']

In [None]:
train.shape

In [None]:
test=dff.loc[dff['Data']=='test']
test.shape

In [None]:
train=train.drop('Data',axis=1)
test=test.drop(['Data','Interest_Rate'],axis=1)

In [None]:
train.shape,test.shape

In [None]:
train_set.shape,test_set.shape

### Splitting train dataset into X and y:

In [None]:
# Separate the feature matrix from the target.
# Interest_Rate shares a column with the NaN placeholders added for the test
# rows, so it is presumably stored as float by now; cast it back to int so the
# classes (and the predictions derived from them) are 1/2/3 rather than
# 1.0/2.0/3.0.
X=train.drop('Interest_Rate',axis=1)
y=train['Interest_Rate'].astype(int)

### Checking whether the target column is balanced or not:

In [None]:
# Class balance of the target: bar chart plus the raw counts.
# x= is passed by keyword because positional data vectors are deprecated in
# newer seaborn releases; the keyword form works on old and new versions alike.
sns.countplot(x=train['Interest_Rate'])
train['Interest_Rate'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
#from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier as KNN
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from tpot import TPOTClassifier
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=0)

In [None]:
# Candidate models. Every pipeline has the same two-step shape: a
# StandardScaler followed by one classifier left at its default settings.
# Step names are unique per pipeline ('scalar1', 'scaler2', ...); note the
# spelling differs for the decision-tree pipeline.
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('lr',LogisticRegression())])
pipeline_dt=Pipeline([('scaler2',StandardScaler()),
                     ('dt',DecisionTreeClassifier())])
pipeline_rf=Pipeline([('scalar3',StandardScaler()),
                     ('rfc',RandomForestClassifier())])
pipeline_knn=Pipeline([('scalar4',StandardScaler()),
                     ('knn',KNN())])
pipeline_xgbc=Pipeline([('scalar5',StandardScaler()),
                     ('xgboost',XGBClassifier())])
pipeline_lgbc=Pipeline([('scalar6',StandardScaler()),
                     ('lgbc',lgb.LGBMClassifier())])
pipeline_ada=Pipeline([('scalar7',StandardScaler()),
                     ('adaboost',AdaBoostClassifier())])
pipeline_sgdc=Pipeline([('scalar8',StandardScaler()),
                     ('sgradient',SGDClassifier())])
pipeline_nb=Pipeline([('scalar9',StandardScaler()),
                     ('nb',GaussianNB())])
pipeline_extratree=Pipeline([('scalar10',StandardScaler()),
                     ('extratree',ExtraTreesClassifier())])

In [None]:
pipelines=[pipeline_lr,pipeline_dt,pipeline_rf,pipeline_knn,pipeline_xgbc,pipeline_lgbc,pipeline_ada,pipeline_sgdc,pipeline_nb,pipeline_extratree]

In [None]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [None]:
# Display names keyed by position in `pipelines`. The order must match that
# list exactly: index 1 is the decision-tree pipeline and index 2 the
# random-forest pipeline (the two labels were previously swapped, so reported
# scores were attributed to the wrong model).
pipe_dict={0:'Logistic Regression',1:'Decision Tree',2:'Random Forest',3:'KNN',4:'XGBC',5:'LGBC',6:'ADA',7:'SGDC',8:'NB',9:'ExtraTree'}

In [None]:
# Train every candidate pipeline on the training split and print its
# classification report for the hold-out split.
for candidate in pipelines:
    predictions=candidate.fit(X_train,y_train).predict(X_test)
    report=classification_report(y_test,predictions)
    print('Classification Report : \n',report)

In [None]:
for i,model in enumerate(pipelines):
    print('{} Test Accuracy {}'.format(pipe_dict[i],model.score(X_test,y_test)))

In [None]:
# Pick the pipeline with the highest hold-out accuracy.
# Each model is scored exactly once per iteration; scoring re-runs prediction on
# the whole hold-out set, so evaluating it twice doubled the work for no gain.
for i,model in enumerate(pipelines):
    score=model.score(X_test,y_test)
    if score>best_accuracy:
        best_accuracy=score
        best_classifier=i
        best_pipeline=model
print("Classifier with best accuracy:{}".format(pipe_dict[best_classifier]))

### Hyperparameter tuning using RandomizedSearchCV

In [None]:
# Create a pipeline; its single "classifier" step is a placeholder that the
# search space below replaces with each candidate estimator.
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Candidate learning algorithms and their hyperparameter ranges.
r_param = [ {"classifier": [RandomForestClassifier()],
             "classifier__n_estimators": [10,15,20],
             "classifier__max_depth":[15,25,30],
             "classifier__min_samples_leaf":[5,10,15,100],
             "classifier__max_leaf_nodes": [5,10]},
           
            {"classifier":[GradientBoostingClassifier()],
            "classifier__learning_rate":np.arange(0.05,0.5,0.1),
            # np.arange(5,10,20) yields the single value 5 because the step
            # exceeds the span, so n_estimators was never actually searched.
            # NOTE(review): the 10..190 range is a guess chosen to include the
            # n_estimators=110 used for the final model — confirm the intended range.
            "classifier__n_estimators":np.arange(10,200,20),
            'classifier__max_depth':np.arange(4,15),
            "classifier__min_samples_leaf":[10,15,100],
            "classifier__max_leaf_nodes": [5,10]}

      ]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
import sklearn.metrics

In [None]:
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'micro')

In [None]:
rsearch = RandomizedSearchCV(pipe, r_param, cv=5, verbose=0,n_jobs=-1,random_state=0,scoring=scorer)

In [None]:
rsearch.fit(X_train,y_train)

In [None]:
print(rsearch.best_estimator_)
print("The mean accuracy of the model is through randomized search is :",rsearch.score(X_test,y_test))

In [None]:
GBC=GradientBoostingClassifier(learning_rate=0.42000000000000004,
                                            max_depth=4, max_leaf_nodes=10,
                                            n_estimators=110)

In [None]:
GBC_model=GBC.fit(X_train,y_train)

### Accuracy Score:

In [None]:
# Hold-out accuracy of the fitted gradient-boosting model. The value is stored
# under its own name so the imported sklearn.metrics.accuracy_score function is
# not overwritten.
gbc_accuracy=GBC.score(X_test,y_test)
gbc_accuracy

### F1 Score:

In [None]:
pred=GBC.predict(X_test)

In [None]:
# Weighted F1 on the hold-out split.
# f1_score takes (y_true, y_pred) in that order; with average='weighted' the
# class weights come from the first argument, so swapping them changes the
# result. The value is stored under a separate name so the imported f1_score
# function is not overwritten (re-running the cell would otherwise fail because
# the name would refer to a float).
gbc_f1=f1_score(y_test,pred,average='weighted')
gbc_f1

### Predictions of target variable of test data

In [None]:
GBC_predictions=GBC.predict(test)
GBC_predictions.shape

In [None]:
test.shape

In [None]:
# Build the submission frame: one row per test loan holding its ID and the
# predicted rate category, instead of a single column labelled 0.
# `test` keeps the row labels of `combined`, which still contains Loan_ID, so
# the IDs are looked up by index. Predictions are written as integers.
# NOTE(review): column names follow the training data; confirm the exact
# submission format expected by the competition.
Submission_gbc = pd.DataFrame({'Loan_ID':combined.loc[test.index,'Loan_ID'].values,
                               'Interest_Rate':GBC_predictions.astype(int)})

In [None]:
Submission_gbc.to_csv('Submission_loan_gbc.csv',index=False)