In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score

#the model used to fit&predict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#pipeline with its' preprocessor's transformers
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

#used for estimating model accuracy and getting reports
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv(r"C:\Users\sriram\Desktop\Loan_Approval_Prediction\train_u6lujuX_CVtuZ9i.csv")
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
[print(col, df[col].nunique()) for col in df.columns if df[col].dtype=='object']
df.drop('Loan_ID', axis=1, inplace=True)

Loan_ID 614
Gender 2
Married 2
Dependents 4
Education 2
Self_Employed 2
Property_Area 3
Loan_Status 2


In [4]:
y = LabelEncoder().fit_transform(df['Loan_Status'])
X = df.drop(['Loan_Status'], axis=1)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
num_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
print(num_cols)
print(cat_cols)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']


In [6]:
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [7]:
def train_predict(full_pipeline, train_X, train_y, test_X, test_y):
    full_pipeline.fit(train_X, train_y)
    pred_y = full_pipeline.predict(test_X)
    print('accuracy_score:', accuracy_score(test_y, pred_y))
    print('classification_report:', classification_report(test_y, pred_y))
    
    print('cross_val_score:')
    scores = cross_val_score(full_pipeline, X, y, cv=5, scoring='accuracy')
    print(scores)
    print('mean of scores:', scores.mean())

In [8]:
from xgboost import XGBClassifier

models = [
   
    ('XGB', XGBClassifier(learning_rate=0.1,
                        n_estimators=100,         
                        max_depth=6,               
                        min_child_weight = 1,      
                        gamma=0.,                  
                        subsample=0.8,             
                        scale_pos_weight=1,        
                        random_state=27)),
]
for model_name, model in models:
    print('\nModel %s.' % model_name)
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    train_predict(full_pipeline, train_X, train_y, test_X, test_y)
    print('*' * 50)

  from pandas import MultiIndex, Int64Index



Model XGB.
accuracy_score: 0.8211382113821138
classification_report:               precision    recall  f1-score   support

           0       0.70      0.58      0.63        33
           1       0.85      0.91      0.88        90

    accuracy                           0.82       123
   macro avg       0.78      0.74      0.76       123
weighted avg       0.81      0.82      0.82       123

cross_val_score:












[0.74796748 0.73170732 0.75609756 0.78861789 0.79508197]
mean of scores: 0.7638944422231106
**************************************************


In [10]:
import pickle 

pickle.dump(full_pipeline,open('Decesion_Tree_Model.pkl','wb'))

In [80]:
predicted = full_pipeline.predict(pd.DataFrame(columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area'],data=np.array(['Male', 'Yes', 1 ,'Graduate', 'No', 3 , 0, 1280.0, 36.0, 1.0, 'Rural']).reshape(1,11)))

In [81]:
predicted

array([0])

In [82]:
if(predicted == 1):
    print('loan_approved')
else:
    print("Not_approved")

Not_approved
