## Objective: Prepare a model for supervised classification of applicants into loan accepted and loan rejected!

In [3]:
import pandas as pd
import plotly.express as pe

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, classification_report

## step 1: Data loading

In [4]:
# Remote CSV holding the credit-risk dataset used throughout this notebook.
path = (
    "https://raw.githubusercontent.com/techficent/"
    "oracle-predictive-analytics/main/Balanced_credit_Risk.csv"
)

# Read the file straight from the URL into a DataFrame.
df = pd.read_csv(path)

# Preview the frame (pandas truncates long frames in the rendered output).
df

Unnamed: 0,index,person_age,person_income,person_home_ownership,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status
0,0,22,59000,RENT,PERSONAL,D,35000,16.02,YES
1,1,25,9600,MORTGAGE,MEDICAL,C,5500,12.87,YES
2,2,23,65500,RENT,MEDICAL,C,35000,15.23,YES
3,3,24,54400,RENT,MEDICAL,C,35000,14.27,YES
4,4,21,9900,OWN,VENTURE,A,2500,7.14,YES
...,...,...,...,...,...,...,...,...,...
12401,12401,30,102540,MORTGAGE,HOMEIMPROVEMENT,A,1500,7.90,NO
12402,12402,24,60000,RENT,PERSONAL,B,12000,12.21,NO
12403,12403,22,40000,RENT,EDUCATION,C,6000,12.87,NO
12404,12404,22,50000,RENT,PERSONAL,C,8000,13.16,NO


### step 2: Data exploration

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(12406, 9)

In [6]:
# Row index of the frame (a default RangeIndex after read_csv).
df.index

RangeIndex(start=0, stop=12406, step=1)

In [7]:
# Column labels available in the dataset.
df.columns

Index(['index', 'person_age', 'person_income', 'person_home_ownership',
       'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate',
       'loan_status'],
      dtype='object')

In [8]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12406 entries, 0 to 12405
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  12406 non-null  int64  
 1   person_age             12406 non-null  int64  
 2   person_income          12406 non-null  int64  
 3   person_home_ownership  12406 non-null  object 
 4   loan_intent            12406 non-null  object 
 5   loan_grade             12406 non-null  object 
 6   loan_amnt              12406 non-null  int64  
 7   loan_int_rate          12406 non-null  float64
 8   loan_status            12406 non-null  object 
dtypes: float64(1), int64(4), object(4)
memory usage: 872.4+ KB


In [9]:
# Count of missing values in each column.
df.isna().sum()

index                    0
person_age               0
person_income            0
person_home_ownership    0
loan_intent              0
loan_grade               0
loan_amnt                0
loan_int_rate            0
loan_status              0
dtype: int64

In [10]:
# Number of distinct values per column; columns with only a handful of
# distinct values are candidates for categorical treatment.
df.nunique()

index                    12406
person_age                  50
person_income             2184
person_home_ownership        4
loan_intent                  6
loan_grade                   7
loan_amnt                  576
loan_int_rate              340
loan_status                  2
dtype: int64

In [11]:
# Columns holding category labels (object dtype in the raw data).
categorical_columns = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
]

# Columns holding numeric measurements.
real_value_columns = [
    "person_age",
    "person_income",
    "loan_amnt",
    "loan_int_rate",
]

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df[real_value_columns].describe()

Unnamed: 0,person_age,person_income,loan_amnt,loan_int_rate
count,12406.0,12406.0,12406.0,12406.0
mean,27.59447,60404.5,10187.790182,11.784964
std,6.247178,44547.57,6692.722063,3.391123
min,20.0,4000.0,800.0,5.42
25%,23.0,34615.5,5000.0,8.94
50%,26.0,50915.0,8800.0,11.83
75%,30.0,73000.0,14000.0,14.42
max,144.0,1362000.0,35000.0,23.22


In [13]:
# Summary of the categorical columns: count, number of unique labels,
# most frequent label and its frequency.
df[categorical_columns].describe(include="object")

Unnamed: 0,person_home_ownership,loan_intent,loan_grade
count,12406,12406,12406
unique,4,6,7
top,RENT,MEDICAL,B
freq,7299,2480,3550


## step 2b) Pre-processing

### scaling & encoding

In [14]:
# Pre-processing helpers: a label encoder (used on the categorical columns)
# and a standard scaler (used on the numeric columns).
le, sc = LabelEncoder(), StandardScaler()

![illustration](https://miro.medium.com/max/660/1*qXXqcZXhSTgw9tVhx7zNNw.gif)

In [15]:
# Standardise every numeric column (zero mean, unit variance per column).
# StandardScaler works column-wise, so one fit over all numeric columns yields
# the same values as scaling them one at a time. Fitting once (rather than
# refitting per column) also leaves `sc` fitted on the complete numeric
# feature set, so it can later transform new rows or invert the scaling.
# NOTE(review): the scaler is fitted on the whole dataset before the
# train/test split, so test-set statistics leak into the scaling. Fit on the
# training rows only if the scaled values matter to the chosen model.
df[real_value_columns] = sc.fit_transform(df[real_value_columns])

df[real_value_columns]  # show the scaled numeric columns

Unnamed: 0,person_age,person_income,loan_amnt,loan_int_rate
0,-0.895556,-0.031529,3.707491,1.248910
1,-0.415320,-1.140501,-0.700459,0.319977
2,-0.735477,0.114388,3.707491,1.015939
3,-0.575398,-0.134794,3.707491,0.732836
4,-1.055634,-1.133766,-1.148725,-1.369797
...,...,...,...,...
12401,0.385074,0.945892,-1.298147,-1.145674
12402,-0.575398,-0.009081,0.270784,0.125343
12403,-0.895556,-0.458057,-0.625748,0.319977
12404,-0.895556,-0.233569,-0.326904,0.405497


In [16]:
# Replace each categorical column with integer codes.
# LabelEncoder assigns codes in sorted label order. A fresh encoder is fitted
# per column and kept in `label_encoders`, so every column's label <-> code
# mapping stays available (e.g. for inverse_transform). Refitting one shared
# encoder would keep only the mapping of the last column.
# `le` is rebound on each pass and therefore still ends up fitted on the last
# categorical column.
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df[categorical_columns]  # show the encoded categorical columns

Unnamed: 0,person_home_ownership,loan_intent,loan_grade
0,3,4,3
1,0,3,2
2,3,3,2
3,3,3,2
4,2,5,0
...,...,...,...
12401,0,2,0
12402,3,4,1
12403,3,1,2
12404,3,4,2


### step 3: separate features & labels

In [17]:
# Model inputs: numeric columns first, then the encoded categorical columns.
features = [*real_value_columns, *categorical_columns]

# Column to predict, kept as a one-element list so df[target] is a DataFrame.
target = ["loan_status"]

## step 4: split the data into training and testing sets

## x refers to features
## y refers to target



features for training
features for testing

target for training (corresponding to features for training)
target for testing (corresponding to features for testing)

In [18]:
# Class balance check: number of rows per loan_status value.
df[target].value_counts()

loan_status
NO             6203
YES            6203
dtype: int64

In [19]:
# 80/20 train/test split. Stratifying on the target keeps the class ratio the
# same in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=10,
    stratify=df[target],
)

### step 5: training the model

In [20]:
# Decision tree with default hyperparameters.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed, ties between equally good splits
# can be broken differently on each run and the reported metrics would not be
# reproducible. The seed matches the one used for the train/test split.
model = DecisionTreeClassifier(random_state=10)

In [21]:
# Train the tree on the training features and their matching labels.
model.fit(x_train, y_train)

### step 6: evaluation of model

In [22]:
# Predicted class for every row of the held-out test features.
ans = model.predict(x_test)

# Per-class precision, recall and F1 plus overall accuracy on the test set.
report = classification_report(y_test, ans)
print(report)

              precision    recall  f1-score   support

          NO       0.81      0.78      0.79      1241
         YES       0.79      0.81      0.80      1241

    accuracy                           0.80      2482
   macro avg       0.80      0.80      0.80      2482
weighted avg       0.80      0.80      0.80      2482



In [23]:
# True labels, re-indexed from 0 so they line up row-by-row with the predictions.
actual_ans_df = y_test.reset_index(drop=True)

# Predictions wrapped in a frame so they can be placed next to the truth.
predicted_ans_df = pd.DataFrame(ans)

# Side-by-side comparison of actual and predicted labels.
result_df = pd.concat(
    [actual_ans_df, predicted_ans_df],
    axis=1,
).set_axis(["Actual_Answer", "Predicted_ans"], axis=1)

result_df

Unnamed: 0,Actual_Answer,Predicted_ans
0,NO,NO
1,YES,YES
2,YES,YES
3,YES,YES
4,NO,NO
...,...,...
2477,NO,NO
2478,YES,NO
2479,YES,YES
2480,YES,YES


### how can we improve this performance?

#### a) Tune the hyperparameters (automated tuning?)
#### b) Change the features [feature engineering????]
#### c) Try a different algorithm (includes bagging & boosting) [ xgboosting!  ]
#### d) perform a thorough testing to verify the results!

Model answer |              Actual Answer |         Remark
YES                                  YES            TRUE  POSITIVE
YES                                  NO             FALSE POSITIVE
NO                                   YES            FALSE NEGATIVE
NO                                   NO             TRUE  NEGATIVE

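Layout used in these notes (rows = model answer, columns = actual answer):

TP           FP
FN           TN

Note: sklearn's confusion_matrix(y_true, y_pred) uses rows = actual answer and columns = model answer, with labels in sorted order (NO, YES). With YES as the positive class, the matrix printed below therefore reads:

TN           FP
FN           TP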

In [24]:
# Rows are the actual classes and columns the predicted classes, both in
# sorted label order (NO, YES). With YES as the positive class the printed
# matrix is [[TN, FP], [FN, TP]].
print(confusion_matrix(y_test, ans))

[[ 967  274]
 [ 233 1008]]


accuracy : out of all testing samples given to model, for how many samples did model give correct answer???

                TP+TN
                ------------
                TP+TN+FP+FN
*Note: Accuracy parameter is ONLY RELIABLE IF DATA USED IS BALANCED 


(rows)
precision :  Out of all cases marked as POSITIVE, in how many cases the result was actually POSITIVE?
                    TP
                    -------
                    TP + FP

(columns)
recall :   out of all cases THAT SHOULD HAVE BEEN POSITIVE, how many cases did model identify correctly?

                    TP
                -----------
                TP + FN


f1-score : the harmonic mean of precision and recall

                2 * precision * recall
                ----------------------
                  precision + recall

#supervised learning ----> classification (target column should be categorical)

categorizing your data point into 1 of applicable categories

####

# step 1: arrange your data

# step 2a: exploration--->statistical analysis!!!
       2b: preprocessing the data (apply steps that are applicable)
                -change the scale
                - encode categorical columns if applicable!!!
                - removal of outliers (*algorithms that are based on regression require this!)
                - replace/remove missing values from the dataset


(a set of algorithms in mind)
## step 3: identify and separate features & target

## step 4: divide the data into training & testing
            -ratio for training & testing
            - how to stop randomization from occurring each time? (fix random_state)
            - how to use stratification?[       K-fold stratified sampling???        ]
            - whether a simple train-test-split can be done or folded sampling required*

### step 5: train the model (apply the algorithm/technique on the available training set)
              ----> tune the hyper-parameters!!!
                            ----> there are libraries that can help! (GridSearchCV)

### step 6: evaluation of the model?
              ----> metric for evaluation? (accuracy, f1-score, etc)
              ----> stratified-folded evaluation???**

### step 7: deployment of the model (not usually a responsibility of a data scientist!)


# supervised learning --------> regression / prediction (target column should be real-valued)

In [26]:
# Reminder of which columns are treated as categorical.
categorical_columns

['person_home_ownership', 'loan_intent', 'loan_grade']

In [28]:
# Income against age, coloured by loan outcome.
# The numeric columns of `df` were standardised in place during
# pre-processing, so both axes show z-scores rather than raw years / currency.
# The title and axis labels say so explicitly so the figure can be read on
# its own.
pe.scatter(
    df,
    x="person_age",
    y="person_income",
    color="loan_status",
    title="Applicant income vs. age by loan status (standardised values)",
    labels={
        "person_age": "person_age (standardised)",
        "person_income": "person_income (standardised)",
    },
)

### 3 perspectives

    a) Statistician: 
    b) visualization expert
    c) layman perspective

## whenever in doubt, try multiple models!

In [None]:
line-based
    - linear regression 
    - logistic regression
        (straight line equation:   Y = Mi * Xi  +C)

    - SVM 
    

tree-based
    - DecisionTreeClassifier : entropy 
    - XGBoost
    - Random forest


distance-based 
    - knn 
    - kmeans
    - k-medoids
    - Agglomerative clustering

other (preprocessing, etc)