# Classifying Cross-Country Customers for Lending Home Equity Loans


In [45]:
# Import the libraries used by the notebook and load the income data set.
import os

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Location of the income CSV. The original absolute path is kept as the default
# so the notebook behaves exactly as before on the author's machine; set the
# INCOME_CSV environment variable to point at the file on any other machine.
INCOME_CSV_PATH = os.environ.get("INCOME_CSV", "C:/Users/sasaha/Documents/income.csv")
income = pd.read_csv(INCOME_CSV_PATH)

In [46]:
# Preview the first five rows of the data set
income.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [47]:
# Count the null (NaN) entries in every column
income.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
high_income       0
dtype: int64

In [48]:
# Show the dtype and non-null count of each column
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  high_income     32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [49]:
# Print the distinct values of every categorical (object-dtype) column
categorical_columns = income.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    print(income[column_name].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' 

In [50]:
# Cast every int64 column to float64 with a single astype mapping
int_columns = income.select_dtypes(include=['int64']).columns
income = income.astype({column_name: 'float64' for column_name in int_columns})

In [51]:
# For every float64 column, turn the ' ?' placeholder into NaN and fill any
# NaN with the column median (the median is taken from the column as it is
# before the replacement).
# NOTE(review): a float64 column presumably never holds the string ' ?', so the
# replace step likely matches nothing; it is kept for parity.
float_columns = income.select_dtypes(include=['float64']).columns
for column_name in float_columns:
    column_median = income[column_name].median()
    without_placeholder = income[column_name].replace(' ?', np.nan)
    income[column_name] = without_placeholder.fillna(column_median)

In [52]:
# Relabel the ' ?' placeholder in every categorical (object-dtype) column as
# the explicit value "Missing"
object_columns = income.select_dtypes(include=['object']).columns
income[object_columns] = income[object_columns].replace(' ?', "Missing")

In [53]:
# Re-check the distinct values of the categorical columns for unintended entries
categorical_columns = income.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    print(income[column_name].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 'Missing' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' 'Missing'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' United-States' ' Cuba' ' Jamaica' ' India' 'Missing' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Cana

In [54]:
# Encode the target as a binary column: 0 when high_income is ' <=50K', 1 otherwise,
# then drop the original text column
is_low_income = income["high_income"].eq(' <=50K')
income['class'] = np.where(is_low_income, 0, 1)
income = income.drop(columns=["high_income"])
income.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,0
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,0
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,0
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,0
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,0
5,37.0,Private,284582.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,0
6,49.0,Private,160187.0,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,0
7,52.0,Self-emp-not-inc,209642.0,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,1
8,31.0,Private,45781.0,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,1
9,42.0,Private,159449.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,1


In [55]:
# Print how often each distinct value occurs in every categorical column
categorical_columns = income.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    print(income[column_name].value_counts())

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
Missing               1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital_status, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial

In [56]:
# Replace every categorical (object-dtype) column with its integer category codes
# NOTE(review): the codes follow the category order, so the model downstream
# treats these nominal features as ordered numbers; confirm this is intended.
for column_name in income.select_dtypes(include=['object']).columns:
    income[column_name] = income[column_name].astype('category').cat.codes

In [57]:
# Check the dtype of each column after the categorical columns were encoded
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  int8   
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  int8   
 4   education_num   32561 non-null  float64
 5   marital_status  32561 non-null  int8   
 6   occupation      32561 non-null  int8   
 7   relationship    32561 non-null  int8   
 8   race            32561 non-null  int8   
 9   sex             32561 non-null  int8   
 10  capital_gain    32561 non-null  float64
 11  capital_loss    32561 non-null  float64
 12  hours_per_week  32561 non-null  float64
 13  native_country  32561 non-null  int8   
 14  class           32561 non-null  int32  
dtypes: float64(6), int32(1), int8(8)
memory usage: 1.9 MB


In [58]:
# Count how many observations fall into each of the two classes
class_counts = income['class'].value_counts()
print(class_counts)

0    24720
1     7841
Name: class, dtype: int64


In [59]:
# Display the native_country column with null entries removed.
# NOTE: dropna() returns a new Series and its result is not assigned here, so
# `income` itself is left unchanged by this cell.
income["native_country"].dropna()

0        38
1        38
2        38
3        38
4         4
         ..
32556    38
32557    38
32558    38
32559    38
32560    38
Name: native_country, Length: 32561, dtype: int8

In [60]:
# Drop every column (not only categorical ones) that holds a single distinct
# non-null value, then print the names of the columns that were dropped

drop_columns = [
    column_name
    for column_name in income.columns
    if income[column_name].nunique(dropna=True) == 1
]
income = income.drop(columns=drop_columns)
print(drop_columns)

[]


In [61]:
# Remove every row that contains at least one NaN / null value
income = income.dropna(axis=0, how="any")

In [62]:
# Separate the target column ('class') from the feature columns used for cross-validation

predict = income['class']
features = income.drop(columns=['class'])

In [63]:
# Cast every feature column to float64 in one call
features = features.astype('float64')

In [64]:
# Show the dtype and non-null count of each feature column
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  float64
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  float64
 4   education_num   32561 non-null  float64
 5   marital_status  32561 non-null  float64
 6   occupation      32561 non-null  float64
 7   relationship    32561 non-null  float64
 8   race            32561 non-null  float64
 9   sex             32561 non-null  float64
 10  capital_gain    32561 non-null  float64
 11  capital_loss    32561 non-null  float64
 12  hours_per_week  32561 non-null  float64
 13  native_country  32561 non-null  float64
dtypes: float64(14)
memory usage: 3.7 MB


In [65]:
# Min-max scale every feature column: (value - column min) / (column max - column min)
# NOTE(review): the min and max are taken over the whole frame, i.e. before the
# cross-validation folds are formed.
column_min = features.min()
column_range = features.max() - column_min
features = (features - column_min) / column_range
features.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,0.30137,0.75,0.044302,0.6,0.8,0.666667,0.0,0.2,1.0,1.0,0.02174,0.0,0.397959,0.926829
1,0.452055,0.625,0.048238,0.6,0.8,0.333333,0.214286,0.0,1.0,1.0,0.0,0.0,0.122449,0.926829
2,0.287671,0.375,0.138113,0.733333,0.533333,0.0,0.357143,0.2,1.0,1.0,0.0,0.0,0.397959,0.926829
3,0.493151,0.375,0.151068,0.066667,0.4,0.333333,0.357143,0.0,0.5,1.0,0.0,0.0,0.397959,0.926829
4,0.150685,0.375,0.221488,0.6,0.8,0.333333,0.642857,1.0,0.5,0.0,0.0,0.0,0.397959,0.097561


In [66]:
# Fit a default logistic regression under 5-fold cross-validation and collect
# the class predicted for each observation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
lr = LogisticRegression()
predictions = cross_val_predict(estimator=lr, X=features, y=predict, cv=5)

In [67]:
# Store the predicted class as a new column of the data set
income = income.assign(predicted_class=predictions)

In [68]:
# Show the first 10 rows, now including the predicted class
income.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,predicted_class
0,39.0,6,77516.0,9,13.0,4,0,1,4,1,2174.0,0.0,40.0,38,0,0
1,50.0,5,83311.0,9,13.0,2,3,0,4,1,0.0,0.0,13.0,38,0,0
2,38.0,3,215646.0,11,9.0,0,5,1,4,1,0.0,0.0,40.0,38,0,0
3,53.0,3,234721.0,1,7.0,2,5,0,2,1,0.0,0.0,40.0,38,0,0
4,28.0,3,338409.0,9,13.0,2,9,5,2,0,0.0,0.0,40.0,4,0,0
5,37.0,3,284582.0,12,14.0,2,3,5,4,0,0.0,0.0,40.0,38,0,0
6,49.0,3,160187.0,6,5.0,3,7,1,2,0,0.0,0.0,16.0,22,0,0
7,52.0,5,209642.0,11,9.0,2,3,0,4,1,0.0,0.0,45.0,38,1,0
8,31.0,3,45781.0,12,14.0,4,9,1,4,0,14084.0,0.0,50.0,38,1,1
9,42.0,3,159449.0,9,13.0,2,3,0,4,1,5178.0,0.0,40.0,38,1,1


In [69]:
# Count the observations assigned to each predicted class
income.loc[:, "predicted_class"].value_counts()

0    27690
1     4871
Name: predicted_class, dtype: int64

In [70]:
# Count the observations in each actual class
income.loc[:, "class"].value_counts()

0    24720
1     7841
Name: class, dtype: int64

In [71]:
# Accuracy: share of rows whose predicted class equals the actual class
is_correct = income['class'].eq(income['predicted_class'])
accuracy = int(is_correct.sum()) / len(income)
print(accuracy)

0.823162679278892


In [72]:
# Sensitivity (true positive rate):
# the share of actual positives (class == 1) that the model also predicts as 1
actual_is_positive = income["class"] == 1
predicted_is_positive = income["predicted_class"] == 1
predicted_is_negative = income["predicted_class"] == 0

# Counting the True entries of each mask gives the number of matching rows
true_positives = int((predicted_is_positive & actual_is_positive).sum())
false_negatives = int((predicted_is_negative & actual_is_positive).sum())

print(true_positives)
print(false_negatives)

sensitivity = true_positives / (true_positives + false_negatives)

print(sensitivity)

3477
4364
0.443438336946818


In [73]:
# Specificity (true negative rate):
# the share of actual negatives (class == 0) that the model also predicts as 0,
# i.e. the proportion of applicants that were correctly rejected
actual_is_negative = income["class"] == 0
true_negatives = int(((income["predicted_class"] == 0) & actual_is_negative).sum())
false_positives = int(((income["predicted_class"] == 1) & actual_is_negative).sum())
print(true_negatives)
print(false_positives)

specificity = true_negatives / (false_positives + true_negatives)
print(specificity)

23326
1394
0.9436084142394822


In [74]:
# True positive rate: share of actual positives predicted as positive
# False positive rate: share of actual negatives wrongly predicted as positive
actual_positive_total = true_positives + false_negatives
actual_negative_total = false_positives + true_negatives
tpr = true_positives / actual_positive_total
fpr = false_positives / actual_negative_total

print(tpr)
print(fpr)

0.443438336946818
0.0563915857605178


The model is very conservative: it predicts lending to fewer than half (about 44%) of the customers who are supposed to have loans.


The model is very good at avoiding default loans, since its false positive rate (customers wrongly predicted as eligible) is just over 5%.

In [75]:
# Refit the logistic regression with class_weight="balanced" and collect the
# 5-fold cross-validated predictions
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
lr = LogisticRegression(class_weight="balanced")
predictions = cross_val_predict(estimator=lr, X=features, y=predict, cv=5)
# Store the predictions on a new frame; `income` itself is not modified
income_balanced = income.assign(predicted_class=predictions)



In [76]:
# Sensitivity of the balanced model: share of actual positives predicted as positive
balanced_actual_positive = income_balanced["class"] == 1
balanced_predicted = income_balanced["predicted_class"]

balancedtrue_positives = int(((balanced_predicted == 1) & balanced_actual_positive).sum())
balancedfalse_negatives = int(((balanced_predicted == 0) & balanced_actual_positive).sum())

print(balancedtrue_positives)
print(balancedfalse_negatives)

balancedsensitivity = balancedtrue_positives / (balancedtrue_positives + balancedfalse_negatives)

print(balancedsensitivity)

6096
1745
0.7774518556306593


In [77]:
# Specificity of the balanced model: share of actual negatives predicted as negative
balanced_actual_negative = income_balanced["class"] == 0
balanced_predicted = income_balanced["predicted_class"]

balancedtrue_negatives = int(((balanced_predicted == 0) & balanced_actual_negative).sum())
balancedfalse_positives = int(((balanced_predicted == 1) & balanced_actual_negative).sum())
print(balancedtrue_negatives)
print(balancedfalse_positives)

balancedspecificity = balancedtrue_negatives / (balancedfalse_positives + balancedtrue_negatives)
print(balancedspecificity)

18867
5853
0.7632281553398058


In [78]:
# True positive rate of the balanced model: actual positives predicted as positive
# False positive rate of the balanced model: actual negatives wrongly predicted as positive
balanced_positive_total = balancedtrue_positives + balancedfalse_negatives
balanced_negative_total = balancedfalse_positives + balancedtrue_negatives
balancedtpr = balancedtrue_positives / balanced_positive_total
balancedfpr = balancedfalse_positives / balanced_negative_total

print(balancedtpr)
print(balancedfpr)

0.7774518556306593
0.2367718446601942


This model is a good fit if the lending institution wants to extend loans to more customers.

The balanced model predicts lending to more than 75% of the customers who are supposed to have loans, at the expense of wrongly approving roughly 1 out of 4 customers who are not (potential default loans).

In [79]:
# Build a logistic regression with manual class weights: class 1 is weighted
# five times as heavily as class 0 (passed as class_weight), then report the
# cross-validated confusion counts and rates
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
penalty = {0: 1, 1: 5}
lr = LogisticRegression(class_weight=penalty)
predictions = cross_val_predict(estimator=lr, X=features, y=predict, cv=5)
# Store the predictions on a new frame; `income` itself is not modified
income_balanced = income.assign(predicted_class=predictions)

actual_labels = income_balanced["class"]
predicted_labels = income_balanced["predicted_class"]

# Sensitivity: share of actual positives predicted as positive
balancedtrue_positives = int(((predicted_labels == 1) & (actual_labels == 1)).sum())
balancedfalse_negatives = int(((predicted_labels == 0) & (actual_labels == 1)).sum())

print(balancedtrue_positives)
print(balancedfalse_negatives)

balancedsensitivity = balancedtrue_positives / (balancedtrue_positives + balancedfalse_negatives)

print(balancedsensitivity)

# Specificity: share of actual negatives predicted as negative
balancedtrue_negatives = int(((predicted_labels == 0) & (actual_labels == 0)).sum())
balancedfalse_positives = int(((predicted_labels == 1) & (actual_labels == 0)).sum())
print(balancedtrue_negatives)
print(balancedfalse_positives)

balancedspecificity = balancedtrue_negatives / (balancedfalse_positives + balancedtrue_negatives)
print(balancedspecificity)

# True / false positive rates over the actual positive and negative totals
positive_total = balancedtrue_positives + balancedfalse_negatives
negative_total = balancedfalse_positives + balancedtrue_negatives
balancedtpr = balancedtrue_positives / positive_total
balancedfpr = balancedfalse_positives / negative_total

print(balancedtpr)
print(balancedfpr)


6810
1031
0.8685116694299196
16155
8565
0.6535194174757282
0.8685116694299196
0.34648058252427183


In [80]:
# Recompute the true positive rate and false positive rate from the current
# balanced* confusion counts
weighted_positive_total = balancedtrue_positives + balancedfalse_negatives
weighted_negative_total = balancedfalse_positives + balancedtrue_negatives
balancedtpr = balancedtrue_positives / weighted_positive_total
balancedfpr = balancedfalse_positives / weighted_negative_total

print(balancedtpr)
print(balancedfpr)

0.8685116694299196
0.34648058252427183


This model is very aggressive; it suits a lending institution that wants to extend loans to more customers when the economy is good.

This weighted model predicts lending to more than 85% of the customers who are supposed to have loans, at the expense of a high possibility of default loans (a false positive rate of about 35%).

In [81]:
# Build a Random Forest classifier with class_weight="balanced" and report its
# cross-validated confusion counts, sensitivity, specificity, true positive
# rate and false positive rate

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
rf = RandomForestClassifier(class_weight="balanced", random_state=1)
predictions = cross_val_predict(estimator=rf, X=features, y=predict, cv=5)
# Store the predictions on a new frame; `income` itself is not modified
income_balanced = income.assign(predicted_class=predictions)

actual_labels = income_balanced["class"]
predicted_labels = income_balanced["predicted_class"]

# Sensitivity: share of actual positives predicted as positive
balancedtrue_positives = int(((predicted_labels == 1) & (actual_labels == 1)).sum())
balancedfalse_negatives = int(((predicted_labels == 0) & (actual_labels == 1)).sum())

print(balancedtrue_positives)
print(balancedfalse_negatives)

balancedsensitivity = balancedtrue_positives / (balancedtrue_positives + balancedfalse_negatives)

print(balancedsensitivity)

# Specificity: share of actual negatives predicted as negative
balancedtrue_negatives = int(((predicted_labels == 0) & (actual_labels == 0)).sum())
balancedfalse_positives = int(((predicted_labels == 1) & (actual_labels == 0)).sum())
print(balancedtrue_negatives)
print(balancedfalse_positives)

balancedspecificity = balancedtrue_negatives / (balancedfalse_positives + balancedtrue_negatives)
print(balancedspecificity)

# True / false positive rates over the actual positive and negative totals
positive_total = balancedtrue_positives + balancedfalse_negatives
negative_total = balancedfalse_positives + balancedtrue_negatives
balancedtpr = balancedtrue_positives / positive_total
balancedfpr = balancedfalse_positives / negative_total

print(balancedtpr)
print(balancedfpr)

4772
3069
0.608595842367045
23106
1614
0.9347087378640777
0.608595842367045
0.06529126213592233


The Random Forest classifier model is very effective if the lending institution wants to limit lending to fewer customers and to decrease the possibility of default loans.
