# Doing the same project using a new method

Setting up the environment

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

Data Exploration

In [2]:
# Location of the raw loan dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists — consider a project-relative path.
data_path = r'C:\Users\Saidabrorkhon\Downloads\Bank_Personal_Loan_Modelling.csv'
df = pd.read_csv(data_path)

In [3]:
# Dataset dimensions as (rows, columns).
df.shape

(5000, 14)

In [4]:
# Summary statistics; with default arguments only the numeric columns are
# summarised, so the object-typed Family column does not appear.
df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,4997.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,4998.0,5000.0,4998.0
mean,2500.5,45.3384,20.104263,73.7742,93152.503,1.937938,1.881,56.4988,0.096,0.1044,0.060424,0.5968,0.294118
std,1443.520003,11.463166,11.467194,46.033729,2121.852197,1.747659,0.839869,101.713802,0.294621,0.305809,0.238295,0.490589,0.455691
min,1.0,23.0,-3.0,8.0,9307.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Column dtypes and non-null counts of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          4997 non-null   float64
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   object 
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          4998 non-null   float64
 12  Online              5000 non-null   int64  
 13  CreditCard          4998 non-null   float64
dtypes: float64(4), int64(9), object(1)
memory usage: 547.0+ KB


In [6]:
# Peek at the first row to see what the values look like.
df.head(1)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1.0,49,91107,4,1.6,1,0,0,1,0.0,0,0.0


In [None]:
# Disabled alternative for cleaning Family: keep only the text before the
# first '(' and strip whitespace. Not executed; extract_number() is used instead.
# df['Family'] = df['Family'].str.split('(').str[0].str.strip()

In [3]:
def extract_number(value):
  """Return the first whole number found in ``value`` as an ``int``.

  Anything ``int()`` accepts directly (integers, floats, clean digit strings
  such as ``' 4 '``) is converted as-is. For any other string, the first
  unbroken run of decimal digits is used and everything after it is ignored,
  so ``'4 (four)'`` gives 4 and ``'3.0'`` gives 3. Joining every digit in the
  string instead would turn ``'3.0'`` into 30.

  Args:
    value: Raw cell value: a number, a string, or a missing marker.

  Returns:
    The extracted ``int``, or ``pd.NA`` when ``value`` is missing
    (``None``, ``NaN``, ``pd.NA``), is not a string, or contains no digits.
  """
  try:
    return int(value)
  except (TypeError, ValueError, OverflowError):
    # TypeError: None / pd.NA; ValueError: NaN or a string with extra text;
    # OverflowError: infinity. All are handled by the fallback below.
    pass

  # Only strings can be scanned for digits; anything else has no number.
  if not isinstance(value, str):
    return pd.NA

  # Collect the first contiguous run of decimal digits and stop at its end.
  digits = ''
  for char in value:
    if char.isdecimal():
      digits += char
    elif digits:
      break
  return int(digits) if digits else pd.NA
    
# Clean the Family column element by element with extract_number().
family_raw = df['Family']
df['Family'] = family_raw.map(extract_number)

In [8]:
# Count missing values per column before imputation.
df.isnull().sum()

ID                    0
Age                   0
Experience            3
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            2
Online                0
CreditCard            2
dtype: int64

Data preprocessing - Missing values

In [4]:
# Impute missing values column by column: the most frequent value for text
# (object) columns, the column mean for everything else.
# The result is assigned back to df[col] rather than calling
# fillna(..., inplace=True) on df[col]. The in-place form works on an
# intermediate object, which pandas 2.x deprecates with a FutureWarning and
# which does not update df once copy-on-write is enabled.
# NOTE(review): the mean of a 0/1 flag column (e.g. CD Account, CreditCard)
# is a fraction, not a valid flag value — consider the mode for such columns.
for col in df.columns:
  if df[col].dtype == 'object':
    df[col] = df[col].fillna(df[col].mode()[0])
  else:
    df[col] = df[col].fillna(df[col].mean())

In [10]:
# Confirm that no missing values remain after imputation.
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

Encoding

In [13]:
# Encode every text (object) column: label-encode columns with 5 or more
# distinct values, one-hot encode (dropping the first level) the rest.
# The loop is explicit so the cell no longer depends on a `col` variable
# left over from an earlier cell. The column names are copied into a list
# first because get_dummies() rebinds df to a new frame inside the loop.
# NOTE: the same encoder object is refitted per column, so afterwards it only
# holds the classes of the last label-encoded column.
encoder = LabelEncoder()
for col in df.select_dtypes(include='object').columns.tolist():
  cardinality = df[col].nunique()
  if cardinality >= 5:
    df[col] = encoder.fit_transform(df[col])
  else:
    df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)


In [5]:
# Re-check dtypes and non-null counts after cleaning, imputation and encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   float64
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   float64
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   float64
dtypes: float64(4), int64(10)
memory usage: 547.0 KB


Scaling

In [6]:
# Scaler instance used below to standardise the ZIP Code column.
scaler = StandardScaler()

In [13]:
# Keep the column labels; they are reused further down when df is rebuilt
# with pd.DataFrame(df, columns=original_columns).
original_columns = df.columns

In [8]:
# Standardise ZIP Code. The scaler expects a 2-D (n_rows, 1) array, so the
# column is reshaped before fitting and the scaled array is written back.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling parameters.
zip_as_column = df['ZIP Code'].values.reshape(-1, 1)
zip_scaled = scaler.fit_transform(zip_as_column)
df['ZIP Code'] = zip_scaled

In [None]:
# Disabled: scaling the whole frame would also standardise ID and the
# Personal Loan target, and fit_transform returns a NumPy array, not a DataFrame.
# df = scaler.fit_transform(df)

In [9]:
# Display the frame to inspect the scaled ZIP Code column
# (df.head() would show the same thing with less output).
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1.0,49,-0.964114,4,1.6,1,0,0,1,0.0,0,0.0
1,2,45,19.0,34,-1.443932,3,1.5,1,0,0,1,0.0,0,0.0
2,3,39,15.0,11,0.738814,1,1.0,1,0,0,0,0.0,0,0.0
3,4,35,9.0,100,0.452243,1,2.7,2,0,0,0,0.0,0,0.0
4,5,35,8.0,45,-0.859007,4,1.0,2,0,0,0,0.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3.0,40,-0.214694,1,1.9,3,0,0,0,0.0,1,0.0
4996,4997,30,4.0,15,-0.525774,4,0.4,1,85,0,0,0.0,1,0.0
4997,4998,63,39.0,24,-0.061039,2,0.3,3,0,0,0,0.0,0,0.0
4998,4999,65,40.0,49,-1.469855,3,0.5,2,0,0,0,0.0,1,0.0


In [15]:
# Rebuild a DataFrame using the saved column labels.
# NOTE(review): this only changes anything if df was replaced by a NumPy
# array (as the commented-out `df = scaler.fit_transform(df)` would do). While
# df is still a DataFrame it just reselects the same columns — confirm
# whether this cell is still needed.
df = pd.DataFrame(df, columns=original_columns)

In [16]:
# Display the frame again.
# NOTE(review): the output stored with this cell shows every column
# standardised, which the active code above does not produce (only ZIP Code
# is scaled). It looks like a leftover from an earlier run of the whole-frame
# scaling — re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,-1.731704,-1.774417,-1.666660,-0.538229,-0.964114,1.397414,-0.193385,-1.049078,-0.555524,-0.325875,2.928915,-0.253645,-1.216618,-0.645626
1,-1.731012,-0.029524,-0.096336,-0.864109,-1.443932,0.525991,-0.250611,-1.049078,-0.555524,-0.325875,2.928915,-0.253645,-1.216618,-0.645626
2,-1.730319,-0.552992,-0.445297,-1.363793,0.738814,-1.216855,-0.536736,-1.049078,-0.555524,-0.325875,-0.341423,-0.253645,-1.216618,-0.645626
3,-1.729626,-0.901970,-0.968738,0.569765,0.452243,-1.216855,0.436091,0.141703,-0.555524,-0.325875,-0.341423,-0.253645,-1.216618,-0.645626
4,-1.728933,-0.901970,-1.055978,-0.625130,-0.859007,1.397414,-0.536736,0.141703,-0.555524,-0.325875,-0.341423,-0.253645,-1.216618,1.549503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.728933,-1.425438,-1.492179,-0.733757,-0.214694,-1.216855,-0.021710,1.332484,-0.555524,-0.325875,-0.341423,-0.253645,0.821951,-0.645626
4996,1.729626,-1.338194,-1.404939,-1.276892,-0.525774,1.397414,-0.880087,-1.049078,0.280238,-0.325875,-0.341423,-0.253645,0.821951,-0.645626
4997,1.730319,1.540880,1.648468,-1.081363,-0.061039,-0.345432,-0.937312,1.332484,-0.555524,-0.325875,-0.341423,-0.253645,-1.216618,-0.645626
4998,1.731012,1.715370,1.735708,-0.538229,-1.469855,0.525991,-0.822862,0.141703,-0.555524,-0.325875,-0.341423,-0.253645,0.821951,-0.645626


Model training

In [10]:
# Separate the features from the target. ID is dropped as well: in the
# describe() output it simply runs from 1 to 5000, so it looks like a row
# counter rather than a predictor.
target_column = 'Personal Loan'
x = df.drop(columns=['ID', target_column])
y = df[target_column]

In [11]:
# Two-stage split: 80% for training, then the remaining 20% is halved into a
# test set and a validation set (10% of the data each).
x_train, x_temp, y_train, y_temp = train_test_split(
  x, y, test_size=0.2, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
  x_temp, y_temp, test_size=0.5, random_state=42
)

In [12]:
# Sanity check: dtype of the training target.
print(y_train.dtypes)

int64


In [13]:
# Sanity check: distinct class labels present in the training target.
print(y_train.unique())

[0 1]


In [14]:
# Decision tree classifier. random_state is fixed (same seed as the data
# splits) because tree construction involves randomness when candidate splits
# tie; without a seed the fitted tree and the reported accuracy can differ
# between runs.
model = DecisionTreeClassifier(random_state=42)

In [15]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

In [16]:
# Predict class labels for the held-out test split.
y_pred = model.predict(x_test)

In [17]:
# Inspect the raw predicted labels.
y_pred

array([0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [19]:
# Share of test rows classified correctly, printed as a percentage.
# NOTE: only about 9.6% of rows are positive (mean of Personal Loan in the
# describe() output), so always predicting 0 would already score roughly 90%.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percent = accuracy * 100
print('Accuracy:', accuracy_percent)

Accuracy: 98.8
