In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error ,mean_squared_error,r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the dataset into a DataFrame. The path is relative, so `bank.csv` must
# sit in the notebook's working directory.
data = pd.read_csv('bank.csv')

In [3]:
# Preview the first seven rows to get a feel for the columns and their values.
data.head(7)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
5,42,management,single,tertiary,no,0,yes,yes,unknown,5,may,562,2,-1,0,unknown,yes
6,56,management,married,tertiary,no,830,yes,yes,unknown,6,may,1201,1,-1,0,unknown,yes


In [4]:
# Inspect the row index to see how the rows are labelled.
data.index

RangeIndex(start=0, stop=11162, step=1)

In [5]:
# List each column's dtype to tell the numeric columns from the object-dtype
# ones, which need encoding before they can be fed to the model.
data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
deposit      object
dtype: object

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments on a mixed-dtype frame only the numeric columns are summarised.
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


In [7]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(11162, 17)

In [8]:
# Per-column non-null counts and dtypes plus memory usage; a quick way to spot
# columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


Encode the categorical features and split the data into training and testing sets

In [9]:
# Re-check the raw columns before the categorical ones are encoded.
data.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes


In [10]:
# One-hot encode the categorical (object-dtype) feature columns. `deposit` is
# deliberately not listed: it is the label, not a feature.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'poutcome']

# This cell rebinds `data`, so after one run the raw categorical columns no
# longer exist and a second run would raise KeyError. Encoding only the
# columns that are still present keeps the cell safe to re-run.
cols_to_encode = [col for col in categorical_cols if col in data.columns]
if cols_to_encode:
    # drop_first=True drops the first level of every encoded column, so each
    # feature with k levels becomes k-1 indicator columns.
    data = pd.get_dummies(data, columns=cols_to_encode, drop_first=True)

In [11]:
# Confirm the encoded layout: the categorical columns are replaced by
# indicator columns named `<column>_<level>`.
data.head(3)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,deposit,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,59,2343,5,1042,1,-1,0,yes,False,False,...,False,False,False,True,False,False,False,False,False,True
1,56,45,5,1467,1,-1,0,yes,False,False,...,False,False,False,True,False,False,False,False,False,True
2,41,1270,5,1389,1,-1,0,yes,False,False,...,False,False,False,True,False,False,False,False,False,True


In [12]:
# Separate the label column from the feature columns.
target = data['deposit']
inputs = data.drop(columns='deposit')

In [13]:
# Sanity check: the feature frame should no longer contain `deposit`.
inputs.head(3)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,59,2343,5,1042,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1,56,45,5,1467,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,41,1270,5,1389,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [14]:
# Sanity check: the labels are the raw `deposit` values (strings, not encoded).
target.head(3)

0    yes
1    yes
2    yes
Name: deposit, dtype: object

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(
    inputs,
    target,
    test_size=0.3,
    random_state=33,
)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Inspect the training features. The original row labels are kept, so the
# index shows which rows of the full dataset ended up in this split.
X_train

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
9585,38,151,17,50,1,-1,0,False,False,False,...,True,False,False,False,False,False,False,False,False,True
10182,53,974,26,197,2,-1,0,True,False,False,...,False,False,False,True,False,False,False,False,False,True
1701,32,4733,30,532,1,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
95,37,122,19,1622,2,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
6208,28,167,18,126,2,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10898,35,97,29,135,4,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
578,49,566,25,979,2,-1,0,True,False,False,...,True,False,False,False,False,False,False,False,False,True
5848,38,173,18,205,4,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2439,53,0,6,1452,1,98,2,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
# Inspect the held-out test features (same columns as the training frame,
# original row labels preserved).
X_test

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
276,42,105,11,1159,4,-1,0,False,False,False,...,False,True,False,False,False,False,False,False,False,True
6821,60,104,5,22,1,-1,0,True,False,False,...,False,False,False,True,False,False,False,False,False,True
8544,25,-276,4,105,1,-1,0,False,False,False,...,False,True,False,False,False,False,False,False,False,True
1868,40,0,11,657,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1056,53,1777,21,796,5,154,1,True,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,74,9480,21,211,1,-1,0,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3648,49,293,28,452,1,-1,0,False,False,False,...,False,True,False,False,False,False,False,False,False,True
2108,36,255,22,970,2,92,2,False,False,False,...,False,False,False,True,False,False,False,True,False,False
9003,24,541,5,622,4,-1,0,False,False,False,...,False,True,False,False,False,False,False,False,False,True


In [18]:
# Create a Decision Tree Classifier.
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can be broken differently on each run. Fixing
# random_state makes the fitted tree, and every metric derived from it,
# reproducible. The seed value mirrors the one used for the train/test split.
model_clf = DecisionTreeClassifier(random_state=33)

In [19]:
# Fit the tree on the training split only; the test split stays unseen so it
# can be used for evaluation.
model_clf.fit(X_train, y_train)

In [20]:
# Predict a label for every row of the held-out test features.
prediction = model_clf.predict(X_test)

In [21]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Accuracy:", accuracy)

Accuracy: 0.7891908032248433


In [22]:
# True labels of the test split, for eyeballing against `prediction`.
y_test

276     yes
6821     no
8544     no
1868    yes
1056    yes
       ... 
1586    yes
3648    yes
2108    yes
9003     no
8095     no
Name: deposit, Length: 3349, dtype: object

In [27]:
# Per-class breakdown of the test predictions: raw counts of true vs.
# predicted labels, and the precision / recall / F1 text summary.
class_report = classification_report(y_true=y_test, y_pred=prediction)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=prediction)

In [30]:
# The space after "\n" indents the first matrix row by one character so it
# lines up with the array's continuation rows, which are already indented.
print(f"Confusion Matrix:\n {conf_matrix}")
# Blank lines to visually separate the two outputs.
print("\n")
# The report is already a pre-formatted multi-line string, so print it as-is.
print(f"Classification Report:\n{class_report}")

Confusion Matrix:
 [[1387  338]
 [ 368 1256]]


Classification Report:
              precision    recall  f1-score   support

          no       0.79      0.80      0.80      1725
         yes       0.79      0.77      0.78      1624

    accuracy                           0.79      3349
   macro avg       0.79      0.79      0.79      3349
weighted avg       0.79      0.79      0.79      3349

