# PRODIGY-DS- 03

### Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. Use a dataset such as the Bank Marketing dataset from the UCI Machine Learning Repository.

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
data = pd.read_csv("bank-full.csv")
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
from sklearn.preprocessing import LabelEncoder
to_encod = LabelEncoder()

In [4]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
to_encode = ['job', 'marital', 'education', 'default','housing',
       'loan', 'contact', 'month','poutcome', 'y']
for i in to_encode:
    data[i]=to_encod.fit_transform(data[i])

In [6]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [7]:
df1 = data.copy()

## Scaling

In [8]:
feature_scale=[feature for feature in df1.columns if feature not in ['y']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(df1[feature_scale])

In [9]:
scaler.transform(df1[feature_scale])

array([[0.51948052, 0.36363636, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       [0.33766234, 0.81818182, 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.19480519, 0.18181818, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.7012987 , 0.45454545, 0.5       , ..., 0.21215596, 0.01090909,
        0.66666667],
       [0.50649351, 0.09090909, 0.5       , ..., 0.        , 0.        ,
        1.        ],
       [0.24675325, 0.18181818, 0.5       , ..., 0.21674312, 0.04      ,
        0.33333333]])

In [10]:
data1 = pd.concat([df1[['y']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(df1[feature_scale]), columns=feature_scale)],
                    axis=1)

In [11]:
data1.head()

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,0,0.519481,0.363636,0.5,0.666667,0.0,0.092259,1.0,0.0,1.0,0.133333,0.727273,0.05307,0.0,0.0,0.0,1.0
1,0,0.337662,0.818182,1.0,0.333333,0.0,0.073067,1.0,0.0,1.0,0.133333,0.727273,0.030704,0.0,0.0,0.0,1.0
2,0,0.194805,0.181818,0.5,0.333333,0.0,0.072822,1.0,1.0,1.0,0.133333,0.727273,0.015453,0.0,0.0,0.0,1.0
3,0,0.376623,0.090909,0.5,1.0,0.0,0.086476,1.0,0.0,1.0,0.133333,0.727273,0.018707,0.0,0.0,0.0,1.0
4,0,0.194805,1.0,1.0,1.0,0.0,0.072812,0.0,0.0,1.0,0.133333,0.727273,0.04026,0.0,0.0,0.0,1.0


In [12]:
data.y.value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [14]:
# Split X and Y
X = data.drop(columns = ['y'])
y = data['y']

# Balancing Dataset

In [15]:
# Since data(target variable) is not balamced,we have to balance data using smote.
from collections import Counter
from imblearn.over_sampling import SMOTE #SMOTE(synthetic minority oversampling techinque)
sm = SMOTE() # obeject creation
print("unbalanced data   :  ",Counter(y))
X_sm,y_sm = sm.fit_resample(X,y)
print("balanced data:    :",Counter(y_sm))

unbalanced data   :   Counter({0: 39922, 1: 5289})
balanced data:    : Counter({0: 39922, 1: 39922})


- Since data(target variable) is not balamced,we have to balance data using smote.

# splitting the training and testing data

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_sm,y_sm,random_state=42)

## Decision Tree with GridSearchCV

In [17]:
# Training the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
classifier_dtg=DecisionTreeClassifier(random_state=42,splitter='best')
parameters=[{'min_samples_split':[2,3,4,5],'criterion':['gini']},{'min_samples_split':[2,3,4,5],'criterion':['entropy']}]

model_griddtree=GridSearchCV(estimator=classifier_dtg, param_grid=parameters, scoring='accuracy',cv=10)
model_griddtree.fit(X_train,y_train)

In [18]:
model_griddtree.best_params_

{'criterion': 'entropy', 'min_samples_split': 3}

In [19]:
# Predicting the model
y_predict_dtree = model_griddtree.predict(X_test)

In [23]:
y_predict_dtree

array([1, 1, 1, ..., 0, 0, 0])

In [22]:
# Finding accuracy, precision, recall and confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import classification_report
print(accuracy_score(y_test,y_predict_dtree))
print(classification_report(y_test,y_predict_dtree))

0.8904363508842242
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      9949
           1       0.88      0.90      0.89     10012

    accuracy                           0.89     19961
   macro avg       0.89      0.89      0.89     19961
weighted avg       0.89      0.89      0.89     19961

