Task: Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. Use a dataset such as the Bank Marketing dataset from the UCI Machine Learning Repository.

Step-1: Load the dataset into the environment

In [30]:
import pandas as pd
import numpy as np
data=pd.read_csv('bank-full.csv')

Step-2: Cleaning the dataset

In [31]:
print(data.columns)

Index(['age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'], dtype='object')


In [32]:
# we got to know that all columns are present in same column. this doesn't contribute to the algorithm. So we need to split the columns

# split columns using a delimiter
split_cols=data['age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"'].str.split(';',expand=True)

split_cols.columns = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 
                      'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 
                      'poutcome', 'y']
data=pd.concat([data.drop('age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',axis=1),split_cols],axis=1)
print(data.head())
# save the new DataFrame with separated columns
data.to_csv('data_cleaned.csv',index=False)

  age             job    marital    education default balance housing   loan  \
0  58    "management"  "married"   "tertiary"    "no"    2143   "yes"   "no"   
1  44    "technician"   "single"  "secondary"    "no"      29   "yes"   "no"   
2  33  "entrepreneur"  "married"  "secondary"    "no"       2   "yes"  "yes"   
3  47   "blue-collar"  "married"    "unknown"    "no"    1506   "yes"   "no"   
4  33       "unknown"   "single"    "unknown"    "no"       1    "no"   "no"   

     contact day  month duration campaign pdays previous   poutcome     y  
0  "unknown"   5  "may"      261        1    -1        0  "unknown"  "no"  
1  "unknown"   5  "may"      151        1    -1        0  "unknown"  "no"  
2  "unknown"   5  "may"       76        1    -1        0  "unknown"  "no"  
3  "unknown"   5  "may"       92        1    -1        0  "unknown"  "no"  
4  "unknown"   5  "may"      198        1    -1        0  "unknown"  "no"  


In [33]:
# creating a new dataframe for the cleaned dataset
df_cleaned=pd.read_csv('data_cleaned.csv')
print(df_cleaned.head())
print(df_cleaned.dtypes)

   age             job    marital    education default  balance housing  \
0   58    "management"  "married"   "tertiary"    "no"     2143   "yes"   
1   44    "technician"   "single"  "secondary"    "no"       29   "yes"   
2   33  "entrepreneur"  "married"  "secondary"    "no"        2   "yes"   
3   47   "blue-collar"  "married"    "unknown"    "no"     1506   "yes"   
4   33       "unknown"   "single"    "unknown"    "no"        1    "no"   

    loan    contact  day  month  duration  campaign  pdays  previous  \
0   "no"  "unknown"    5  "may"       261         1     -1         0   
1   "no"  "unknown"    5  "may"       151         1     -1         0   
2  "yes"  "unknown"    5  "may"        76         1     -1         0   
3   "no"  "unknown"    5  "may"        92         1     -1         0   
4   "no"  "unknown"    5  "may"       198         1     -1         0   

    poutcome     y  
0  "unknown"  "no"  
1  "unknown"  "no"  
2  "unknown"  "no"  
3  "unknown"  "no"  
4  "unknown

In [34]:
# string columns are having data with double quotes
# we need to remove them
df_cleaned=df_cleaned.apply(lambda x: x.str.replace('"','') if x.dtype=="object" else x)
print(df_cleaned.head())
df_cleaned.to_csv('data_cleaned.csv',index=False)

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [35]:
# replacing the "unknown" value with ""
df_cleaned=df_cleaned.apply(lambda x: x.str.replace('unknown','') if x.dtype=="object" else x)
# replacing the empty string with null values
df_cleaned.replace("", np.nan, inplace=True)
df_cleaned.to_csv('data_cleaned.csv',index=False)

In [37]:
# checking for null values in the dataset
print(df_cleaned.isnull().sum())


age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64


In [38]:
# we got to know that some columns have missing values and all of them are string/ object type columns
# so we replace them with either the previous or latter cell value
df_cleaned['job'] = df_cleaned['job'].fillna(method='ffill').fillna(method='bfill')
df_cleaned['education'] = df_cleaned['education'].fillna(method='ffill').fillna(method='bfill')
df_cleaned['contact'] = df_cleaned['contact'].fillna(method='ffill').fillna(method='bfill')
df_cleaned['poutcome'] = df_cleaned['poutcome'].fillna(method='ffill').fillna(method='bfill')

  df_cleaned['job'] = df_cleaned['job'].fillna(method='ffill').fillna(method='bfill')
  df_cleaned['education'] = df_cleaned['education'].fillna(method='ffill').fillna(method='bfill')
  df_cleaned['contact'] = df_cleaned['contact'].fillna(method='ffill').fillna(method='bfill')
  df_cleaned['poutcome'] = df_cleaned['poutcome'].fillna(method='ffill').fillna(method='bfill')


In [39]:
# cross checking if there are any missing values left
print(df_cleaned.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [40]:
# saving the work into the csv file previously made
df_cleaned.to_csv('data_cleaned.csv',index=False)