Define and test a hypothesis about how an individual's propensity to purchase a term deposit plan varies with their age.

In [1]:
# Import libraries
import pandas as pd
import altair as alt

In [2]:
# Path to the bank-full.csv dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running the notebook.
file_url = (
    r'C:\Users\Duezel\OneDrive - Isource Supply Consumer Goods Trading'
    r'\XPS 9560 Documents\PythonCodes\Chapter 3\Chapter-3\data\bank-full.csv'
)

In [3]:
# Load the CSV into a DataFrame; fields in this file are separated by ';'
df = pd.read_csv(file_url, sep=';')

In [4]:
# Preview the first five rows to check that the columns were parsed
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Keep only the rows where y == 'yes' (customers who bought),
# then count how many of those customers there are at each age
yes_y = df['y'].eq('yes')
df_sub1 = (
    df.loc[yes_y]
    .groupby('age')['y']
    .count()
    .rename('agecount')
    .reset_index()
)
df_sub1

Unnamed: 0,age,agecount
0,18,7
1,19,11
2,20,15
3,21,22
4,22,40
...,...,...
69,87,3
70,90,2
71,92,2
72,93,2


In [6]:
# Line chart with altair: age on the x-axis, agecount on the y-axis
(
    alt.Chart(df_sub1)
    .mark_line()
    .encode(x='age', y='agecount')
)

In [7]:
# Same line chart, with the line drawn in red
(
    alt.Chart(df_sub1)
    .mark_line(color='red')
    .encode(x='age', y='agecount')
)

In [8]:
# Give the y-axis a readable title instead of the raw column name
# (the line colour is also switched to salmon here)
(
    alt.Chart(df_sub1)
    .mark_line(color='salmon')
    .encode(
        x='age',
        y=alt.Y('agecount', axis=alt.Axis(title='Number of Customers')),
    )
)

In [11]:
# Total number of customers at each age: group all rows by age and
# count the y entries in every group (no filtering on the outcome)
ageTotal = (
    df.groupby('age')['y']
    .count()
    .rename('agesum')
    .reset_index()
)
ageTotal

Unnamed: 0,age,agesum
0,18,12
1,19,35
2,20,50
3,21,79
4,22,129
...,...,...
72,90,2
73,92,2
74,93,2
75,94,1


In [12]:
# Count customers for every (age, y) combination, so each age is
# split into its 'yes' and 'no' rows
ageProp = (
    df.groupby(['age', 'y'])['y']
    .count()
    .rename('ageCat')
    .reset_index()
)
ageProp

Unnamed: 0,age,y,ageCat
0,18,no,5
1,18,yes,7
2,19,no,24
3,19,yes,11
4,20,no,35
...,...,...,...
143,92,yes,2
144,93,yes,2
145,94,no,1
146,95,no,1


In [15]:
# Join the per-(age, y) counts with the per-age totals on the shared
# `age` column, so every row carries both ageCat and agesum
ageComb = ageProp.merge(ageTotal, on='age')
ageComb.head()

Unnamed: 0,age,y,ageCat,agesum
0,18,no,5,12
1,18,yes,7,12
2,19,no,24,35
3,19,yes,11,35
4,20,no,35,50


In [17]:
# Percentage of each (age, y) count relative to the total for that age
ageComb['catProp'] = ageComb['ageCat'].div(ageComb['agesum']).mul(100)
ageComb.head()

Unnamed: 0,age,y,ageCat,agesum,catProp
0,18,no,5,12,41.666667
1,18,yes,7,12,58.333333
2,19,no,24,35,68.571429
3,19,yes,11,35,31.428571
4,20,no,35,50,70.0


In [19]:
# Visualize the relationship: catProp (percentage) against age, with one
# panel per value of `y`.
# Axis titles are set explicitly so the figure can be read on its own,
# without knowing the DataFrame's column names.
alt.Chart(ageComb).mark_line(color = 'salmon').encode(
    x = alt.X('age', axis = alt.Axis(title = 'Age')),
    y = alt.Y('catProp', axis = alt.Axis(title = 'Customers in age group (%)')),
).facet(column = 'y')

We are able to get two meaningful plots showing the propensity of people to buy term deposit plans.

In the first graph (y = no), we can see that individuals aged roughly 20 to 60 are not inclined to purchase a term deposit. In the second graph (y = yes), we see the opposite: individuals aged 60 and above are more inclined to purchase a term deposit plan.