In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [3]:
# Load the bank marketing dataset from the Kaggle input mount and preview it.
DATA_PATH = '/kaggle/input/bankfullcsv/bank-full.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(45211, 17)

In [5]:
# Per-column dtype and non-null count; used here to confirm there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  Target     45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [7]:
# Tidy the column labels: strip every 'catg_' occurrence, then turn the
# remaining underscores into spaces. `cols` keeps the cleaned labels.
cols = df.columns.str.replace('catg_', '').str.replace('_', ' ')
df.columns = cols
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
# Build one 0/1 indicator row per distinct `balance` value:
#   1. drop `age`,
#   2. sum the numeric columns within each balance group,
#   3. flag each column as 1 when its group sum is > 0, else 0.
#
# numeric_only=True is required: without it, pandas >= 2.0 also "sums" the
# object columns (string concatenation) and the `> 0` comparison then raises
# TypeError. Older pandas silently dropped those columns, which is the
# behaviour this cell depends on.
#
# NOTE(review): `pdays` has a minimum of -1 in df.describe(), so a group's sum
# can be <= 0 or > 0 depending on how rows mix -- confirm "sum > 0" is the
# intended definition of the `pdays` item.
static_data = (
    df.drop(columns='age')
    .groupby('balance')
    .sum(numeric_only=True)
    .gt(0)               # vectorized replacement for the per-cell lambda
    .astype('int64')     # keep 0/1 integers, as the original apply() produced
    .reset_index()       # `balance` becomes the first column again
)

static_data.head(5)

Unnamed: 0,balance,day,duration,campaign,pdays,previous
0,-8019,1,1,1,0,0
1,-6847,1,1,1,0,0
2,-4057,1,1,1,0,0
3,-3372,1,1,1,0,0
4,-3313,1,1,1,0,0


In [9]:
# Mine frequent itemsets from the 0/1 indicator columns (everything after the
# leading `balance` key). min_support is a fraction of rows; 0.00002 is well
# below 1 / len(static_data) for the 7168 rows implied by the output below,
# so in practice every itemset that occurs at least once is kept.
patterns = fpgrowth(static_data.iloc[:, 1:], min_support=0.00002, use_colnames=True)

# Top 10 itemsets by support, with `support` rescaled from a fraction to an
# absolute row count. Built with .assign() on a fresh frame instead of
# assigning into a slice of `patterns`, which raises SettingWithCopyWarning.
frequent_sets = (
    patterns
    .sort_values('support', ascending=False)
    .head(10)
    .assign(support=lambda top: top['support'] * len(static_data))
)
frequent_sets



Unnamed: 0,support,itemsets
0,7168.0,(campaign)
5,7168.0,"(day, campaign)"
1,7168.0,(day)
9,7167.0,"(day, duration)"
6,7167.0,"(duration, campaign)"
15,7167.0,"(campaign, day, duration)"
2,7167.0,(duration)
18,3176.0,"(previous, duration, campaign)"
3,3176.0,(previous)
10,3176.0,"(day, previous)"


In [10]:
# Derive association rules from the mined itemsets, keeping those whose
# confidence meets the 0.1 threshold. `patterns` must still hold fractional
# supports (it is not the count-rescaled `frequent_sets`).
rules = association_rules(patterns, metric= 'confidence', min_threshold= 0.1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(day),(campaign),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
1,(campaign),(day),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
2,(duration),(campaign),0.999860,1.000000,0.999860,1.000000,1.000000,0.000000,inf
3,(campaign),(duration),1.000000,0.999860,0.999860,0.999860,1.000000,0.000000,1.000000
4,(previous),(campaign),0.443080,1.000000,0.443080,1.000000,1.000000,0.000000,inf
...,...,...,...,...,...,...,...,...,...
175,(previous),"(campaign, pdays, duration, day)",0.443080,0.442243,0.442243,0.998111,2.256927,0.246294,295.239211
176,(duration),"(campaign, pdays, previous, day)",0.999860,0.442243,0.442243,0.442305,1.000140,0.000062,1.000111
177,(pdays),"(campaign, day, previous, duration)",0.442243,0.443080,0.442243,1.000000,2.256927,0.246294,inf
178,(day),"(campaign, pdays, previous, duration)",1.000000,0.442243,0.442243,0.442243,1.000000,0.000000,1.000000
