# Import libraries

In [None]:
# Goal: find which feature combinations are strongly associated with Survived = Yes,
# e.g. {1st class, Male} -> Yes or {Female} -> Yes.
# Install the 'mlxtend' library if it is not installed already.
!pip install mlxtend



In [None]:
import mlxtend

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori,association_rules
from mlxtend.preprocessing import TransactionEncoder

In [None]:
# Load the Titanic records from "Titanic.csv" (path is relative to the working directory)
# and display the resulting DataFrame.
titanic = pd.read_csv("Titanic.csv")
titanic

Unnamed: 0,Class,Gender,Age,Survived
0,3rd,Male,Child,No
1,3rd,Male,Child,No
2,3rd,Male,Child,No
3,3rd,Male,Child,No
4,3rd,Male,Child,No
...,...,...,...,...
2196,Crew,Female,Adult,Yes
2197,Crew,Female,Adult,Yes
2198,Crew,Female,Adult,Yes
2199,Crew,Female,Adult,Yes


In [None]:
# Number of rows for each distinct value of the 'Class' column.
titanic['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
Crew,885
3rd,706
1st,325
2nd,285


In [None]:
# Number of rows for each distinct value of the 'Gender' column.
titanic['Gender'].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,1731
Female,470


In [None]:
# Number of rows for each distinct value of the 'Age' column.
titanic['Age'].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
Adult,2092
Child,109


In [None]:
# Number of rows for each distinct value of the 'Survived' column.
titanic['Survived'].value_counts()

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
No,1490
Yes,711


# Pre-Processing
The data is not yet in transaction format, so it has to be encoded first.
# We are using **TransactionEncoder**

In [None]:
# Raw cell values of the DataFrame as a 2-D NumPy array (one row per record).
array = titanic.values

In [None]:
# Learn the distinct item labels present in `array`, then encode every row as a
# boolean vector with one column per label.
encoder = TransactionEncoder()
encoder.fit(array)
onehot = encoder.transform(array)
onehot_df = pd.DataFrame(data=onehot, columns=encoder.columns_)

In [None]:
# Preview the encoded transactions.
onehot_df

Unnamed: 0,1st,2nd,3rd,Adult,Child,Crew,Female,Male,No,Yes
0,False,False,True,False,True,False,False,True,True,False
1,False,False,True,False,True,False,False,True,True,False
2,False,False,True,False,True,False,False,True,True,False
3,False,False,True,False,True,False,False,True,True,False
4,False,False,True,False,True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...
2196,False,False,False,True,False,True,True,False,False,True
2197,False,False,False,True,False,True,True,False,False,True
2198,False,False,False,True,False,True,True,False,False,True
2199,False,False,False,True,False,True,True,False,False,True


# **OR Use get_dummies() method**

In [None]:
# Alternative encoding: get_dummies() creates one indicator column per
# (column, value) pair, named "<column>_<value>".
df=pd.get_dummies(titanic)
df

Unnamed: 0,Class_1st,Class_2nd,Class_3rd,Class_Crew,Gender_Female,Gender_Male,Age_Adult,Age_Child,Survived_No,Survived_Yes
0,False,False,True,False,False,True,False,True,True,False
1,False,False,True,False,False,True,False,True,True,False
2,False,False,True,False,False,True,False,True,True,False
3,False,False,True,False,False,True,False,True,True,False
4,False,False,True,False,False,True,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...
2196,False,False,False,True,True,False,True,False,False,True
2197,False,False,False,True,True,False,True,False,False,True
2198,False,False,False,True,True,False,True,False,False,True
2199,False,False,False,True,True,False,True,False,False,True


# Apriori Algorithm

Practical steps for choosing min_support
Start with a moderate value like 0.1 (10%)

Observe number of frequent itemsets:

Too many? ➔ Increase min_support

Too few or none? ➔ Decrease min_support

Adjust based on business logic:

If a "rare event" is still valuable (e.g., buying a luxury item), use lower support.

If you want only very frequent combinations (e.g., bread + milk), use higher support.

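Rules of thumb for min_support:
Dataset size:	Suggested min_support
10k+ transactions:	0.05–0.2
100–10k transactions:	0.01–0.1
<100 transactions:	0.005–0.05

Note: an itemset that occurs in a single transaction already has support 1/N, so any min_support below 1/N (e.g. below 0.01 when N = 100) filters nothing out. On very small datasets, prefer the upper end of the suggested range.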

In [None]:
# Mine frequent itemsets from the one-hot encoded DataFrame `df`.
# min_support is a float between 0 and 1 giving the minimum support of the itemsets
# returned (library default: 0.5).
# min_support=0.1: keep only those itemsets appearing in at least 10% of transactions.
# The support is computed as the fraction transactions_where_item(s)_occur / total_transactions.
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
frequent_itemsets # output: frequent itemsets (1-item, 2-item, ...) that meet the min-support criterion

Unnamed: 0,support,itemsets
0,0.14766,(Class_1st)
1,0.129487,(Class_2nd)
2,0.320763,(Class_3rd)
3,0.40209,(Class_Crew)
4,0.213539,(Gender_Female)
5,0.786461,(Gender_Male)
6,0.950477,(Age_Adult)
7,0.676965,(Survived_No)
8,0.323035,(Survived_Yes)
9,0.144934,"(Age_Adult, Class_1st)"


In [None]:
# Derive association rules from the frequent itemsets.
# metric="lift" with min_threshold=1.2 keeps only rules whose lift is >= 1.2
# (the threshold applies to the chosen metric, not to confidence).
# num_itemsets is the number of transactions in the original data; it is taken
# from the source DataFrame instead of hard-coding the row count.
# Reference: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    num_itemsets=len(titanic),
    min_threshold=1.2,
)
# Show the 20 rules with the highest lift.
rules.sort_values('lift', ascending=False).head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
14,"(Age_Adult, Gender_Female)",(Survived_Yes),0.193094,0.323035,0.143571,0.743529,2.301699,1.0,0.081195,2.639542,0.700873,0.385366,0.621146,0.593987
17,(Survived_Yes),"(Age_Adult, Gender_Female)",0.323035,0.193094,0.143571,0.444444,2.301699,1.0,0.081195,1.452431,0.835403,0.385366,0.311499,0.593987
3,(Survived_Yes),(Gender_Female),0.323035,0.213539,0.156293,0.483826,2.265745,1.0,0.087312,1.523634,0.825219,0.410992,0.343674,0.60787
2,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,1.0,0.087312,2.525187,0.710327,0.410992,0.60399,0.60787
15,"(Age_Adult, Survived_Yes)",(Gender_Female),0.297138,0.213539,0.143571,0.48318,2.262724,1.0,0.080121,1.521732,0.793974,0.391089,0.342854,0.57776
16,(Gender_Female),"(Age_Adult, Survived_Yes)",0.213539,0.297138,0.143571,0.67234,2.262724,1.0,0.080121,2.145099,0.709577,0.391089,0.533821,0.57776
23,"(Class_Crew, Survived_No)","(Age_Adult, Gender_Male)",0.30577,0.757383,0.304407,0.995542,1.31445,1.0,0.072822,54.427079,0.344592,0.401198,0.981627,0.698731
24,"(Age_Adult, Gender_Male)","(Class_Crew, Survived_No)",0.757383,0.30577,0.304407,0.40192,1.31445,1.0,0.072822,1.160764,0.986022,0.401198,0.138498,0.698731
8,(Class_Crew),"(Age_Adult, Gender_Male)",0.40209,0.757383,0.39164,0.974011,1.286022,1.0,0.087104,9.33548,0.371976,0.510059,0.892882,0.745554
7,"(Age_Adult, Gender_Male)",(Class_Crew),0.757383,0.40209,0.39164,0.517097,1.286022,1.0,0.087104,1.238157,0.916706,0.510059,0.192348,0.745554


In [None]:
# Keep rules with lift > 1, i.e. antecedent and consequent occur together more
# often than expected if they were independent.
# NOTE(review): `rules` was generated with metric="lift" and min_threshold=1.2, so this
# filter presumably keeps every row — confirm whether a different cut-off was intended.
rules[rules.lift>1]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Class_Crew),(Gender_Male),0.40209,0.786461,0.39164,0.974011,1.238474,1.0,0.075412,8.216621,0.322047,0.491448,0.878295,0.735995
1,(Gender_Male),(Class_Crew),0.786461,0.40209,0.39164,0.497978,1.238474,1.0,0.075412,1.191004,0.90173,0.491448,0.160372,0.735995
2,(Gender_Female),(Survived_Yes),0.213539,0.323035,0.156293,0.731915,2.265745,1.0,0.087312,2.525187,0.710327,0.410992,0.60399,0.60787
3,(Survived_Yes),(Gender_Female),0.323035,0.213539,0.156293,0.483826,2.265745,1.0,0.087312,1.523634,0.825219,0.410992,0.343674,0.60787
4,"(Class_3rd, Gender_Male)",(Survived_No),0.231713,0.676965,0.191731,0.827451,1.222295,1.0,0.03487,1.872135,0.236717,0.267427,0.46585,0.555336
5,(Survived_No),"(Class_3rd, Gender_Male)",0.676965,0.231713,0.191731,0.283221,1.222295,1.0,0.03487,1.071861,0.562995,0.267427,0.067043,0.555336
6,"(Class_Crew, Age_Adult)",(Gender_Male),0.40209,0.786461,0.39164,0.974011,1.238474,1.0,0.075412,8.216621,0.322047,0.491448,0.878295,0.735995
7,"(Age_Adult, Gender_Male)",(Class_Crew),0.757383,0.40209,0.39164,0.517097,1.286022,1.0,0.087104,1.238157,0.916706,0.510059,0.192348,0.745554
8,(Class_Crew),"(Age_Adult, Gender_Male)",0.40209,0.757383,0.39164,0.974011,1.286022,1.0,0.087104,9.33548,0.371976,0.510059,0.892882,0.745554
9,(Gender_Male),"(Class_Crew, Age_Adult)",0.786461,0.40209,0.39164,0.497978,1.238474,1.0,0.075412,1.191004,0.90173,0.491448,0.160372,0.735995


# Another Example with Retail_Dataset

In [None]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Load the retail basket data and preview the first rows.
# This rebinds `df`, which previously held the Titanic dummy-encoded frame.
# NOTE(review): '/content/...' looks like a Google Colab path — adjust when running elsewhere.
df = pd.read_csv('/content/retail_dataset.csv')
df.head()

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [None]:
# (number of rows, number of columns) of the retail DataFrame.
df.shape

  and should_run_async(code)


(315, 7)

In [None]:
# Find the unique values in the column labelled '0' (the first item column).
items = (df['0'].unique())
items

  and should_run_async(code)


array(['Bread', 'Cheese', 'Meat', 'Eggs', 'Wine', 'Bagel', 'Pencil',
       'Diaper', 'Milk'], dtype=object)

In [None]:
# Unique values in the column labelled '1'. This rebinds `items`; because
# column '1' has missing entries, the result also contains NaN.
items = (df['1'].unique())
items

  and should_run_async(code)


array(['Wine', 'Cheese', 'Meat', 'Pencil', 'Bread', 'Diaper', 'Eggs', nan,
       'Bagel', 'Milk'], dtype=object)

In [None]:
# One-hot encoding
# Build the item vocabulary from every column of df. Rows with fewer items are
# padded with NaN; NaN is dropped here so it never becomes a pseudo-item column
# (which would otherwise surface as "(nan)" itemsets and rules downstream).
# Sorting makes the resulting column order reproducible between runs.
items = sorted(item for item in pd.unique(df.values.ravel()) if pd.notna(item))

encoded_vals = []
for _, row in df.iterrows():  # iterrows() yields (index, row Series) pairs
    basket = set(row.dropna())  # items actually present in this transaction
    # True when the item is in the basket, False otherwise; boolean values are
    # the input type mlxtend's apriori recommends.
    encoded_vals.append({item: item in basket for item in items})

  and should_run_async(code)


In [None]:
# Inspect the list of per-row label dictionaries.
encoded_vals

  and should_run_async(code)


[{'Milk': 0,
  nan: 0,
  'Bagel': 0,
  'Cheese': 1,
  'Meat': 1,
  'Pencil': 1,
  'Eggs': 1,
  'Wine': 1,
  'Bread': 1,
  'Diaper': 1},
 {'Eggs': 0,
  nan: 0,
  'Bagel': 0,
  'Cheese': 1,
  'Meat': 1,
  'Pencil': 1,
  'Wine': 1,
  'Bread': 1,
  'Diaper': 1,
  'Milk': 1},
 {'Bread': 0,
  'Diaper': 0,
  'Pencil': 0,
  'Bagel': 0,
  'Cheese': 1,
  nan: 1,
  'Eggs': 1,
  'Wine': 1,
  'Meat': 1,
  'Milk': 1},
 {'Bread': 0,
  'Diaper': 0,
  'Pencil': 0,
  'Bagel': 0,
  'Cheese': 1,
  nan: 1,
  'Eggs': 1,
  'Wine': 1,
  'Meat': 1,
  'Milk': 1},
 {'Cheese': 0,
  'Bagel': 0,
  'Eggs': 0,
  'Bread': 0,
  'Diaper': 0,
  'Milk': 0,
  'Meat': 1,
  'Wine': 1,
  nan: 1,
  'Pencil': 1},
 {'Meat': 0,
  'Cheese': 0,
  nan: 0,
  'Bagel': 1,
  'Pencil': 1,
  'Eggs': 1,
  'Wine': 1,
  'Bread': 1,
  'Diaper': 1,
  'Milk': 1},
 {'Meat': 0,
  'Bagel': 0,
  'Bread': 0,
  'Diaper': 0,
  'Milk': 0,
  'Cheese': 1,
  nan: 1,
  'Pencil': 1,
  'Eggs': 1,
  'Wine': 1},
 {'Meat': 0,
  'Cheese': 0,
  'Wine': 0,
  'Eggs

In [None]:
# Turn the list of per-row dictionaries into a DataFrame:
# one row per transaction, one column per dictionary key.
ohe_df = pd.DataFrame(encoded_vals)

  and should_run_async(code)


In [None]:
# Preview the encoded DataFrame.
ohe_df

  and should_run_async(code)


Unnamed: 0,Milk,NaN,Bagel,Cheese,Meat,Pencil,Eggs,Wine,Bread,Diaper
0,0,0,0,1,1,1,1,1,1,1
1,1,0,0,1,1,1,0,1,1,1
2,1,1,0,1,1,0,1,1,0,0
3,1,1,0,1,1,0,1,1,0,0
4,0,1,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
310,0,1,0,1,0,0,1,0,1,0
311,1,1,0,0,1,1,0,0,0,0
312,0,0,0,1,1,1,1,1,1,1
313,0,1,0,1,1,0,0,0,0,0


In [None]:
# Frequent itemsets that occur in at least 20% of the rows of ohe_df;
# use_colnames=True reports item names instead of column indices.
freq_items = apriori(ohe_df, min_support = 0.2, use_colnames = True)

  and should_run_async(code)


In [None]:
# Display the frequent itemsets and their support.
freq_items

  and should_run_async(code)


Unnamed: 0,support,itemsets
0,0.501587,(Milk)
1,0.869841,(nan)
2,0.425397,(Bagel)
3,0.501587,(Cheese)
4,0.47619,(Meat)
5,0.361905,(Pencil)
6,0.438095,(Eggs)
7,0.438095,(Wine)
8,0.504762,(Bread)
9,0.406349,(Diaper)


In [None]:
# Generate rules whose confidence is at least 0.6 (metric="lift" is an alternative).
# num_itemsets is the number of transactions; derive it from the encoded
# DataFrame rather than hard-coding the row count.
asso_rules = association_rules(
    freq_items,
    metric="confidence",
    num_itemsets=len(ohe_df),
    min_threshold=0.6,
)

  and should_run_async(code)


In [None]:
# Display the generated association rules.
asso_rules

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Milk),(nan),0.501587,0.869841,0.409524,0.816456,0.938626,1.0,-0.026778,0.709141,-0.115976,0.425743,-0.410157,0.643629
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,1.0,0.053172,1.270148,0.350053,0.436364,0.21269,0.607595
2,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,1.0,0.053172,1.270148,0.350053,0.436364,0.21269,0.607595
3,(Bagel),(nan),0.425397,0.869841,0.336508,0.791045,0.909413,1.0,-0.03352,0.622902,-0.147743,0.350993,-0.605388,0.588953
4,(Cheese),(nan),0.501587,0.869841,0.393651,0.78481,0.902245,1.0,-0.042651,0.604855,-0.178565,0.402597,-0.653288,0.618682
5,(Meat),(nan),0.47619,0.869841,0.368254,0.773333,0.889051,1.0,-0.045956,0.57423,-0.192405,0.376623,-0.741463,0.598345
6,(Pencil),(nan),0.361905,0.869841,0.266667,0.736842,0.8471,1.0,-0.048133,0.494603,-0.220499,0.276316,-1.021823,0.521706
7,(Eggs),(nan),0.438095,0.869841,0.336508,0.768116,0.883053,1.0,-0.044565,0.56131,-0.190735,0.346405,-0.781548,0.577489
8,(Wine),(nan),0.438095,0.869841,0.31746,0.724638,0.833069,1.0,-0.063613,0.472682,-0.262869,0.320513,-1.115589,0.544801
9,(Bread),(nan),0.504762,0.869841,0.396825,0.786164,0.903801,1.0,-0.042237,0.608683,-0.176903,0.405844,-0.64289,0.621184


In [None]:
# Rank the rules from highest to lowest confidence.
asso_rules.sort_values('confidence',ascending = False) # sorting by 'lift' works the same way

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
19,"(Meat, Milk)",(Cheese),0.244444,0.501587,0.203175,0.831169,1.657077,1.0,0.080564,2.952137,0.524816,0.374269,0.661262,0.618116
0,(Milk),(nan),0.501587,0.869841,0.409524,0.816456,0.938626,1.0,-0.026778,0.709141,-0.115976,0.425743,-0.410157,0.643629
28,"(Meat, Eggs)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,1.0,0.082116,2.616667,0.518717,0.390805,0.617834,0.619952
3,(Bagel),(nan),0.425397,0.869841,0.336508,0.791045,0.909413,1.0,-0.03352,0.622902,-0.147743,0.350993,-0.605388,0.588953
9,(Bread),(nan),0.504762,0.869841,0.396825,0.786164,0.903801,1.0,-0.042237,0.608683,-0.176903,0.405844,-0.64289,0.621184
4,(Cheese),(nan),0.501587,0.869841,0.393651,0.78481,0.902245,1.0,-0.042651,0.604855,-0.178565,0.402597,-0.653288,0.618682
10,(Diaper),(nan),0.406349,0.869841,0.31746,0.78125,0.898152,1.0,-0.035999,0.595011,-0.160381,0.331126,-0.68064,0.573107
5,(Meat),(nan),0.47619,0.869841,0.368254,0.773333,0.889051,1.0,-0.045956,0.57423,-0.192405,0.376623,-0.741463,0.598345
17,"(Cheese, Milk)",(nan),0.304762,0.869841,0.234921,0.770833,0.886177,1.0,-0.030174,0.567965,-0.155938,0.25,-0.760671,0.520453
7,(Eggs),(nan),0.438095,0.869841,0.336508,0.768116,0.883053,1.0,-0.044565,0.56131,-0.190735,0.346405,-0.781548,0.577489
