In [17]:
import os
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# Ice Cream Order Example

In [18]:
# Load the ice-cream mail-order CSV (path is relative to the notebook's
# working directory) and preview the first rows.
IceCreamOrder = pd.read_csv('IceCreamMailOrder.csv')
IceCreamOrder.head()

Unnamed: 0,Banana,Turtle,Vanilla,Chocolate,MintCC,Strawberry,Neopolitan
0,bought,no,no,no,no,no,no
1,no,bought,no,bought,no,no,bought
2,no,no,no,no,no,no,no
3,bought,bought,bought,no,bought,no,bought
4,no,no,bought,no,no,no,bought


In [19]:
# Print column names, non-null counts and dtypes of the raw frame.
IceCreamOrder.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Banana      2000 non-null   object
 1   Turtle      2000 non-null   object
 2   Vanilla     2000 non-null   object
 3   Chocolate   2000 non-null   object
 4   MintCC      2000 non-null   object
 5   Strawberry  2000 non-null   object
 6   Neopolitan  2000 non-null   object
dtypes: object(7)
memory usage: 109.5+ KB


In [20]:
# Convert the 'no' / 'bought' labels to booleans before mining itemsets.
# One mapping handles both labels in a single pass, and the explicit cast
# below means the bool dtype no longer depends on replace() silently
# downcasting object columns (behaviour that pandas 2.2 deprecates).
IceCreamOrder = IceCreamOrder.replace({'no': False, 'bought': True})
# Guard the cast: a label outside the mapping (or a missing value) would
# otherwise be coerced to True by astype(bool) without any warning.
assert IceCreamOrder.isin([True, False]).all().all(), "unexpected label in IceCreamOrder"
IceCreamOrder = IceCreamOrder.astype(bool)
IceCreamOrder.head()

Unnamed: 0,Banana,Turtle,Vanilla,Chocolate,MintCC,Strawberry,Neopolitan
0,True,False,False,False,False,False,False
1,False,True,False,True,False,False,True
2,False,False,False,False,False,False,False
3,True,True,True,False,True,False,True
4,False,False,True,False,False,False,True


In [21]:
# Mine frequent itemsets from the order table with a 0.1 minimum-support
# setting, labelling items by column name, then show the 15 best-supported.
frequent_itemsets_IceCreamOrder = apriori(
    IceCreamOrder,
    min_support=0.1,
    use_colnames=True,
)
(
    frequent_itemsets_IceCreamOrder
    .sort_values('support', ascending=False)
    .head(15)
)

Unnamed: 0,support,itemsets
2,0.431,(Vanilla)
0,0.423,(Banana)
3,0.282,(Chocolate)
6,0.276,(Neopolitan)
8,0.256,"(Banana, Vanilla)"
1,0.2475,(Turtle)
5,0.241,(Strawberry)
4,0.2145,(MintCC)
12,0.195,"(Banana, Neopolitan)"
20,0.1925,"(Vanilla, Neopolitan)"


In [22]:
# Generate association rules from the ice-cream itemsets, filtering on the
# confidence metric with a threshold of 0.7. The result is only displayed,
# not stored.
association_rules(frequent_itemsets_IceCreamOrder, metric='confidence', min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(MintCC),(Banana),0.2145,0.423,0.1515,0.706294,1.669725,0.060767,1.964548,0.510629
1,(Neopolitan),(Banana),0.276,0.423,0.195,0.706522,1.670264,0.078252,1.966074,0.554271
2,(MintCC),(Vanilla),0.2145,0.431,0.1525,0.710956,1.649549,0.06005,1.968556,0.501303
3,"(Banana, Turtle)",(Vanilla),0.165,0.431,0.129,0.781818,1.813963,0.057885,2.607917,0.53739
4,"(Vanilla, Turtle)",(Banana),0.162,0.423,0.129,0.796296,1.882497,0.060474,2.832545,0.559416
5,"(Chocolate, Banana)",(Vanilla),0.184,0.431,0.146,0.793478,1.841017,0.066696,2.755158,0.559831
6,"(Chocolate, Vanilla)",(Banana),0.1875,0.423,0.146,0.778667,1.84082,0.066687,2.606928,0.562171
7,"(Banana, MintCC)",(Vanilla),0.1515,0.431,0.1225,0.808581,1.876058,0.057204,2.972534,0.550345
8,"(Vanilla, MintCC)",(Banana),0.1525,0.423,0.1225,0.803279,1.899004,0.057993,2.933083,0.558594
9,"(Banana, Strawberry)",(Vanilla),0.1625,0.431,0.1265,0.778462,1.806175,0.056462,2.568403,0.532948


# Book In Class Activity

### Import and Raw Dataset

In [23]:
# Load the book-purchase CSV (path is relative to the notebook's working
# directory) and preview the first rows.
book = pd.read_csv('Books.csv')
book.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks
0,yes,no,yes,no,yes,yes,no
1,no,yes,yes,yes,yes,yes,yes
2,yes,yes,yes,yes,yes,yes,yes
3,no,no,no,yes,no,yes,no
4,yes,yes,no,yes,yes,yes,no


In [24]:
# Print column names, non-null counts and dtypes of the raw frame.
book.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1878 entries, 0 to 1877
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ChildBks  1878 non-null   object
 1   YouthBks  1878 non-null   object
 2   CookBks   1878 non-null   object
 3   DoItYBks  1878 non-null   object
 4   RefBks    1878 non-null   object
 5   ArtBks    1878 non-null   object
 6   GeogBks   1878 non-null   object
dtypes: object(7)
memory usage: 102.8+ KB


### Data Prep

In [25]:
# Convert the 'no' / 'yes' labels to booleans before mining itemsets.
# One mapping handles both labels in a single pass, and the explicit cast
# below means the bool dtype no longer depends on replace() silently
# downcasting object columns (behaviour that pandas 2.2 deprecates).
book = book.replace({'no': False, 'yes': True})
# Guard the cast: a label outside the mapping (or a missing value) would
# otherwise be coerced to True by astype(bool) without any warning.
assert book.isin([True, False]).all().all(), "unexpected label in book"
book = book.astype(bool)
book.head()

Unnamed: 0,ChildBks,YouthBks,CookBks,DoItYBks,RefBks,ArtBks,GeogBks
0,True,False,True,False,True,True,False
1,False,True,True,True,True,True,True
2,True,True,True,True,True,True,True
3,False,False,False,True,False,True,False
4,True,True,False,True,True,True,False


### Model Building

In [26]:
# Mine frequent itemsets from `book` with a 0.1 minimum-support setting,
# labelling items by column name; the last line displays the result.
frequent_itemsets_book = apriori(book, min_support=0.1, use_colnames=True)
frequent_itemsets_book

Unnamed: 0,support,itemsets
0,0.579872,(ChildBks)
1,0.752929,(YouthBks)
2,0.571885,(CookBks)
3,0.717785,(DoItYBks)
4,0.781150,(RefBks)
...,...,...
122,0.273163,"(GeogBks, YouthBks, ArtBks, CookBks, ChildBks,..."
123,0.312567,"(GeogBks, YouthBks, DoItYBks, ArtBks, ChildBks..."
124,0.269968,"(GeogBks, DoItYBks, ArtBks, CookBks, ChildBks,..."
125,0.313099,"(GeogBks, YouthBks, DoItYBks, ArtBks, CookBks,..."


In [35]:
# Build the rule table for the book itemsets. The confidence threshold is
# set low (0.1) here; later cells filter and sort this table.
ar_df = association_rules(
    frequent_itemsets_book,
    metric='confidence',
    min_threshold=0.1,
)
ar_df.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ChildBks),(YouthBks),0.579872,0.752929,0.495208,0.853994,1.13423,0.058605,1.692206,0.281688
1,(YouthBks),(ChildBks),0.752929,0.579872,0.495208,0.657709,1.13423,0.058605,1.227398,0.478991
2,(CookBks),(ChildBks),0.571885,0.579872,0.404686,0.707635,1.220329,0.073066,1.436998,0.42173
3,(ChildBks),(CookBks),0.579872,0.571885,0.404686,0.697888,1.220329,0.073066,1.417074,0.429748
4,(ChildBks),(DoItYBks),0.579872,0.717785,0.481363,0.830119,1.156502,0.06514,1.661256,0.3221


In [39]:
# Number of transactions behind each of the first two rules.
# Support is a fraction of all rows, so it is scaled by the actual row count
# of `book` rather than a hard-coded 1878, which would silently give wrong
# counts if the input file changed.
print("Index 0")
print(len(book) * ar_df.loc[0, 'support'])
print("\nIndex 1")
print(len(book) * ar_df.loc[1, 'support'])

Index 0
930.0

Index 1
930.0


In [48]:
# Confidence of the first two book rules, rounded to three decimals.
print("Index 0", round(ar_df.loc[0, 'confidence'], 3), sep="\n")
print("\nIndex 1", round(ar_df.loc[1, 'confidence'], 3), sep="\n")

Index 0
0.854

Index 1
0.658


In [49]:
# Support of the first two book rules, rounded to three decimals.
print("Index 0", round(ar_df.loc[0, 'support'], 3), sep="\n")
print("\nIndex 1", round(ar_df.loc[1, 'support'], 3), sep="\n")

Index 0
0.495

Index 1
0.495


In [50]:
# Lift of the first two book rules, rounded to three decimals.
print("Index 0", round(ar_df.loc[0, 'lift'], 3), sep="\n")
print("\nIndex 1", round(ar_df.loc[1, 'lift'], 3), sep="\n")

Index 0
1.134

Index 1
1.134


### ArtBks consequent only

In [54]:
# Keep only the rules whose consequent equals the set {'ArtBks'} and list
# them from highest to lowest lift.
(
    ar_df
    .loc[lambda rules: rules['consequents'] == {'ArtBks'}]
    .sort_values('lift', ascending=False)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1498,"(GeogBks, YouthBks, DoItYBks, CookBks, ChildBks)",(ArtBks),0.280618,0.758253,0.259851,0.925996,1.221223,0.047072,3.266678,0.251811
1809,"(GeogBks, YouthBks, DoItYBks, CookBks, ChildBk...",(ArtBks),0.258786,0.758253,0.239617,0.925926,1.221130,0.043391,3.263578,0.244310
774,"(CookBks, ChildBks, DoItYBks, YouthBks)",(ArtBks),0.305112,0.758253,0.281150,0.921466,1.215248,0.049798,3.078239,0.254894
1375,"(YouthBks, DoItYBks, CookBks, ChildBks, RefBks)",(ArtBks),0.280085,0.758253,0.257721,0.920152,1.213515,0.045345,3.027588,0.244401
1685,"(GeogBks, DoItYBks, CookBks, ChildBks, RefBks)",(ArtBks),0.293930,0.758253,0.269968,0.918478,1.211308,0.047095,2.965424,0.247066
...,...,...,...,...,...,...,...,...,...,...
176,"(YouthBks, RefBks)",(ArtBks),0.630990,0.758253,0.531949,0.843038,1.111816,0.053498,1.540159,0.272541
40,(GeogBks),(ArtBks),0.728435,0.758253,0.613951,0.842836,1.111550,0.061613,1.538183,0.369542
32,(DoItYBks),(ArtBks),0.717785,0.758253,0.600639,0.836795,1.103583,0.056376,1.481247,0.332584
18,(YouthBks),(ArtBks),0.752929,0.758253,0.612354,0.813296,1.072591,0.041443,1.294809,0.273921


# Statistics.com Courses

### Data import

In [55]:
# Load the course-enrolment CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
scc = pd.read_csv('StatisticsComCourses.csv')
scc.head(5)

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW
0,no,no,yes,yes,yes,yes,yes,yes
1,yes,yes,no,yes,yes,yes,yes,yes
2,yes,no,yes,no,no,yes,yes,no
3,no,yes,yes,yes,yes,yes,yes,yes
4,no,no,yes,yes,yes,yes,yes,yes


In [62]:
# Tally the distinct values in every column to check the raw labels before
# converting them; iterating the frame yields its column labels.
for col in scc:
    print(scc[col].value_counts(), '\n', sep='\n')

Intro
yes    221
no     144
Name: count, dtype: int64


DataMining
yes    300
no      65
Name: count, dtype: int64


Survey
yes    297
no      68
Name: count, dtype: int64


Cat Data
yes    289
no      76
Name: count, dtype: int64


Regression
yes    289
no      76
Name: count, dtype: int64


Forecast
yes    314
no      51
Name: count, dtype: int64


DOE
yes    302
no      63
Name: count, dtype: int64


SW
yes    284
no      81
Name: count, dtype: int64




In [63]:
# Print column names, non-null counts and dtypes of the raw frame.
scc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Intro       365 non-null    object
 1   DataMining  365 non-null    object
 2   Survey      365 non-null    object
 3   Cat Data    365 non-null    object
 4   Regression  365 non-null    object
 5   Forecast    365 non-null    object
 6   DOE         365 non-null    object
 7   SW          365 non-null    object
dtypes: object(8)
memory usage: 22.9+ KB


### Data Prep

In [64]:
# Convert the 'no' / 'yes' labels to booleans before mining itemsets.
# One mapping handles both labels in a single pass, and the explicit cast
# below means the bool dtype no longer depends on replace() silently
# downcasting object columns (behaviour that pandas 2.2 deprecates).
scc = scc.replace({'no': False, 'yes': True})
# Guard the cast: a label outside the mapping (or a missing value) would
# otherwise be coerced to True by astype(bool) without any warning.
assert scc.isin([True, False]).all().all(), "unexpected label in scc"
scc = scc.astype(bool)
scc.head()

Unnamed: 0,Intro,DataMining,Survey,Cat Data,Regression,Forecast,DOE,SW
0,False,False,True,True,True,True,True,True
1,True,True,False,True,True,True,True,True
2,True,False,True,False,False,True,True,False
3,False,True,True,True,True,True,True,True
4,False,False,True,True,True,True,True,True


In [65]:
# Re-inspect the frame after the label conversion to confirm the column dtypes.
scc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Intro       365 non-null    bool 
 1   DataMining  365 non-null    bool 
 2   Survey      365 non-null    bool 
 3   Cat Data    365 non-null    bool 
 4   Regression  365 non-null    bool 
 5   Forecast    365 non-null    bool 
 6   DOE         365 non-null    bool 
 7   SW          365 non-null    bool 
dtypes: bool(8)
memory usage: 3.0 KB


### Model Building

In [66]:
# Mine frequent itemsets from `scc` with a 0.1 minimum-support setting,
# labelling items by column name; the last line displays the result.
frequent_itemsets_scc = apriori(scc, min_support=0.1, use_colnames=True)
frequent_itemsets_scc

Unnamed: 0,support,itemsets
0,0.605479,(Intro)
1,0.821918,(DataMining)
2,0.813699,(Survey)
3,0.791781,(Cat Data)
4,0.791781,(Regression)
...,...,...
243,0.293151,"(Survey, Regression, DOE, DataMining, Forecast..."
244,0.298630,"(Regression, DOE, DataMining, Forecast, SW, Ca..."
245,0.306849,"(Survey, Regression, DOE, Forecast, SW, Cat Data)"
246,0.109589,"(Survey, DOE, Intro, DataMining, Forecast, SW,..."


In [67]:
# Build the rule table for the course itemsets. The confidence threshold is
# set low (0.1) here; later cells filter and sort this table.
scc_ar_df = association_rules(
    frequent_itemsets_scc,
    metric='confidence',
    min_threshold=0.1,
)
scc_ar_df.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Intro),(DataMining),0.605479,0.821918,0.482192,0.79638,0.968929,-0.015463,0.874581,-0.075172
1,(DataMining),(Intro),0.821918,0.605479,0.482192,0.586667,0.968929,-0.015463,0.954485,-0.152593
2,(Survey),(Intro),0.813699,0.605479,0.479452,0.589226,0.973155,-0.013226,0.960431,-0.128971
3,(Intro),(Survey),0.605479,0.813699,0.479452,0.791855,0.973155,-0.013226,0.895057,-0.065351
4,(Cat Data),(Intro),0.791781,0.605479,0.468493,0.591696,0.977235,-0.010914,0.966241,-0.100623


### Model Stats

In [70]:
# Instances: number of transactions behind each of the first two rules.
# Support is a fraction of all rows, so it is scaled by the actual row count
# of `scc` rather than a hard-coded 365, which would silently give wrong
# counts if the input file changed.
print("Index 0")
print(len(scc) * scc_ar_df.loc[0, 'support'])
print("\nIndex 1")
print(len(scc) * scc_ar_df.loc[1, 'support'])

Index 0
176.0

Index 1
176.0


In [71]:
# Confidence of the first two course rules, rounded to three decimals.
print("Index 0", round(scc_ar_df.loc[0, 'confidence'], 3), sep="\n")
print("\nIndex 1", round(scc_ar_df.loc[1, 'confidence'], 3), sep="\n")

Index 0
0.796

Index 1
0.587


In [72]:
# Support of the first two course rules, rounded to three decimals.
print("Index 0", round(scc_ar_df.loc[0, 'support'], 3), sep="\n")
print("\nIndex 1", round(scc_ar_df.loc[1, 'support'], 3), sep="\n")

Index 0
0.482

Index 1
0.482


In [73]:
# Lift of the first two course rules, rounded to three decimals.
print("Index 0", round(scc_ar_df.loc[0, 'lift'], 3), sep="\n")
print("\nIndex 1", round(scc_ar_df.loc[1, 'lift'], 3), sep="\n")

Index 0
0.969

Index 1
0.969


In [75]:
# Keep only the rules whose consequent equals the set {'DataMining'} and
# list them from highest to lowest lift.
(
    scc_ar_df
    .loc[lambda rules: rules['consequents'] == {'DataMining'}]
    .sort_values('lift', ascending=False)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
218,"(Cat Data, Forecast)",(DataMining),0.695890,0.821918,0.597260,0.858268,1.044226,0.025296,1.256469,0.139268
896,"(Cat Data, Forecast, Survey)",(DataMining),0.589041,0.821918,0.498630,0.846512,1.029922,0.014487,1.160232,0.070696
1065,"(Cat Data, Forecast, DOE)",(DataMining),0.583562,0.821918,0.493151,0.845070,1.028169,0.013511,1.149440,0.065789
1078,"(Cat Data, Forecast, SW)",(DataMining),0.558904,0.821918,0.471233,0.843137,1.025817,0.011860,1.135274,0.057056
20,(Forecast),(DataMining),0.860274,0.821918,0.723288,0.840764,1.022930,0.016213,1.118356,0.160428
...,...,...,...,...,...,...,...,...,...,...
4791,"(Survey, DOE, Intro, Forecast, SW, Cat Data)",(DataMining),0.175342,0.821918,0.109589,0.625000,0.760417,-0.034528,0.474886,-0.276442
3178,"(Survey, Regression, Intro, SW, Cat Data)",(DataMining),0.191781,0.821918,0.117808,0.614286,0.747381,-0.039820,0.461695,-0.294886
3798,"(Regression, DOE, Intro, SW, Cat Data)",(DataMining),0.191781,0.821918,0.112329,0.585714,0.712619,-0.045299,0.429854,-0.332874
3549,"(Survey, Regression, DOE, Intro, SW)",(DataMining),0.178082,0.821918,0.104110,0.584615,0.711282,-0.042259,0.428716,-0.330593
