In [1]:
import sys
import time
import timeit
from itertools import islice

import numpy as np
import pandas as pd
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
import plotly_express as px

# Portfolio-Exam
# Part 1

## Exercise 1.

*Consider the following set of items: 𝑎, 𝑏, 𝑐, 𝑑, 𝑒, 𝑔, 𝑚 in some transaction database. Assume that APRIORI is running and has already computed the set 𝐿4 using alphabetical order on the items. The resulting set is:*

L_4 = [
[a, b, c, d], [a, b, c, e], [a, b, c, m], [a, b, d, e], [a, b, c, m], 
[a, c, e, m], [a, d, g, m], [b, c, d, e], [b, c, d, g], [b, c, d, m], 
[b, c, e, g], [b, c, e, m], [b, d, e, g], [c, d, e, g], [c, d, g, m]
]

### 1.1
*Use the APRIORI candidate generation (and alphabetical order on the items) to create the set 𝐶5 of candidates for 5-element frequent itemsets. Explain your decisions to create or discard elements.*

- Candidates are created by combining all itemsets that differ only in the last element
- Itemsets not meeting this condition are discarded

C_5 = [
[a, b, c, d, e], [a, b, c, d, m], [a, b, c, e, m], [b, c, d, e, g], 
[b, c, d, e, m], [b, c, d, g, m], [b, c, e, g, m]]

_______________
### 1.2
*For how many itemsets does the database have to be scanned to yield 𝐿5 from 𝐶5?*

- 14 scans of L_4 yielded seven 5-element itemsets

_______________
_______________

## Exercise 2.
### 2.1

In [2]:
# grocery dataset: one inner list of item names per transaction
foods = [
    ['fish', 'apples', 'cider', 'dragon fruit', 'garlic', 'ice cream', 'mints', 'prunes'],
    ['apples', 'bacon', 'cider', 'fish', 'lemons', 'mints', 'oatmeal'],
    ['bacon', 'fish', 'ham', 'jam', 'oatmeal'],
    ['bacon', 'cider', 'kiwis', 'spam', 'prunes'],
    ['apples', 'fish', 'cider', 'eggs', 'lemons', 'prunes', 'mints', 'nachos'],
]

In [3]:
# fit the encoder on the grocery transactions, transform them, and wrap the
# encoded array in a DataFrame whose columns are the encoder's item names
te = TransactionEncoder()
foods_te_ary = te.fit(foods).transform(foods)
foods_df = pd.DataFrame(data=foods_te_ary, columns=te.columns_)

### 2.2

In [4]:
# loading the first 10000 transactions of T10I4D100K into variable lines
# (each transaction is the list of whitespace-separated tokens of one line)
# adapted from: https://stackoverflow.com/questions/28977415/splitting-a-list-of-numbers-from-txt-file-on-both-spaces-and-new-lines-python
# islice stops iterating after 10000 lines, so the rest of the file is never
# read or split only to be discarded afterwards
with open('T10I4D100K.dat') as file:
    lines = [line.split() for line in islice(file, 10000)]

In [5]:
# encoding the T10I4D100K transactions and creating dataframe
# (this re-fits the shared encoder `te`, so te.columns_ now refers to this dataset)
lines_te_ary = te.fit(lines).transform(lines)
lines_df = pd.DataFrame(data=lines_te_ary, columns=te.columns_)

### 2.3

In [6]:
# frequent itemsets of the grocery dataset at a minimum support of 0.6
frequent_itemsets = apriori(foods_df, min_support=0.6, use_colnames=True)

# number of items contained in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# the five rows with the highest support, highest first
frequent_itemsets.sort_values(by='support', ascending=False).iloc[:5]

Unnamed: 0,support,itemsets,length
2,0.8,(cider),1
3,0.8,(fish),1
0,0.6,(apples),1
10,0.6,"(cider, mints)",2
16,0.6,"(fish, cider, mints)",3


In [7]:
# frequent itemsets of the T10I4D100K sample at a minimum support of 0.05
# (overwrites the grocery result stored under the same name)
frequent_itemsets = apriori(lines_df, min_support=0.05, use_colnames=True)

# number of items contained in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# the five rows with the highest support, highest first
frequent_itemsets.sort_values(by='support', ascending=False).iloc[:5]

Unnamed: 0,support,itemsets,length
2,0.0823,(368),1
4,0.0703,(529),1
8,0.0629,(829),1
1,0.0613,(354),1
7,0.0574,(766),1


The highest relative support that a non-empty itemset of the grocery dataset reaches: 0.8

The highest relative support that a non-empty itemset of the T10I4D100K dataset reaches: 0.0823
_______________
_______________

## Exercise 3.
### 3.1

In [8]:
# timing APRIORI on the grocery dataset, using a support threshold of 0.4
# (no -n/-r given, so %timeit picks the loop and run counts itself and prints
# the mean ± std. dev. per loop)
%timeit apriori(foods_df, min_support=0.4, use_colnames=True)

1.68 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
# frequent itemsets of the grocery dataset at a support threshold of 0.4
foods_fis = apriori(foods_df, min_support=0.4, use_colnames=True)
# how many frequent itemsets were found (one row per itemset)
len(foods_fis.index)

54

- running APRIORI on the grocery dataset with a support threshold of 0.4 took 1.68 milliseconds on average
- using 0.4 as the support threshold yielded 54 frequent itemsets
_______________

### 3.2

In [10]:
# timing APRIORI on the T10I4D100K sample, using a support threshold of 0.004
# (no -n/-r given, so %timeit picks the loop and run counts itself and prints
# the mean ± std. dev. per loop)
%timeit apriori(lines_df, min_support=0.004, use_colnames=True)

16.1 s ± 43.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# frequent itemsets of the T10I4D100K sample at a support threshold of 0.004
lines_fis = apriori(lines_df, min_support=0.004, use_colnames=True)
# how many frequent itemsets were found (one row per itemset)
len(lines_fis.index)

2185

- running APRIORI on the T10I4D100K dataset with a support threshold of 0.004 took 16.1 seconds on average
- using 0.004 as the support threshold yielded 2185 frequent itemsets
_______________

### 3.3

In [12]:
%%timeit -r 10 -n 10
apriori(foods_df, min_support=0.4, use_colnames=True)
# -r 10 -n 10: 10 runs (repeats) of 10 loops each; the reported value is the
# mean ± std. dev. per loop over those 10 runs

1.71 ms ± 197 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


- 10 runs of 10 loops each of APRIORI on the grocery dataset took 1.71 milliseconds per loop on average with a standard deviation of 197 microseconds
_______________

In [13]:
%%timeit -r 10 -n 1
apriori(lines_df, min_support=0.004, use_colnames=True)
# -r 10 -n 1: 10 runs (repeats) of 1 loop each; the reported value is the
# mean ± std. dev. per loop over those 10 runs

16.5 s ± 231 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


- 10 runs of 1 loop each of APRIORI on the T10I4D100K dataset took 16.5 seconds per loop on average with a standard deviation of 231 milliseconds
_______________

### 3.4
*Discuss limitations of the above approaches (the methods for measuring time that have been applied in 3.1-3.3)!*

Using the %timeit line magic we can only measure the runtime of a single-line statement (though multiple statements can be chained using semicolons), while the %%timeit cell magic allows us to measure the runtime of a whole cell; both, however, provide only very basic timing functionality.

Computationally simple statements are executed so fast, that it is difficult to measure the runtime exactly. To get the most precise result, the statement is executed in a loop (N parameter), which is then repeated several times (R parameter).

The results of the measurements will also vary depending on whether the machine is busy with other tasks or left idle during execution; this leads to different results on every execution.

Magic line commands are intended as a solution for timing little bits of code quickly and easily but don't lend themselves to more complex constructs because, by default, the measurements are only printed and not stored for later use in tables or plots (they have to be captured explicitly, e.g. with the `-o` option).

_______________
_______________

## Exercise 4.
### 4.1

In [14]:
# grocery non-sparse data structure, APRIORI and 0.2, 0.4, 0.6, 0.8

# support thresholds to loop over
threshholds = [0.2, 0.4, 0.6, 0.8]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['grocery']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own foods_df (globals=globals())
    # instead of a copy rebuilt from a duplicated setup string; timeit executes
    # setup code again before every repeat, so that rebuild was pure overhead.
    timer = timeit.Timer(stmt=f'apriori(foods_df, min_support={threshhold})', globals=globals())
    # 10 repeats of 10 executions each; each repeat yields the total seconds for
    # its 10 executions and the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 10)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(foods_df, min_support=threshhold)))

# creating the results table
grocery = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [15]:
# grocery sparse data structure, APRIORI and 0.2, 0.4, 0.6, 0.8

# support thresholds to loop over
threshholds = [0.2, 0.4, 0.6, 0.8]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['sparse']*len(threshholds)
dataset = ['grocery']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# sparse encoding of the grocery transactions; the same frame is used for the
# timing and for the size measurement below
foods_s = te.fit_transform(foods, sparse=True)
foods_df_s = pd.DataFrame.sparse.from_spmatrix(foods_s, columns=te.columns_)

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the foods_df_s built above (globals=globals())
    # instead of a copy rebuilt from a duplicated setup string; timeit executes
    # setup code again before every repeat, so that rebuild was pure overhead.
    timer = timeit.Timer(stmt=f'apriori(foods_df_s, min_support={threshhold})', globals=globals())
    # 10 repeats of 10 executions each; each repeat yields the total seconds for
    # its 10 executions and the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 10)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(foods_df_s, min_support=threshhold)))

# creating the results table
grocery_s = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [16]:
# T10I4D100K non-sparse data structure, APRIORI and 0.002,0.004,0.008,0.016,0.032,0.064

# support thresholds to loop over
threshholds = [0.002,0.004,0.008,0.016,0.032,0.064]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['T10I4D100K']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own lines_df (globals=globals()).
    # The former setup string re-read and re-encoded the data file before every
    # one of the 10 repeats of every threshold, because timeit executes setup
    # code once per repeat.
    timer = timeit.Timer(stmt=f'apriori(lines_df, min_support={threshhold})', globals=globals())
    # 10 repeats of a single execution each; the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 1)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(lines_df, min_support=threshhold)))

# creating the results table
t10i4d100k = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [17]:
# T10I4D100K sparse data structure, APRIORI and 0.002,0.004,0.008,0.016,0.032,0.064

# support thresholds to loop over
threshholds = [0.002,0.004,0.008,0.016,0.032,0.064]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['sparse']*len(threshholds)
dataset = ['T10I4D100K']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# sparse encoding of the T10I4D100K transactions; the same frame is used for the
# timing and for the size measurement below
lines_s = te.fit_transform(lines, sparse=True)
lines_df_s = pd.DataFrame.sparse.from_spmatrix(lines_s, columns=te.columns_)

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the lines_df_s built above (globals=globals()).
    # The former setup string re-read and re-encoded the data file before every
    # one of the 10 repeats of every threshold, because timeit executes setup
    # code once per repeat.
    timer = timeit.Timer(stmt=f'apriori(lines_df_s, min_support={threshhold})', globals=globals())
    # 10 repeats of a single execution each; the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 1)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(lines_df_s, min_support=threshhold)))

# creating the results table
t10i4d100k_s = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

### 4.2

In [18]:
# grocery dataset: non-sparse results stacked on top of the sparse results
foods_res = pd.concat([grocery, grocery_s])

# T10I4D100K dataset: non-sparse results stacked on top of the sparse results
t10i4d100k_res = pd.concat([t10i4d100k, t10i4d100k_s])

In [19]:
# displaying the combined non-sparse and sparse APRIORI results of the grocery dataset
foods_res

Unnamed: 0,dataset,algorithm,type,threshhold,runtime,mem_size
0,grocery,APRIORI,non_sparse,0.2,0.037317,245224
1,grocery,APRIORI,non_sparse,0.4,0.014228,12832
2,grocery,APRIORI,non_sparse,0.6,0.011231,4032
3,grocery,APRIORI,non_sparse,0.8,0.004636,576
0,grocery,APRIORI,sparse,0.2,0.053366,245224
1,grocery,APRIORI,sparse,0.4,0.025117,12832
2,grocery,APRIORI,sparse,0.6,0.017651,4032
3,grocery,APRIORI,sparse,0.8,0.007767,576


In [20]:
# size in bytes, as reported by sys.getsizeof, of the non-sparse grocery DataFrame
sys.getsizeof(foods_df)

229

In [21]:
# size in bytes, as reported by sys.getsizeof, of the sparse grocery DataFrame
sys.getsizeof(foods_df_s)

309

In [22]:
# displaying the combined non-sparse and sparse APRIORI results of the T10I4D100K dataset
t10i4d100k_res

Unnamed: 0,dataset,algorithm,type,threshhold,runtime,mem_size
0,T10I4D100K,APRIORI,non_sparse,0.002,143.66645,4123984
1,T10I4D100K,APRIORI,non_sparse,0.004,15.358248,508968
2,T10I4D100K,APRIORI,non_sparse,0.008,3.321504,108872
3,T10I4D100K,APRIORI,non_sparse,0.016,0.800877,46368
4,T10I4D100K,APRIORI,non_sparse,0.032,0.047374,10728
5,T10I4D100K,APRIORI,non_sparse,0.064,0.007844,576
0,T10I4D100K,APRIORI,sparse,0.002,175.409166,4123984
1,T10I4D100K,APRIORI,sparse,0.004,15.156119,508968
2,T10I4D100K,APRIORI,sparse,0.008,3.267764,108872
3,T10I4D100K,APRIORI,sparse,0.016,0.79292,46368


In [23]:
# size in bytes, as reported by sys.getsizeof, of the non-sparse T10I4D100K DataFrame
sys.getsizeof(lines_df)

8660144

In [24]:
# size in bytes, as reported by sys.getsizeof, of the sparse T10I4D100K DataFrame
sys.getsizeof(lines_df_s)

502894

### 4.3
*Interpret the results – state observations, limitations, advice regarding the use of sparse representations!*

When running APRIORI, a lower threshold results in a higher runtime.

In case of the grocery dataset using the sparse data representation takes more time than the non-sparse data representation, while memory consumption stays the same.

In case of the T10I4D100K dataset the runtime of the sparse data representation at the threshold of 0.002 is about 22% higher than that of the non-sparse data representation. However, at any threshold larger than 0.002 the runtimes of the sparse and non-sparse data representations are practically the same. Again, the form of data representation has no impact on the memory of the execution.

Transforming the smaller grocery dataset into a sparse data representation actually increases memory consumption by around 35% (from 229 to 309 bytes), while transforming a large dataset like T10I4D100K into a sparse data representation reduces its memory consumption by around 94% (from 8,660,144 to 502,894 bytes).

While the form of data representation has no impact on the memory requirements of the executions, the sparse data representations have higher runtimes. This suggests additional computations are required by APRIORI when working with sparse data. 

This becomes especially apparent at the threshold of 0.002 on the T10I4D100K dataset.
_______________

### 4.4
*Explain, why only the minimum runtime is reported (in contrast to other options,like the average runtime).*

The minimum runtime is reported because we want to know the best achievable performance (fastest runtime) for a given code setup.

Since the machine is running multiple processes in parallel at any given time, the availability of computing resources for the execution of the statement varies from moment to moment, which will affect runtimes and the results of the measurements.

The minimum measurement tells us the lowest time that can be achieved given the optimal system state.


_______________
_______________

## Exercise 5.
### 5.1

In [25]:
# grocery non-sparse data structure, APRIORI and 0.2, 0.4, 0.6, 0.8
# NOTE(review): this takes the same measurement as the cell that builds
# `grocery` in 4.1 — consider reusing that table instead of re-measuring.

# support thresholds to loop over
threshholds = [0.2, 0.4, 0.6, 0.8]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['grocery']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own foods_df (globals=globals())
    # instead of a copy rebuilt from a duplicated setup string; timeit executes
    # setup code again before every repeat, so that rebuild was pure overhead.
    timer = timeit.Timer(stmt=f'apriori(foods_df, min_support={threshhold})', globals=globals())
    # 10 repeats of 10 executions each; each repeat yields the total seconds for
    # its 10 executions and the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 10)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(foods_df, min_support=threshhold)))

# creating the results table
foods_apriori = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [26]:
# T10I4D100K non-sparse data structure, APRIORI and 0.002,0.004,0.008,0.016,0.032,0.064
# NOTE(review): this takes the same measurement as the cell that builds
# `t10i4d100k` in 4.1 — consider reusing that table instead of re-measuring.

# support thresholds to loop over
threshholds = [0.002,0.004,0.008,0.016,0.032,0.064]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['T10I4D100K']*len(threshholds)
algorithm = ['APRIORI']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own lines_df (globals=globals()).
    # The former setup string re-read and re-encoded the data file before every
    # one of the 10 repeats of every threshold, because timeit executes setup
    # code once per repeat.
    timer = timeit.Timer(stmt=f'apriori(lines_df, min_support={threshhold})', globals=globals())
    # 10 repeats of a single execution each; the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 1)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(apriori(lines_df, min_support=threshhold)))

# creating the results table
t10i4d100k_apriori = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [27]:
# grocery non-sparse data structure, FP-Growth and 0.2, 0.4, 0.6, 0.8

# support thresholds to loop over
threshholds = [0.2, 0.4, 0.6, 0.8]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['grocery']*len(threshholds)
algorithm = ['FP-Growth']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own foods_df (globals=globals())
    # instead of a copy rebuilt from a duplicated setup string; timeit executes
    # setup code again before every repeat, so that rebuild was pure overhead.
    timer = timeit.Timer(stmt=f'fpgrowth(foods_df, min_support={threshhold})', globals=globals())
    # 10 repeats of 10 executions each; each repeat yields the total seconds for
    # its 10 executions and the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 10)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(fpgrowth(foods_df, min_support=threshhold)))

# creating the results table
foods_fpgrowth = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

In [28]:
# T10I4D100K non-sparse data structure, FP-Growth and 0.002,0.004,0.008,0.016,0.032,0.064

# support thresholds to loop over
threshholds = [0.002,0.004,0.008,0.016,0.032,0.064]

# constant label columns (one entry per threshold) and empty lists that collect
# the measurements for the results table
types = ['non_sparse']*len(threshholds)
dataset = ['T10I4D100K']*len(threshholds)
algorithm = ['FP-Growth']*len(threshholds)
runtimes = []
mem_size = []

# looping through threshholds
for threshhold in threshholds:
    # The statement runs against the notebook's own lines_df (globals=globals()).
    # The former setup string re-read and re-encoded the data file before every
    # one of the 10 repeats of every threshold, because timeit executes setup
    # code once per repeat.
    timer = timeit.Timer(stmt=f'fpgrowth(lines_df, min_support={threshhold})', globals=globals())
    # 10 repeats of a single execution each; the fastest repeat is kept
    runtimes.append(min(timer.repeat(10, 1)))
    # sys.getsizeof of the returned frequent-itemset DataFrame: this is the size
    # of the result, not the memory used while the algorithm runs
    mem_size.append(sys.getsizeof(fpgrowth(lines_df, min_support=threshhold)))

# creating the results table
t10i4d100k_fpgrowth = pd.DataFrame({
    'dataset': dataset,
    'algorithm': algorithm,
    'type': types,
    'threshhold': threshholds,
    'runtime': runtimes,
    'mem_size': mem_size}
    )

### 5.2

In [29]:
# grocery dataset: APRIORI results stacked on top of the FP-Growth results
foods_res = pd.concat([foods_apriori, foods_fpgrowth])

# T10I4D100K dataset: APRIORI results stacked on top of the FP-Growth results
t10i4d100k_res = pd.concat([t10i4d100k_apriori, t10i4d100k_fpgrowth])

In [30]:
# displaying the combined APRIORI and FP-Growth results of the grocery dataset
foods_res

Unnamed: 0,dataset,algorithm,type,threshhold,runtime,mem_size
0,grocery,APRIORI,non_sparse,0.2,0.037286,245224
1,grocery,APRIORI,non_sparse,0.4,0.014199,12832
2,grocery,APRIORI,non_sparse,0.6,0.011173,4032
3,grocery,APRIORI,non_sparse,0.8,0.004597,576
0,grocery,FP-Growth,non_sparse,0.2,0.011177,245224
1,grocery,FP-Growth,non_sparse,0.4,0.003969,12832
2,grocery,FP-Growth,non_sparse,0.6,0.003317,4032
3,grocery,FP-Growth,non_sparse,0.8,0.002742,576


In [31]:
# displaying the combined APRIORI and FP-Growth results of the T10I4D100K dataset
t10i4d100k_res

Unnamed: 0,dataset,algorithm,type,threshhold,runtime,mem_size
0,T10I4D100K,APRIORI,non_sparse,0.002,143.892698,4123984
1,T10I4D100K,APRIORI,non_sparse,0.004,14.847923,508968
2,T10I4D100K,APRIORI,non_sparse,0.008,3.270451,108872
3,T10I4D100K,APRIORI,non_sparse,0.016,0.780645,46368
4,T10I4D100K,APRIORI,non_sparse,0.032,0.0475,10728
5,T10I4D100K,APRIORI,non_sparse,0.064,0.00788,576
0,T10I4D100K,FP-Growth,non_sparse,0.002,0.362512,4123984
1,T10I4D100K,FP-Growth,non_sparse,0.004,0.27245,508968
2,T10I4D100K,FP-Growth,non_sparse,0.008,0.206002,108872
3,T10I4D100K,FP-Growth,non_sparse,0.016,0.122046,46368


### 5.3

In [32]:
# combining the APRIORI and FP-Growth results per dataset for plotting
foods_res = pd.concat([foods_apriori, foods_fpgrowth])
t10i4d100k_res = pd.concat([t10i4d100k_apriori, t10i4d100k_fpgrowth])

In [33]:
# silence DeprecationWarnings so that plotly does not print them below the plots
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [34]:
# line chart for the grocery dataset: runtime over support threshold, one line per algorithm
px.line(
    foods_res,
    x='threshhold',
    y='runtime',
    color='algorithm',
    title='Grocery Dataset',
    labels=dict(threshhold='Threshhold', runtime='Runtime in Seconds', algorithm='Algorithm'),
    width=800,
)

In [35]:
# line chart for the T10I4D100K dataset: runtime over support threshold on
# log-log axes, one line per algorithm
px.line(
    t10i4d100k_res,
    x='threshhold',
    y='runtime',
    color='algorithm',
    log_x=True,
    log_y=True,
    title='T10I4D100K Dataset',
    labels=dict(threshhold='Threshhold', runtime='Runtime in Seconds', algorithm='Algorithm'),
    width=800,
)

### 5.4
*Interpret the results – state observations, limitations, advice regarding the use of the two algorithms!*

Examining execution speeds, we find the runtimes of FP-Growth are only fractions of those of APRIORI.

The increase in runtime as the threshold decreases affects APRIORI more than FP-Growth.

There is, however, an exception on the T10I4D100K dataset at the threshold of 0.032, where the runtimes are effectively the same, and at 0.064, where APRIORI outperforms FP-Growth.

FP-Growth generally performs faster but APRIORI seems preferable when working with large datasets and high thresholds. This cross-over point will be scenario dependent.
_______________
_______________