In [1]:
import pandas as pd
import time
import warnings
from apriori_algorithm import AprioriAlgorithm
from preprocessor import Streamer, preprocess

# Suppress every warning for the rest of the kernel session: no category,
# message or module filter is supplied, so the 'ignore' action matches all.
# NOTE(review): this also hides deprecation warnings raised by pandas and by
# the local modules -- consider narrowing it to the specific noisy category.
warnings.filterwarnings('ignore')

In [2]:
# Per-dataset configuration, keyed by dataset name.
#   path  -- CSV location(s); 'movielens' lists two files (ratings first,
#            then movies), the other datasets give a single path string.
#   size  -- divided by `limit` to decide how many streams are read
#            (presumably the row count of the streamed file -- TODO confirm).
#   limit -- handed to Streamer as `stream_limit`.
dataset_metadata = {
    'amazon-reviews': {
        'path': './data/amazon-reviews/all_csv_files.csv',
        'size': 233_055_326,
        'limit': 100_000,
    },
    'groceries': {
        'path': './data/groceries/Groceries_dataset.csv',
        'size': 38_766,
        'limit': 10_000,
    },
    'movielens': {
        'path': [
            './data/movielens/ratings.csv',
            './data/movielens/movies.csv',
        ],
        'size': 100_836,
        'limit': 50_000,
    },
}

In [3]:
# Run-wide settings.
# MIN_SUPPORT is handed to AprioriAlgorithm as `minsup`.
# NOTE(review): presumably an absolute transaction count applied to each
# stream separately rather than a fraction -- confirm in apriori_algorithm.
MIN_SUPPORT = 200
# VERBOSE is handed to AprioriAlgorithm as `verbose`.
VERBOSE = False
# NOTE(review): DISPLAY_ITERATION is not referenced in the visible cells --
# confirm it is still needed.
DISPLAY_ITERATION = 5000

In [4]:
# One miner instance, configured from the run-wide settings and reused for
# every dataset and stream.
# NOTE(review): assumes AprioriAlgorithm.run() keeps no state between calls --
# confirm in apriori_algorithm.
apriori = AprioriAlgorithm(minsup=MIN_SUPPORT, verbose=VERBOSE)

In [5]:
# Datasets to process, in run order; each name must be a key of
# `dataset_metadata`.
datasets = ['movielens', 'groceries', 'amazon-reviews']

In [6]:
# Rating cutoff forwarded to preprocess() for every dataset.
# NOTE(review): presumably ratings at or above this value count as a positive
# interaction -- confirm in preprocessor.preprocess.
threshold_rating = 4.0

In [7]:
# Mine frequent itemsets for each configured dataset. The source CSV is read
# in fixed-size chunks ("streams"); Apriori runs on every chunk independently
# and the de-duplicated union of the per-chunk results is displayed once the
# dataset is finished.
for dataset in datasets:

    dataset_meta = dataset_metadata[dataset]
    is_movielens = dataset == 'movielens'

    # NOTE(review): floor division -- any trailing `size % limit` rows are
    # never requested from the Streamer. Confirm that dropping the partial
    # last chunk is intended.
    num_streams = dataset_meta['size'] // dataset_meta['limit']

    # MovieLens is configured with two paths: the ratings file is streamed,
    # the movies file is loaded once and reused for every stream.
    stream_path = dataset_meta['path'][0] if is_movielens else dataset_meta['path']
    datastream = Streamer(filepath=stream_path,
                          stream_limit=dataset_meta['limit'])
    if is_movielens:
        movies_df = pd.read_csv(dataset_meta['path'][1])

    print(f'Running APRIORI on {dataset} dataset with {num_streams} sequential I/O streams\n')
    print('--'*30)

    # Per-stream result frames are collected here and concatenated once after
    # the loop; growing a DataFrame with pd.concat on every iteration re-copies
    # all earlier rows each time. If a long run is interrupted, the partial
    # results are still available in this list.
    stream_frames = []

    for stream_id in range(num_streams):

        # perf_counter() is monotonic, so the elapsed time cannot be skewed
        # by system clock adjustments. The timing covers reading,
        # preprocessing and mining of this stream.
        start_time = time.perf_counter()

        df = datastream.getCurrentStream(stream_id)

        # `movie_transactions` holds the transactions of whichever dataset is
        # being processed; the name is kept for later cells that may use it.
        if is_movielens:
            movie_transactions = preprocess(dataset_name=dataset,
                                            ratings=df,
                                            movies=movies_df,
                                            threshold_rating=threshold_rating)
        else:
            movie_transactions = preprocess(dataset_name=dataset,
                                            df=df,
                                            threshold_rating=threshold_rating)

        freq_item_sets = apriori.run(movie_transactions)
        stream_frames.append(freq_item_sets)

        time_taken = round(time.perf_counter() - start_time, 2)

        print(f'Number of frequent itemsets found: {len(freq_item_sets)}')
        print(f'Number of transactions: {len(movie_transactions)}')
        print(f'Finished data-stream #{stream_id} in {time_taken} seconds.')

        print('--'*30)

    if stream_frames:
        # Only fully identical rows are dropped (first occurrence kept), so an
        # itemset found in several streams with different support counts
        # still appears once per distinct count; counts are not summed across
        # streams. The index is reset so the displayed frame is numbered
        # contiguously.
        results = (
            pd.concat(stream_frames, ignore_index=True)
            .drop_duplicates()
            .reset_index(drop=True)
        )
    else:
        # No stream was processed (size < limit): keep the expected columns.
        results = pd.DataFrame(columns=['item_sets', 'supp_count'])

    print(f'Finished running {dataset} dataset.')
    display(results)
    print("##"*30, '\n\n')

Running APRIORI on movielens dataset with 2 sequential I/O streams

------------------------------------------------------------
Accessing data-stream #0
Number of frequent itemsets found: 2
Number of transactions: 322
Finished data-stream #0 in 0.03 seconds.
------------------------------------------------------------
Accessing data-stream #1
Number of frequent itemsets found: 2
Number of transactions: 288
Finished data-stream #1 in 0.04 seconds.
------------------------------------------------------------
Finished running movielens dataset.


Unnamed: 0,item_sets,supp_count
0,The (1994),366
1,The (1999),346
2,The (1994),306
3,The (1999),365


############################################################ 


Running APRIORI on groceries dataset with 3 sequential I/O streams

------------------------------------------------------------
Accessing data-stream #0
Number of frequent itemsets found: 15
Number of transactions: 3512
Finished data-stream #0 in 0.13 seconds.
------------------------------------------------------------
Accessing data-stream #1
Number of frequent itemsets found: 10
Number of transactions: 3664
Finished data-stream #1 in 0.08 seconds.
------------------------------------------------------------
Accessing data-stream #2
Number of frequent itemsets found: 10
Number of transactions: 3567
Finished data-stream #2 in 0.08 seconds.
------------------------------------------------------------
Finished running groceries dataset.


Unnamed: 0,item_sets,supp_count
0,beef,294
1,canned beer,299
2,chicken,291
3,citrus fruit,431
4,frankfurter,536
5,other vegetables,441
6,pip fruit,270
7,pork,367
8,rolls/buns,331
9,root vegetables,280


############################################################ 


Running APRIORI on amazon-reviews dataset with 2330 sequential I/O streams

------------------------------------------------------------
Accessing data-stream #0
Number of frequent itemsets found: 1
Number of transactions: 63050
Finished data-stream #0 in 29.68 seconds.
------------------------------------------------------------
Accessing data-stream #1
Number of frequent itemsets found: 60
Number of transactions: 76287
Finished data-stream #1 in 28.06 seconds.
------------------------------------------------------------
Accessing data-stream #2
Number of frequent itemsets found: 45
Number of transactions: 71687
Finished data-stream #2 in 14.98 seconds.
------------------------------------------------------------
Accessing data-stream #3
Number of frequent itemsets found: 1
Number of transactions: 68496
Finished data-stream #3 in 0.91 seconds.
------------------------------------------------------------
Accessing data-str