In [1]:
import os
import time
import warnings

import pandas as pd
from colorama import Fore

from apriori_algorithm import AprioriAlgorithm
from preprocessor import Streamer, preprocess

# Suppress every warning for the rest of the kernel session.
# NOTE(review): with no category/module filter this also hides deprecation
# warnings raised by pandas and other libraries — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Per-dataset run configuration, keyed by dataset name. Fields of each entry:
#   path   - CSV location; for movielens a two-element list whose first file is
#            streamed and whose second file is loaded with pd.read_csv
#   size   - divided by `limit` to get the number of streams to process
#            (presumably the CSV row count — TODO confirm)
#   limit  - passed to Streamer as `stream_limit`
#   minsup - minimum-support values to sweep; one AprioriAlgorithm per value
dataset_metadata = {
    'amazon-reviews': dict(
        path='./data/amazon-reviews/all_csv_files.csv',
        # Only half of the dataset is used; the original size is 233055326.
        size=233055326 // 2,
        limit=70000,
        minsup=[12500, 10000, 7500, 5000, 2500, 1000],
    ),
    'groceries': dict(
        path='./data/groceries/Groceries_dataset.csv',
        size=38766,
        limit=10000,
        minsup=[200, 150, 100, 50, 25, 10],
    ),
    'movielens': dict(
        path=['./data/movielens/ratings.csv', './data/movielens/movies.csv'],
        size=100836,
        limit=50000,
        minsup=[500, 400, 300, 200, 100, 50],
    ),
}

In [3]:
# Run-wide settings.
# NOTE(review): MIN_SUPPORT and DISPLAY_ITERATION are not referenced by the
# visible cells (the run loop takes its minsup values from dataset_metadata) —
# confirm they are still needed.
MIN_SUPPORT = 100
# Passed as `verbose` to AprioriAlgorithm in the run loop.
VERBOSE = False
DISPLAY_ITERATION = 5000

In [4]:
# Datasets to process in this run; each entry is used as a key into
# `dataset_metadata`. Uncomment a line to include that dataset.
datasets = [
    'groceries',
    # 'movielens',
    # 'amazon-reviews',
    ]

In [5]:
# Forwarded to preprocess() as `threshold_rating` for every dataset.
# presumably the minimum rating that counts as a positive interaction — TODO confirm
threshold_rating = 4.0

In [6]:
# Streamed Apriori sweep: for each selected dataset and each of its configured
# minsup values, mine every stream chunk, pool the frequent itemsets, and write
# them to ./logs/<dataset>/itemsets_df_minsup_<minsup>.csv.
for dataset in datasets:

    dataset_meta = dataset_metadata[dataset]
    # NOTE(review): floor division means any rows beyond num_streams * limit are
    # never requested from the streamer (e.g. 38766 // 10000 = 3 streams).
    # Confirm that dropping the tail is intended before switching to a ceiling.
    num_streams = dataset_meta['size'] // dataset_meta['limit']

    total_timer = time.time()

    # movielens stores its paths as a two-element list (streamed file first);
    # the other datasets use a single path.
    is_movielens = dataset == 'movielens'
    if is_movielens:
        stream_path = dataset_meta['path'][0]
        # Read once per dataset instead of once per minsup value.
        # assumes preprocess() does not modify `movies` in place — TODO confirm
        movies_df = pd.read_csv(dataset_meta['path'][1])
    else:
        stream_path = dataset_meta['path']

    # DataFrame.to_csv does not create missing parent directories, so make sure
    # the per-dataset log directory exists before the first write.
    output_dir = f'./logs/{dataset}'
    os.makedirs(output_dir, exist_ok=True)

    for minsup in dataset_meta['minsup']:
        results = pd.DataFrame(columns=['item_sets', 'supp_count'])
        timer2 = time.time()
        apriori = AprioriAlgorithm(minsup=minsup, verbose=VERBOSE)

        # A fresh Streamer per minsup value, as in the original flow; whether it
        # keeps state between calls is not visible from this cell.
        datastream = Streamer(filepath=stream_path,
                              stream_limit=dataset_meta['limit'])

        print(Fore.WHITE + 'Running Apriori on',
              Fore.GREEN + f'{dataset}',
              Fore.WHITE + ':',
              Fore.GREEN + f'streams = {num_streams}',
              Fore.WHITE + ',',
              Fore.GREEN + f'minsup = {minsup}')

        print(Fore.WHITE + "##"*40)

        for stream_id in range(num_streams):

            start_time = time.time()
            df = datastream.getCurrentStream(stream_id)

            if is_movielens:
                movie_transactions = preprocess(dataset_name=dataset,
                                                ratings=df,
                                                movies=movies_df,
                                                threshold_rating=threshold_rating)
            else:
                movie_transactions = preprocess(dataset_name=dataset,
                                                df=df,
                                                threshold_rating=threshold_rating)

            freq_item_sets = apriori.run(movie_transactions)

            # drop_duplicates() compares whole rows, so an itemset is only
            # collapsed when every column matches an earlier row.
            results = pd.concat([results, freq_item_sets], ignore_index=True)
            results = results.drop_duplicates()

            print(Fore.WHITE + f'\tAccessing data-stream #{stream_id}')
            print(Fore.WHITE + f'\tNum. of transactions: {len(movie_transactions)}')
            # The count is highlighted in green only when something was found.
            if len(freq_item_sets) > 0:
                print(Fore.WHITE + '\tNum. of freq itemsets:',
                      Fore.GREEN + f'{len(freq_item_sets)}')
            else:
                print(Fore.WHITE + f'\tNum. of freq itemsets: {len(freq_item_sets)}')

            print(Fore.WHITE + f'\tTotal num. of freq itemsets till now: {len(results)}')

            time_taken = round(time.time() - start_time, 3)
            print(Fore.WHITE + f'\tFinished data-stream #{stream_id} in',
                  Fore.GREEN + f'{time_taken} seconds.')

            print(Fore.WHITE + '--'*40)

        results.to_csv(f'{output_dir}/itemsets_df_minsup_{minsup}.csv', index=False)

        print(Fore.WHITE + f'Time taken for minsup {minsup} =',
              Fore.GREEN + f'{round(time.time() - timer2, 3)} seconds.\n')

    # Wall-clock time for the whole dataset, across all minsup values.
    total_time = time.time() - total_timer
    print(Fore.GREEN +
          f'Completed {dataset} dataset in {total_time:.2f} seconds.')

    print(Fore.WHITE + "##"*40, '\n\n')

[37mRunning Apriori on [32mgroceries [37m: [32mstreams = 3 [37m, [32mminsup = 200
[37m################################################################################
[37m	Accessing data-stream #0
[37m	Num. of transactions: 3512
[37m	Num. of freq itemsets: [32m15
[37m	Total num. of freq itemsets till now: 15
[37m	Finished data-stream #0 in [32m0.198 seconds.
[37m--------------------------------------------------------------------------------
[37m	Accessing data-stream #1
[37m	Num. of transactions: 3664
[37m	Num. of freq itemsets: [32m10
[37m	Total num. of freq itemsets till now: 25
[37m	Finished data-stream #1 in [32m0.125 seconds.
[37m--------------------------------------------------------------------------------
[37m	Accessing data-stream #2
[37m	Num. of transactions: 3567
[37m	Num. of freq itemsets: [32m10
[37m	Total num. of freq itemsets till now: 34
[37m	Finished data-stream #2 in [32m0.121 seconds.
[37m----------------------------------------------