In [1]:
import time
import warnings
from pathlib import Path

import pandas as pd
from colorama import Fore

from apriori_algorithm import AprioriAlgorithm
from preprocessor import Streamer, preprocess

# Silence every warning for the rest of the session: no category, message or
# module filter is given, so this also hides deprecation warnings raised by
# pandas while the run loop executes.
warnings.filterwarnings('ignore')

In [2]:
# Per-dataset run configuration, keyed by dataset name.
#   path   : CSV file to stream (movielens gives [ratings file, movies file]).
#   size   : value divided by `limit` to obtain the number of streams to run.
#   limit  : passed to Streamer as `stream_limit`.
#   minsup : minimum-support thresholds; one Apriori pass is run per value.
dataset_metadata = {
    'amazon-reviews': dict(
        path='./data/amazon-reviews/all_csv_files.csv',
        # Deliberately halved: the original size is 233055326.
        size=233_055_326 // 2,
        limit=70_000,
        minsup=[10_000],
    ),
    'groceries': dict(
        path='./data/groceries/Groceries_dataset.csv',
        size=38_766,
        limit=10_000,
        minsup=[200, 150, 100, 50, 25, 10],
    ),
    'movielens': dict(
        path=[
            './data/movielens/ratings.csv',
            './data/movielens/movies.csv',
        ],
        size=100_836,
        limit=50_000,
        minsup=[500, 400, 300, 200, 100, 50],
    ),
}

In [3]:
# Not referenced by the visible run loop, which takes its support thresholds
# from dataset_metadata[<name>]['minsup'] instead.
MIN_SUPPORT = 100
# Forwarded to AprioriAlgorithm(verbose=...) when the miner is constructed.
VERBOSE = False
# Not referenced by the visible run loop.
# NOTE(review): presumably a progress-print interval — confirm or remove.
DISPLAY_ITERATION = 5000

In [4]:
# Datasets processed by the run loop; every name is looked up in
# dataset_metadata. Currently disabled options: 'groceries', 'movielens'.
datasets = ['amazon-reviews']

In [5]:
# Forwarded to preprocess(threshold_rating=...) for every dataset.
# NOTE(review): presumably the minimum rating for an item to be kept in a
# transaction — confirm against preprocessor.preprocess.
threshold_rating = 4.0

In [6]:
# Run Apriori over every selected dataset, one pass per minimum-support value.
#
# For each (dataset, minsup) pair the data is consumed stream by stream, the
# frequent itemsets reported for each stream are collected, and the
# de-duplicated union is written to
# ./logs/<dataset>/itemsets_df_minsup_<minsup>.csv.
for dataset in datasets:

    dataset_meta = dataset_metadata[dataset]
    is_movielens = dataset == 'movielens'

    # NOTE(review): floor division ignores a trailing partial stream
    # (size % limit rows). Confirm whether Streamer can serve a short final
    # stream before switching this to a ceiling division.
    num_streams = dataset_meta['size'] // dataset_meta['limit']

    if is_movielens:
        # movielens ships two files: ratings are streamed, while the movies
        # table is static and therefore only needs to be read once per dataset.
        stream_path = dataset_meta['path'][0]
        movies_df = pd.read_csv(dataset_meta['path'][1])
    else:
        stream_path = dataset_meta['path']

    # Create the output directory up front so a missing folder fails fast
    # rather than after the whole mining run has finished.
    log_dir = Path('./logs') / dataset
    log_dir.mkdir(parents=True, exist_ok=True)

    dataset_timer = time.time()

    for minsup in dataset_meta['minsup']:

        apriori = AprioriAlgorithm(minsup=minsup, verbose=VERBOSE)
        datastream = Streamer(filepath=stream_path,
                              stream_limit=dataset_meta['limit'])

        # Start a fresh collection for every minsup so each CSV only contains
        # itemsets mined at its own threshold. The empty seed frame fixes the
        # column schema even when no stream yields an itemset.
        stream_frames = [pd.DataFrame(columns=['item_sets', 'supp_count'])]

        print(Fore.WHITE + f'Running Apriori on {dataset} : streams = {num_streams} , minsup = {minsup}')
        print(Fore.WHITE + "##"*40)

        for stream_id in range(num_streams):

            start_time = time.time()
            # Announce the stream before working on it so progress is visible
            # while a slow stream is still being mined.
            print(Fore.WHITE + f'\tAccessing data-stream #{stream_id}')

            df = datastream.getCurrentStream(stream_id)

            if is_movielens:
                transactions = preprocess(dataset_name=dataset,
                                          ratings=df,
                                          movies=movies_df,
                                          threshold_rating=threshold_rating)
            else:
                transactions = preprocess(dataset_name=dataset,
                                          df=df,
                                          threshold_rating=threshold_rating)

            freq_item_sets = apriori.run(transactions)
            # Collect now, concatenate once after the loop: concatenating
            # inside the loop re-copies the accumulated frame on every stream.
            stream_frames.append(freq_item_sets)

            print(Fore.GREEN + f'\tNum. of transactions: {len(transactions)}')
            print(Fore.GREEN + f'\tNum. of freq itemsets: {len(freq_item_sets)}')

            time_taken = round(time.time() - start_time, 2)
            print(Fore.WHITE + f'\tFinished data-stream #{stream_id} in {time_taken} seconds.')

            print(Fore.WHITE + '--'*40)

        # A single de-duplication keeps the first occurrence of each row, which
        # matches de-duplicating after every stream.
        results = pd.concat(stream_frames, ignore_index=True).drop_duplicates()
        results.to_csv(log_dir / f'itemsets_df_minsup_{minsup}.csv', index=False)

    total_time = time.time() - dataset_timer
    print(Fore.GREEN + f'Completed {dataset} dataset in {total_time:.2f} seconds.')

    print(Fore.WHITE + "##"*40, '\n\n')

[37mRunning Apriori on amazon-reviews : streams = 1664 , minsup = 10000
[37m################################################################################
[37m	Accessing data-stream #0
[32m	Num. of transactions: 48514
[32m	Num. of freq itemsets: 0
[37m	Finished data-stream #0 in 0.58 seconds.
[37m--------------------------------------------------------------------------------
[37m	Accessing data-stream #8
[32m	Num. of transactions: 46400
[32m	Num. of freq itemsets: 1
[37m	Finished data-stream #8 in 0.56 seconds.
[37m--------------------------------------------------------------------------------
[37m	Accessing data-stream #9
[32m	Num. of transactions: 52353
[32m	Num. of freq itemsets: 1
[37m	Finished data-stream #9 in 0.64 seconds.
[37m--------------------------------------------------------------------------------
[37m	Accessing data-stream #10
[32m	Num. of transactions: 43044
[32m	Num. of freq itemsets: 1
[37m	Finished data-stream #10 in 0.57 seconds.
[37m----

KeyboardInterrupt: 