In [1]:
import os
import time

import findspark
findspark.init()  # must run before importing pyspark so the Spark install is on sys.path

from pyspark import SparkConf, SparkContext

from apriori_son_utils import apriori_partition, get_dataset_info, turnStr2Pair, countItemsets

In [2]:
def spark_apriori_run(dataset_name, cur_minsup, partition_num, input_file_path):
    """Run one partitioned (SON) Apriori pass on Spark and write the result to a log file.

    Every line of ``input_file_path`` is turned into a key/value pair by
    ``turnStr2Pair``; the values are grouped per key into sets ("baskets").
    ``apriori_partition`` proposes candidates for each partition, the distinct
    candidates are counted against all baskets with ``countItemsets``, and the
    ones whose count is at least ``cur_minsup`` are kept as frequent itemsets.

    Candidates and frequent itemsets are written to
    ``./logs/<dataset_name>/minsup_<cur_minsup>_partition_<partition_num>.txt``
    (the directory is created when missing).

    Args:
        dataset_name: Dataset label used in the Spark app name, the log path
            and the progress messages.
        cur_minsup: Minimum count an itemset needs to be reported as frequent;
            also forwarded to ``apriori_partition``.
        partition_num: Number of partitions requested from ``sc.parallelize``.
        input_file_path: Text file with one pair string per line.

    Returns:
        None. A ``KeyboardInterrupt`` during the run is reported on stdout and
        swallowed; any other exception propagates after the SparkContext has
        been stopped.
    """
    start = time.time()

    # Calling getOrCreate() on the class re-uses a context that is still alive
    # (e.g. left behind by an earlier failed cell) instead of raising
    # "Cannot run multiple SparkContexts at once".
    conf = SparkConf().setAppName(f'cz4042_{dataset_name}_task2')
    sc = SparkContext.getOrCreate(conf)

    # Kept on one line: an f-string field split across lines only parses on
    # Python 3.12+.
    output_file_path = (
        f'./logs/{dataset_name}/minsup_{cur_minsup}_partition_{partition_num}.txt')

    try:
        print('Start running Apriori algorithm on', dataset_name, 'with minsup =',
              cur_minsup, 'and partition_num =', partition_num, '...')

        with open(input_file_path) as f:
            pairList = [line.rstrip('\n') for line in f]

        # One set of values per key. Cached because it feeds three separate
        # actions below (count, candidate mining, collect).
        qualifiedUsersRDD = sc.parallelize(pairList, partition_num).map(
            turnStr2Pair).groupByKey().mapValues(set).values().cache()

        full_size = qualifiedUsersRDD.count()

        rawCandidates = qualifiedUsersRDD.mapPartitions(
            lambda partition: apriori_partition(partition, cur_minsup, full_size))

        # Flatten two nesting levels of the per-partition output, de-duplicate,
        # and order by (size, content). Cached because it is both collected
        # and re-used for the counting pass.
        candidatesResultRDD = rawCandidates.flatMap(lambda x: x).flatMap(
            lambda x: x).distinct().sortBy(
                lambda pairs: (len(pairs), pairs)).cache()

        candidatesResult = candidatesResultRDD.collect()

        # Broadcast the full basket list once per executor rather than
        # pickling it into the closure of every counting task.
        busBroadcast = sc.broadcast(qualifiedUsersRDD.collect())

        # Collected before the log file is opened so that a failing Spark job
        # cannot leave a half-written file behind.
        frequentItemsets = candidatesResultRDD.map(
            lambda cand: countItemsets(cand, busBroadcast.value)).filter(
                lambda itemset: itemset[1] >= cur_minsup).sortBy(
                    lambda pair: (len(pair[0]), pair[0])).map(
                        lambda pair: pair[0]).collect()

        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w') as f:
            f.write('Candidates:\n')
            for cand in candidatesResult:
                f.write(str(cand) + '\n')
            f.write('\nFrequent Itemsets:\n')
            for itemset in frequentItemsets:
                f.write(str(itemset) + '\n')

    except KeyboardInterrupt:
        # Deliberately swallowed, as before, so an interrupted cell ends
        # quietly; the context itself is stopped in the finally clause.
        print('KeyboardInterrupt. Therefore, SparkContext server stopped.')
        return

    finally:
        # Always release the context, including on errors other than
        # KeyboardInterrupt, so the next run can start cleanly.
        sc.stop()

    print(f'Output file is saved to {output_file_path}')
    print('Time taken:', round(time.time() - start, 2), 'seconds.')

In [3]:
def main(dataset_name):
    """Run the Spark Apriori job five times with a growing minimum support.

    The dataset settings come from ``get_dataset_info(dataset_name)``, which
    must provide the keys ``'path'``, ``'minsup_step'`` and
    ``'partition_num'``. The threshold starts at ``minsup_step`` and is raised
    by ``minsup_step`` before every further run; each run is delegated to
    ``spark_apriori_run``.

    Args:
        dataset_name: Name passed to ``get_dataset_info`` and to every run.
    """
    dataset_cfg = get_dataset_info(dataset_name)
    input_file_path = dataset_cfg['path']
    minsup_step = dataset_cfg['minsup_step']
    partition_num = dataset_cfg['partition_num']

    separator = '--' * 50
    cur_minsup = 0

    for _ in range(5):
        # Accumulate the threshold: step, 2 * step, ..., 5 * step.
        cur_minsup += minsup_step

        # SON for partition based Apriori algorithm
        spark_apriori_run(dataset_name, cur_minsup, partition_num, input_file_path)

        print('Finish running Apriori algorithm on', dataset_name, 'with minsup =',
              cur_minsup, 'and partition_num =', partition_num)

        print(separator, '\n')

In [4]:
%%time
# Sweep the five minsup thresholds on the 'groceries' dataset; %%time reports the cell's total run time.
main(dataset_name='groceries')

23/11/20 13:00:51 WARN Utils: Your hostname, Siddhant-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.91.3.5 instead (on interface en0)
23/11/20 13:00:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 13:00:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Start running Apriori algorithm on groceries with minsup = 50 and partition_num = 10 ...


                                                                                

Output file is saved to ./logs/groceries/minsup_50_partition_10.txt
Time taken: 16.66 seconds.
Finish running Apriori algorithm on groceries with minsup = 50 and partition_num = 10
---------------------------------------------------------------------------------------------------- 

Start running Apriori algorithm on groceries with minsup = 100 and partition_num = 10 ...


                                                                                

Output file is saved to ./logs/groceries/minsup_100_partition_10.txt
Time taken: 5.06 seconds.
Finish running Apriori algorithm on groceries with minsup = 100 and partition_num = 10
---------------------------------------------------------------------------------------------------- 

Start running Apriori algorithm on groceries with minsup = 150 and partition_num = 10 ...
Output file is saved to ./logs/groceries/minsup_150_partition_10.txt
Time taken: 3.64 seconds.
Finish running Apriori algorithm on groceries with minsup = 150 and partition_num = 10
---------------------------------------------------------------------------------------------------- 

Start running Apriori algorithm on groceries with minsup = 200 and partition_num = 10 ...
Output file is saved to ./logs/groceries/minsup_200_partition_10.txt
Time taken: 3.29 seconds.
Finish running Apriori algorithm on groceries with minsup = 200 and partition_num = 10
--------------------------------------------------------------------

In [4]:
%%time
# Sweep the five minsup thresholds on the 'movielens' dataset; %%time reports the cell's total run time.
main(dataset_name='movielens')

23/11/20 13:39:03 WARN Utils: Your hostname, Siddhant-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.91.3.5 instead (on interface en0)
23/11/20 13:39:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 13:39:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Start running Apriori algorithm on movielens with minsup = 100 and partition_num = 100 ...


ERROR:root:Exception while sending command.                       (0 + 8) / 100]
Traceback (most recent call last):
  File "/Users/siddhantpathak/anaconda3/envs/fp_spark/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=74>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/siddhantpathak/anaconda3/envs/fp_spark/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/siddhantpathak/anaconda3/envs/fp_spark/lib/python3.12/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception whi

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe

[Stage 3:>                                                        (0 + 8) / 100]