### Task 1: Simulated small data (8.5 pts)

In [1]:
import json
import time
import math
from itertools import combinations
from pyspark import SparkContext
from operator import add

Two CSV files (small1.csv and small2.csv) are provided in your Vocareum workspace. small1.csv is just a sample file that you can use to debug your code. For Task 1, we will test your code on small2.csv for grading.

In this task, you need to build two kinds of market-basket models.

In [None]:
# params = sys.argv
# case_number, support = int(params[1]), int(params[2])
# input_file_path, output_file_path = params[3], params[4]

In [2]:
# Run configuration for interactive use (stand-in for the commented-out sys.argv parsing).
# `time_start` and `output_file_path` are read by the final cell that writes the report
# and prints the duration, so they must be defined for a top-to-bottom run to succeed.
time_start = time.time()
case_number, support = 1, 4
input_file_path = './small1.csv'
output_file_path = './task1_output.txt'  # placeholder destination; adjust as needed

In [3]:
# Obtain the SparkContext used by every RDD operation below.
sc = SparkContext.getOrCreate()

In [4]:
# Read the input file line by line and split each line on commas into a list of fields.
small = sc.textFile(input_file_path).map(lambda l: l.split(","))

#### Case 1 (4.25 pts)

You will calculate the combinations of frequent businesses (as singletons, pairs, triples, etc.) that qualify as “frequent” given a support threshold. You need to create a basket for each user containing the business ids reviewed by this user. If a business was reviewed more than once by the same user, we consider the business to have been reviewed only once. More specifically, the business ids within each basket are unique. The generated baskets are similar to:

user1: [business11, business12, business13, ...]

user2: [business21, business22, business23, ...]

user3: [business31, business32, business33, ...]

In [5]:
# The first row of the file is the column header; remember it so it can be dropped.
header = small.first()
if case_number == 1:
    # Case 1: key each record by its first field, with the second field as the value.
    body_rows = small.filter(lambda row: row != header)
    data = body_rows.map(lambda row: (row[0], row[1]))

#### Case 2 (4.25 pts)

You will calculate the combinations of frequent users (as singletons, pairs, triples, etc.) that are qualified as “frequent” given a support threshold. You need to create a basket for each business containing the user ids that commented on this business. Similar to case 1, the user ids within each basket are unique. The generated baskets are similar to:

business1: [user11, user12, user13, ...] 

business2: [user21, user22, user23, ...] 

business3: [user31, user32, user33, ...]

In [None]:
if case_number == 2:
    # Case 2: swap the columns so the second field is the key and the first is the value.
    body_rows = small.filter(lambda row: row != header)
    data = body_rows.map(lambda row: (row[1], row[0]))

In [6]:
# Peek at the first two (key, value) pairs.
data.take(2)

[('1', '100'), ('1', '98')]

In [20]:
# Build one basket per key: the de-duplicated, sorted list of values seen for that key.
grouped = data.groupByKey()
participants = grouped.map(lambda key_values: sorted(set(key_values[1])))
participants.take(5)

[['100', '101', '102', '98'],
 ['101', '102', '103', '97', '99'],
 ['102', '103', '104', '97', '98', '99'],
 ['97', '98', '99'],
 ['97', '98']]

In [8]:
# Total number of baskets; find_candidates uses it to scale the support threshold per partition.
len_data = participants.count()
len_data

19

In [16]:
def counts_singleton(partitions):
    """Count how many times each item occurs across all baskets.

    Args:
        partitions: iterable of baskets, each an iterable of hashable items.

    Returns:
        dict mapping item -> number of occurrences.
    """
    tally = {}
    for basket in partitions:
        for item in basket:
            tally[item] = tally.get(item, 0) + 1
    return tally

def frequent_items(counts, threshold):
    """Return the sorted keys of `counts` whose count is at least `threshold`.

    Args:
        counts: dict mapping item (or itemset tuple) -> count.
        threshold: minimum count for a key to be kept.

    Returns:
        Sorted list of the keys that meet the threshold.
    """
    return sorted(key for key, count in counts.items() if count >= threshold)

def counts_items(partitions, freq_items, n_items):
    """Count every `n_items`-sized combination of allowed items within each basket.

    Args:
        partitions: iterable of baskets, each an iterable of hashable, sortable items.
        freq_items: items allowed to take part in combinations; all others are ignored.
        n_items: size of the combinations to count.

    Returns:
        dict mapping a sorted tuple of `n_items` items -> number of baskets containing it.
    """
    # Loop-invariant: build the lookup set once instead of once per basket.
    allowed = set(freq_items)
    counts = {}
    for pars in partitions:
        # Sorting makes each combination a canonical (ordered) tuple key.
        kept = sorted(allowed.intersection(pars))
        for combo in combinations(kept, n_items):
            counts[combo] = counts.get(combo, 0) + 1
    return counts

In [33]:
def find_candidates(participants, support, len_data):
    """Find the itemsets that are frequent within one partition (SON phase 1).

    Args:
        participants: iterable of baskets belonging to this partition.
        support: global support threshold.
        len_data: total number of baskets across all partitions.

    Returns:
        List of lists: first the frequent singletons as 1-tuples, then one list of
        frequent tuples per larger itemset size. The last list is empty because it
        is the result that ends the search.
    """
    candidates = []
    partitions = list(participants)
    # Scale the global support to this partition's share of the data. With an empty
    # dataset (len_data == 0) the division would raise ZeroDivisionError, so fall
    # back to 0; there are no baskets to count in that case anyway.
    ps = math.ceil(support * len(partitions) / len_data) if len_data else 0

    count_singleton = counts_singleton(partitions)
    freq_singleton = frequent_items(count_singleton, ps)
    candidates.append([(i,) for i in freq_singleton])

    n_items = 2
    # [None] is a truthy placeholder so the loop body runs at least once.
    freq_n_items, freq_n0_items = [None], freq_singleton
    while freq_n_items:
        count_items = counts_items(partitions, freq_n0_items, n_items)
        freq_n_items = frequent_items(count_items, ps)
        candidates.append(freq_n_items)
        # Only items that appear in some frequent n-itemset are used to build (n+1)-itemsets.
        freq_n0_items = set().union(*freq_n_items)
        n_items += 1

    return candidates

In [34]:
# Run the local candidate search on every partition and show the raw result:
# for each partition, one list of itemsets per itemset size.
candidates = participants.mapPartitions(lambda b: find_candidates(b, support, len_data))
candidates.collect()
# candidates

[[('100',), ('101',), ('102',), ('97',), ('98',), ('99',)],
 [('100', '101'),
  ('100', '98'),
  ('101', '102'),
  ('101', '98'),
  ('102', '97'),
  ('102', '98'),
  ('97', '98'),
  ('97', '99'),
  ('98', '99')],
 [('100', '101', '98')],
 [],
 [('100',), ('101',), ('102',), ('103',), ('105',), ('97',), ('98',), ('99',)],
 [('100', '101'),
  ('100', '99'),
  ('101', '102'),
  ('101', '97'),
  ('101', '99'),
  ('102', '103'),
  ('102', '105'),
  ('102', '98'),
  ('102', '99'),
  ('103', '105'),
  ('103', '98'),
  ('103', '99'),
  ('105', '98'),
  ('105', '99'),
  ('97', '98'),
  ('97', '99'),
  ('98', '99')],
 [('100', '101', '99'),
  ('101', '97', '99'),
  ('102', '103', '105'),
  ('102', '103', '98'),
  ('102', '103', '99'),
  ('102', '105', '98'),
  ('102', '105', '99'),
  ('102', '98', '99'),
  ('103', '105', '98'),
  ('103', '105', '99'),
  ('103', '98', '99'),
  ('105', '98', '99'),
  ('97', '98', '99')],
 [('102', '103', '105', '98'),
  ('102', '103', '105', '99'),
  ('102', '103'

In [42]:
# SON phase 1: gather the locally frequent itemsets from every partition, flatten the
# per-size lists, de-duplicate, and order by itemset size and then by the tuple itself.
local_candidates = participants.mapPartitions(
    lambda baskets: find_candidates(baskets, support, len_data)
)
candidates = (
    local_candidates
    .flatMap(lambda itemsets: itemsets)
    .distinct()
    .sortBy(lambda itemset: (len(itemset), itemset))
    .collect()
)
candidates

[('100',),
 ('101',),
 ('102',),
 ('103',),
 ('105',),
 ('97',),
 ('98',),
 ('99',),
 ('100', '101'),
 ('100', '98'),
 ('100', '99'),
 ('101', '102'),
 ('101', '97'),
 ('101', '98'),
 ('101', '99'),
 ('102', '103'),
 ('102', '105'),
 ('102', '97'),
 ('102', '98'),
 ('102', '99'),
 ('103', '105'),
 ('103', '98'),
 ('103', '99'),
 ('105', '98'),
 ('105', '99'),
 ('97', '98'),
 ('97', '99'),
 ('98', '99'),
 ('100', '101', '98'),
 ('100', '101', '99'),
 ('101', '97', '99'),
 ('102', '103', '105'),
 ('102', '103', '98'),
 ('102', '103', '99'),
 ('102', '105', '98'),
 ('102', '105', '99'),
 ('102', '98', '99'),
 ('103', '105', '98'),
 ('103', '105', '99'),
 ('103', '98', '99'),
 ('105', '98', '99'),
 ('97', '98', '99'),
 ('102', '103', '105', '98'),
 ('102', '103', '105', '99'),
 ('102', '103', '98', '99'),
 ('102', '105', '98', '99'),
 ('103', '105', '98', '99'),
 ('102', '103', '105', '98', '99')]

In [58]:
def find_frequent(partition, candidates):
    """Count, within one partition, how many baskets contain each candidate itemset.

    Args:
        partition: iterable of baskets, each an iterable of hashable items.
        candidates: list of candidate itemsets (tuples of items).

    Returns:
        List of (candidate, count) pairs for the candidates found in at least one
        basket, in the order they were first matched.
    """
    # Build each candidate's item set once instead of once per (basket, candidate) pair.
    candidate_sets = [(cand, frozenset(cand)) for cand in candidates]
    freq = {}
    for pars in partition:
        # One set per basket gives constant-time membership for every candidate check,
        # instead of re-scanning the basket list for each candidate.
        basket = set(pars)
        for cand, cand_items in candidate_sets:
            if cand_items <= basket:
                freq[cand] = freq.get(cand, 0) + 1

    return list(freq.items())

In [59]:
# SON phase 2: count every candidate over the full data set, sum the per-partition
# counts, keep those meeting the global support, and order by size and then by tuple.
candidate_counts = participants.mapPartitions(
    lambda baskets: find_frequent(baskets, candidates)
).reduceByKey(add)
freq = (
    candidate_counts
    .filter(lambda pair: pair[1] >= support)
    .map(lambda pair: pair[0])
    .sortBy(lambda itemset: (len(itemset), itemset))
    .collect()
)
freq

[('100',),
 ('101',),
 ('102',),
 ('103',),
 ('97',),
 ('98',),
 ('99',),
 ('100', '101'),
 ('100', '98'),
 ('101', '102'),
 ('101', '97'),
 ('101', '98'),
 ('101', '99'),
 ('102', '103'),
 ('102', '97'),
 ('102', '98'),
 ('102', '99'),
 ('103', '99'),
 ('97', '98'),
 ('97', '99'),
 ('98', '99'),
 ('100', '101', '98'),
 ('101', '97', '99'),
 ('102', '103', '99'),
 ('97', '98', '99')]

In [61]:
# Scratch check: how a 1-tuple's string form splits on ',' (used to drop its trailing comma).
str(('100',)).split(',')

["('100'", ')']

In [None]:
def output_form(data):
    """Render itemsets as text: one comma-separated line per itemset size.

    Singletons are written as ('x') without the 1-tuple trailing comma; larger
    itemsets use their normal tuple text. Lines for different sizes are separated
    by a blank line.

    Args:
        data: iterable of non-empty tuples, grouped by length (e.g. sorted by length).

    Returns:
        The formatted string; '' for empty input.
    """
    groups = []
    current_len = None
    for itemset in data:
        if len(itemset) == 1:
            # Format from the element itself rather than splitting str(itemset) on ',',
            # which would truncate an id that contains a comma.
            text = '(' + repr(itemset[0]) + ')'
        else:
            text = str(itemset)
        if len(itemset) != current_len:
            # A new itemset size starts a new output line.
            groups.append([])
            current_len = len(itemset)
        groups[-1].append(text)
    return '\n\n'.join(','.join(group) for group in groups)

In [None]:
# Assemble the full report first, then write it in a single call.
# Relies on `output_file_path` and `time_start` having been defined by an earlier cell.
report = 'Candidates:\n' + output_form(candidates) + '\n\n' + 'Frequent Itemsets:\n' + output_form(freq)
with open(output_file_path, 'w+') as out_file:
    out_file.write(report)

time_end = time.time()
print(f'Duration: {time_end - time_start}')

### Task 2: Real-world dataset (Yelp data) (4.0 pts)

In Task 2, you will explore the Yelp dataset to find the frequent business sets (only case 1). You will use
business.json and review.json together to generate the input user-business CSV file yourself.

#### (1) Data preprocessing

You need to generate a sample dataset from business.json and review.json with the following steps:
1. The state of the business you need is Nevada, i.e., filtering ‘state’== ‘NV’.
2. Select “user_id” and “business_id” from review.json whose “business_id” is from Nevada. Each line in the CSV file would be “user_id1, business_id1”.
3. The header of CSV file should be “user_id,business_id”

You need to save the dataset in CSV format.

In [1]:
import csv
import json
from pyspark import SparkContext

In [2]:
import os
# os.chdir()
# Show the current working directory.
os.getcwd()

'/Users/antheayang/Desktop/llleArn/DSCI553/HW2'

In [3]:
# NOTE(review): hard-coded absolute path on one user's machine; this cell fails anywhere else.
# The relative ./HW1 and ./HW2 paths configured below are resolved against this directory.
os.chdir('/Users/antheayang/Desktop/llleArn/DSCI553')
os.getcwd()

'/Users/antheayang/Desktop/llleArn/DSCI553'

In [4]:
# Input JSON files, the state code to keep, and the CSV file to write.
review_path = './HW1/review.json'
business_path = './HW1/business.json'
state = 'NV'  # compared against each business record's 'state' field
output_path = './HW2/user_business.csv'

In [5]:
sc = SparkContext.getOrCreate()

# Read both files as lines of text and parse each line with json.loads.
review_lines = sc.textFile(review_path)
business_lines = sc.textFile(business_path)
review = review_lines.map(json.loads)
business = business_lines.map(json.loads)

In [6]:
# Ids of the businesses located in the requested state.
business_need = business.map(lambda r: (r['business_id'], r['state'])) \
                        .filter(lambda r: r[1]==state).map(lambda r: r[0])
# Collect into a set: the ids are only used for `in` membership tests on every review,
# which is a linear scan per review on a list but a constant-time lookup on a set.
business_need = set(business_need.collect())

In [7]:
# Keep only the (user_id, business_id) pairs whose business id is in business_need.
user_business_pairs = review.map(lambda rec: (rec['user_id'], rec['business_id']))
output = user_business_pairs.filter(lambda pair: pair[1] in business_need).collect()

In [8]:
# Write the header row followed by every (user_id, business_id) pair.
# newline='' is required by the csv module so the writer controls line endings itself;
# without it, platforms that translate '\n' end up with an extra '\r' on every row.
with open(output_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['user_id', 'business_id'])
    writer.writerows(output)

#### (2) Apply SON algorithm

The requirements for Task 2 are similar to Task 1. However, you will test your implementation with the large dataset you just generated. For this purpose, you need to report the total execution time. This execution time also includes the time from reading the file until writing the results to the output file. You are asked to find the frequent business sets (only case 1) from the file you just generated. The following are the steps you need to do:
1. Read the user_business CSV file into an RDD and then build the case 1 market-basket model;
2. Find the qualified users who reviewed more than k businesses (k is the filter threshold);
3. Apply the SON algorithm code to the filtered market-basket model.

In [1]:
import time
import math
from itertools import combinations
from pyspark import SparkContext
from operator import add

In [2]:
def counts_singleton(partitions):
    """Tally the occurrences of every individual item over all baskets.

    Args:
        partitions: iterable of baskets, each an iterable of hashable items.

    Returns:
        dict mapping item -> number of occurrences.
    """
    occurrences = {}
    for basket in partitions:
        for item in basket:
            occurrences.setdefault(item, 0)
            occurrences[item] += 1
    return occurrences

def frequent_items(counts, threshold):
    """Select the keys of `counts` that reach `threshold`, in sorted order.

    Args:
        counts: dict mapping item (or itemset tuple) -> count.
        threshold: minimum count for a key to be kept.

    Returns:
        Sorted list of the qualifying keys.
    """
    kept = [key for key in counts if counts[key] >= threshold]
    kept.sort()
    return kept

def counts_items(partitions, freq_items, n_items):
    """Count every `n_items`-sized combination of allowed items within each basket.

    Args:
        partitions: iterable of baskets, each an iterable of hashable, sortable items.
        freq_items: items allowed to take part in combinations; all others are ignored.
        n_items: size of the combinations to count.

    Returns:
        dict mapping a sorted tuple of `n_items` items -> number of baskets containing it.
    """
    # Loop-invariant: build the lookup set once instead of once per basket.
    allowed = set(freq_items)
    counts = {}
    for pars in partitions:
        # Sorting makes each combination a canonical (ordered) tuple key.
        kept = sorted(allowed.intersection(pars))
        for combo in combinations(kept, n_items):
            counts[combo] = counts.get(combo, 0) + 1
    return counts

def find_candidates(participants, support, len_data):
    """Find the itemsets that are frequent within one partition (SON phase 1).

    Args:
        participants: iterable of baskets belonging to this partition.
        support: global support threshold.
        len_data: total number of baskets across all partitions.

    Returns:
        List of lists: first the frequent singletons as 1-tuples, then one list of
        frequent tuples per larger itemset size. The last list is empty because it
        is the result that ends the search.
    """
    candidates = []
    partitions = list(participants)
    # Scale the global support to this partition's share of the data. When every
    # basket has been filtered out (len_data == 0) the division would raise
    # ZeroDivisionError, so fall back to 0; there is nothing to count in that case.
    ps = math.ceil(support * len(partitions) / len_data) if len_data else 0

    count_singleton = counts_singleton(partitions)
    freq_singleton = frequent_items(count_singleton, ps)
    candidates.append([(i,) for i in freq_singleton])

    n_items = 2
    # [None] is a truthy placeholder so the loop body runs at least once.
    freq_n_items, freq_n0_items = [None], freq_singleton
    while freq_n_items:
        count_items = counts_items(partitions, freq_n0_items, n_items)
        freq_n_items = frequent_items(count_items, ps)
        candidates.append(freq_n_items)
        # Only items that appear in some frequent n-itemset are used to build (n+1)-itemsets.
        freq_n0_items = set().union(*freq_n_items)
        n_items += 1
    return candidates


def find_frequent(partition, candidates):
    """Count, within one partition, how many baskets contain each candidate itemset.

    Args:
        partition: iterable of baskets, each an iterable of hashable items.
        candidates: list of candidate itemsets (tuples of items).

    Returns:
        List of (candidate, count) pairs for the candidates found in at least one
        basket, in the order they were first matched.
    """
    # Build each candidate's item set once instead of once per (basket, candidate) pair.
    candidate_sets = [(cand, frozenset(cand)) for cand in candidates]
    freq = {}
    for pars in partition:
        # One set per basket gives constant-time membership for every candidate check,
        # instead of re-scanning the basket list for each candidate.
        basket = set(pars)
        for cand, cand_items in candidate_sets:
            if cand_items <= basket:
                freq[cand] = freq.get(cand, 0) + 1

    return list(freq.items())


def output_form(data):
    """Render itemsets as text, one comma-separated line per itemset size.

    Singletons are written without the 1-tuple trailing comma; a change of itemset
    size replaces the previous trailing comma with a blank line.

    Args:
        data: iterable of tuples, expected to be ordered by length.

    Returns:
        The formatted string with the final trailing comma removed.
    """
    rendered = ''
    width = 1
    for itemset in data:
        size = len(itemset)
        if size == 1:
            # "('x',)" -> "('x'" + ")": keep the text before the first comma, then close it.
            rendered = rendered + str(itemset).split(',')[0] + '),'
            continue
        if size != width:
            # New itemset size: drop the previous trailing comma and start a new line.
            rendered = rendered[:-1] + '\n\n'
            width = size
        rendered = rendered + str(itemset) + ','
    return rendered[:-1]

In [3]:
time_start = time.time()
# params = sys.argv
# filter_threshold, support = int(params[1]), int(params[2])
# input_file_path, output_file_path = params[3], params[4]
filter_threshold, support = 70, 50
input_file_path, output_file_path = './user_business.csv', './test.csv'

sc = SparkContext.getOrCreate()
# Split each CSV line into fields and drop the header row.
user_business = sc.textFile(input_file_path).map(lambda l: l.split(","))
header = user_business.first()
data = user_business.filter(lambda l: l!=header) \
                    .map(lambda u_b: (u_b[0], u_b[1]))

# One basket per user (unique, sorted business ids), keeping only users with more than
# filter_threshold businesses. The RDD feeds three separate actions below (count, the
# candidate pass and the counting pass), so cache it rather than recompute it each time.
participants = data.groupByKey().map(lambda u_b: sorted(set(u_b[1]))) \
                    .filter(lambda b: len(b)>filter_threshold) \
                    .cache()
len_data = participants.count()

# SON phase 1: locally frequent itemsets from every partition, de-duplicated and ordered
# by itemset size and then by the tuple itself.
candidates = participants.mapPartitions(lambda b: find_candidates(b, support, len_data)) \
                        .flatMap(lambda x: x).distinct() \
                        .sortBy(lambda x: (len(x), x))
candidates = candidates.collect()

# SON phase 2: count each candidate over all baskets and keep those meeting the global support.
freq = participants.mapPartitions(lambda b: find_frequent(b, candidates)) \
                .reduceByKey(add) \
                .filter(lambda f: f[1]>=support).map(lambda f: f[0]) \
                .sortBy(lambda f: (len(f), f))
freq = freq.collect()


with open(output_file_path, 'w+') as f:
    f.write('Candidates:\n' + output_form(candidates) + '\n\n' + 'Frequent Itemsets:\n' + output_form(freq))

time_end = time.time()
print(f'Duration: {time_end - time_start}')

Duration: 11.842086791992188
