In [1]:
import time
import math
from itertools import combinations
from pyspark import SparkContext
from operator import add

### Task1: Simulated small data (8.5 pts)

There are two CSV files (small1.csv and small2.csv) provided in your Vocareum workspace. small1.csv is only a sample file that you can use to debug your code; for Task 1, your code will be graded on small2.csv.

In this task, you need to build two kinds of market-basket models.

#### Case 1 (4.25 pts)

You will calculate the combinations of businesses (as singletons, pairs, triples, etc.) that qualify as “frequent” given a support threshold. You need to create a basket for each user containing the business ids reviewed by this user. If a user reviewed a business more than once, that business is counted only once for that user; in other words, the business ids within each basket are unique. The generated baskets are similar to:

user1: [business11, business12, business13, ...]

user2: [business21, business22, business23, ...]

user3: [business31, business32, business33, ...]

In [None]:
# Script-mode configuration (disabled while working in the notebook): read the
# case number, support threshold and input/output paths from the command line.
# NOTE(review): re-enabling this needs `import sys`, which the import cell at
# the top of the notebook does not include.
# params = sys.argv
# case_number, support = int(params[1]), int(params[2])
# input_file_path, output_file_path = params[3], params[4]

In [2]:
# Notebook stand-ins for the command-line arguments.
case_number = 1                    # which basket model to build (1 or 2)
support = 4                        # global support threshold
input_file_path = './small1.csv'   # sample file used for debugging

In [3]:
# Obtain the SparkContext through getOrCreate() so that re-running this cell
# does not try to construct a second context.
sc = SparkContext.getOrCreate()

In [4]:
# Read the input as text and split every line on commas (header line included).
small = (
    sc.textFile(input_file_path)
      .map(lambda line: line.split(","))
)

In [5]:
# The first line of the file is the header; drop it once, before branching.
# header = small.top(1)[0]
header = small.first()
rows = small.filter(lambda row: row != header)

if case_number == 1:
    # Case 1: key = column 0, item = column 1
    # (presumably user_id -> business_id; verify against the CSV header).
    data = rows.map(lambda u_b: (u_b[0], u_b[1]))
elif case_number == 2:
    # Case 2: the two columns swap roles (key = column 1, item = column 0).
    data = rows.map(lambda u_b: (u_b[1], u_b[0]))
else:
    # Fail here instead of leaving `data` undefined, or silently reusing a
    # stale `data` left over from an earlier run of this cell.
    raise ValueError("case_number must be 1 or 2, got {!r}".format(case_number))

In [6]:
# Peek at the first two (key, item) pairs to check the column order.
data.take(2)

[('1', '100'), ('1', '98')]

In [7]:
# One basket per key: the distinct items of that key, sorted so that pairs
# generated from a basket always come out in the same order.
participants = (
    data.groupByKey()
        .map(lambda key_items: sorted(set(key_items[1])))
)
participants.take(2)

[['100', '101', '102', '98'], ['101', '102', '103', '97', '99']]

In [8]:
# Total number of baskets. PCY_first_pass reads this global to scale the
# support threshold down to the size of a single partition.
len_data = participants.count()
len_data

19

In [9]:
def PCY_first_pass(participants, n_buckets, **kwargs):
    """Run the first PCY pass over the baskets of one partition.

    Every item is counted and, in the same sweep, every pair of items that
    co-occur in a basket is hashed into one of ``n_buckets`` buckets.

    Parameters
    ----------
    participants : iterable of sequences
        Baskets of this partition (for example the iterator handed over by
        ``mapPartitions``). It is materialised once because the number of
        baskets is needed for the local threshold. Baskets are expected to be
        sorted so that a pair is always generated in the same order.
    n_buckets : int
        Number of hash buckets used for the pair counts.
    **kwargs
        Optional ``support`` (global support threshold) and ``len_data``
        (number of baskets in the whole data set). When one is omitted, the
        notebook-level variable of the same name is used instead.

    Returns
    -------
    tuple
        ``(freq_items, freq_buckets)``. ``freq_items`` is a sorted list of
        1-tuples, one per item whose count reaches the local threshold.
        ``freq_buckets`` is the set of all pairs that hash into a bucket whose
        count reaches the local threshold; it may contain pairs with
        non-frequent items, which are filtered out later.
    """
    def item_code(item):
        # Numeric ids hash exactly as int(item). Any other id falls back to the
        # sum of its code points; the built-in hash() of str is salted per
        # process, so it is avoided on purpose.
        try:
            return int(item)
        except ValueError:
            return sum(map(ord, item))

    def hash_func(p, n_buckets):
        return (item_code(p[0]) + item_code(p[1])) % n_buckets

    baskets = list(participants)
    if not baskets:
        # Empty partition: nothing can be frequent and no threshold is needed.
        return [], set()

    min_support = kwargs["support"] if "support" in kwargs else support
    total_baskets = kwargs["len_data"] if "len_data" in kwargs else len_data
    # Threshold scaled to this partition's share of the baskets.
    ps = math.ceil(min_support * len(baskets) / total_baskets)

    item_counts = {}
    bucket_counts = {}
    bucket_pairs = {}
    for basket in baskets:
        for item in basket:
            item_counts[item] = item_counts.get(item, 0) + 1

        for pair in combinations(basket, 2):
            bucket = hash_func(pair, n_buckets)
            bucket_counts[bucket] = bucket_counts.get(bucket, 0) + 1
            bucket_pairs.setdefault(bucket, set()).add(pair)

    # Sorted so that downstream pair generation is order-independent,
    # i.e. ('99', '97') and ('97', '99') are never both produced.
    frequent = sorted(item for item, count in item_counts.items() if count >= ps)
    freq_items = [(item,) for item in frequent]

    freq_buckets = set()
    for bucket, count in bucket_counts.items():
        if count >= ps:
            freq_buckets |= bucket_pairs[bucket]

    return freq_items, freq_buckets

def PCY_second_pass(partition, freq_items, freq_buckets, min_count=1):
    """Run the second PCY pass: count the surviving candidate pairs.

    A pair is a candidate when both of its items are frequent and the pair
    belongs to a frequent bucket. Candidates are then counted in the baskets
    of ``partition``.

    Parameters
    ----------
    partition : iterable of sequences
        Baskets of this partition.
    freq_items : iterable
        Frequent singletons, as 1-tuples (the form PCY_first_pass returns);
        bare items are accepted as well.
    freq_buckets : iterable of 2-tuples
        Pairs that hashed into a frequent bucket, each in sorted order.
    min_count : int, optional
        Minimum number of baskets a candidate must occur in to be kept.

    Returns
    -------
    set of 2-tuples
        Candidate pairs, in sorted order, occurring in at least ``min_count``
        baskets.
    """
    frequent = {
        entry[0] if isinstance(entry, tuple) else entry
        for entry in freq_items
    }
    # Pairs are built from the sorted items so they compare equal to the
    # sorted pairs stored in freq_buckets.
    candidate_pairs = set(combinations(sorted(frequent), 2)) & set(freq_buckets)

    pair_counts = dict.fromkeys(candidate_pairs, 0)
    for bsk in partition:
        kept = sorted(set(bsk) & frequent)
        for pair in combinations(kept, 2):
            if pair in pair_counts:
                pair_counts[pair] += 1

    return {pair for pair, count in pair_counts.items() if count >= min_count}


def _local_threshold(n_baskets, options):
    """Scale the global support threshold to a partition of ``n_baskets`` baskets."""
    min_support = options["support"] if "support" in options else support
    total_baskets = options["len_data"] if "len_data" in options else len_data
    return math.ceil(min_support * n_baskets / total_baskets)


def PCY(participants, n_buckets, **kwargs):
    """Run both PCY passes over one partition.

    Parameters
    ----------
    participants : iterable of sequences
        Baskets of this partition. The iterable is materialised once so that
        both passes can read it.
    n_buckets : int
        Number of hash buckets for the first pass.
    **kwargs
        Optional ``support`` and ``len_data``; forwarded to PCY_first_pass and
        used for the local threshold. When omitted, the notebook-level
        variables of the same names are used.

    Returns
    -------
    tuple
        ``(freq_items, candidates)``: the frequent singletons as 1-tuples and
        the set of pairs that reach the partition-scaled threshold.
    """
    baskets = list(participants)
    if not baskets:
        return [], set()

    freq_items, freq_buckets = PCY_first_pass(baskets, n_buckets, **kwargs)
    candidates = PCY_second_pass(
        baskets,
        freq_items,
        freq_buckets,
        min_count=_local_threshold(len(baskets), kwargs),
    )

    return freq_items, candidates

In [10]:
# Run only the first pass on every partition for now; each partition
# contributes two elements: its frequent singletons and its frequent-bucket pairs.
# candidates = participants.mapPartitions(lambda b: PCY(b, n_buckets=99))
candidates = participants.mapPartitions(
    lambda partition: PCY_first_pass(partition, n_buckets=99)
)
candidates.collect()

[[('100',), ('101',), ('102',), ('97',), ('98',), ('99',)],
 {('100', '101'),
  ('100', '102'),
  ('100', '105'),
  ('100', '106'),
  ('100', '107'),
  ('100', '108'),
  ('100', '98'),
  ('100', '99'),
  ('101', '102'),
  ('101', '105'),
  ('101', '106'),
  ('101', '107'),
  ('101', '97'),
  ('101', '98'),
  ('101', '99'),
  ('102', '103'),
  ('102', '104'),
  ('102', '105'),
  ('102', '106'),
  ('102', '97'),
  ('102', '98'),
  ('102', '99'),
  ('103', '104'),
  ('103', '97'),
  ('103', '98'),
  ('103', '99'),
  ('104', '97'),
  ('104', '98'),
  ('104', '99'),
  ('105', '98'),
  ('107', '98'),
  ('108', '98'),
  ('97', '98'),
  ('97', '99'),
  ('98', '99')},
 [('100',), ('101',), ('102',), ('103',), ('105',), ('97',), ('98',), ('99',)],
 {('100', '101'),
  ('100', '102'),
  ('100', '103'),
  ('100', '105'),
  ('100', '106'),
  ('100', '107'),
  ('100', '108'),
  ('100', '97'),
  ('100', '98'),
  ('100', '99'),
  ('101', '102'),
  ('101', '103'),
  ('101', '105'),
  ('101', '106'),
  (

In [None]:
# Merge the per-partition results: flatten them into individual itemsets,
# de-duplicate, then order by itemset size and, within a size, by the tuple itself.
(
    candidates
    .flatMap(lambda group: group)
    .distinct()
    .sortBy(lambda itemset: (len(itemset), itemset))
    .collect()
)

In [168]:
# Scratch check: `a` is the union of two hand-pasted sets of item pairs
# (presumably the candidate pairs of two partitions — TODO confirm where they
# were copied from).
a = {('100', '101'),('100', '102'),('100', '98'),('100', '99'),('101', '102'),('101', '97'),('101', '98'),('101', '99'),('102', '97'),('102', '98'),('102', '99'),('97', '98'),('97', '99'),('98', '99')} | \
{('100', '101'),('100', '102'),('100', '103'),('100', '105'),('100', '97'),('100', '98'),('100', '99'),('101', '102'),('101', '103'),('101', '105'),('101', '97'),('101', '98'),('101', '99'),('102', '103'),('102', '105'),('102', '97'),('102', '98'),('102', '99'),('103', '105'),('103', '97'),('103', '98'),('103', '99'),('105', '97'),('105', '98'),('105', '99'),('97', '98'),('97', '99'),('98', '99')}
len(a)

28

In [169]:
# Scratch check: hand-pasted set of pairs to compare `a` against
# (presumably the expected frequent pairs — TODO confirm the source).
b = set([('100', '101'), ('100', '98'), ('100', '99'), ('101', '102'), ('101', '97'), ('101', '98'), ('101', '99'), 
 ('102', '103'), ('102', '105'), ('102', '97'), ('102', '98'), ('102', '99'), ('103', '105'), ('103', '98'), 
 ('103', '99'), ('105', '98'), ('105', '99'), ('97', '98'), ('97', '99'), ('98', '99'), ])

In [170]:
# Number of pairs in `b`.
len(b)

20

In [171]:
# Pairs present in `a` but missing from `b`.
a-b

{('100', '102'),
 ('100', '103'),
 ('100', '105'),
 ('100', '97'),
 ('101', '103'),
 ('101', '105'),
 ('103', '97'),
 ('105', '97')}

In [172]:
# Pairs present in `b` but missing from `a`; an empty result means `b` is a subset of `a`.
b-a

set()

In [None]:
# Hand-pasted list of itemsets of sizes 1 to 5; its 2-item entries are the same
# pairs as `b`. Presumably the reference answer the finished algorithm should
# reproduce — TODO confirm the source and the input/support it belongs to.
[('100',), ('101',), ('102',), ('103',), ('105',), ('97',), ('98',), ('99',), 
 ('100', '101'), ('100', '98'), ('100', '99'), ('101', '102'), ('101', '97'), ('101', '98'), ('101', '99'), 
 ('102', '103'), ('102', '105'), ('102', '97'), ('102', '98'), ('102', '99'), ('103', '105'), ('103', '98'), 
 ('103', '99'), ('105', '98'), ('105', '99'), ('97', '98'), ('97', '99'), ('98', '99'), 
 
 ('100', '101', '98'), ('100', '101', '99'), ('101', '97', '99'), ('102', '103', '105'), ('102', '103', '98'), 
 ('102', '103', '99'), ('102', '105', '98'), ('102', '105', '99'), ('102', '98', '99'), ('103', '105', '98'), 
 ('103', '105', '99'), ('103', '98', '99'), ('105', '98', '99'), ('97', '98', '99'), 
 ('102', '103', '105', '98'), ('102', '103', '105', '99'), ('102', '103', '98', '99'), 
 ('102', '105', '98', '99'), ('103', '105', '98', '99'), 
 ('102', '103', '105', '98', '99')]


#### Case 2 (4.25 pts)

You will calculate the combinations of frequent users (as singletons, pairs, triples, etc.) that are qualified as “frequent” given a support threshold. You need to create a basket for each business containing the user ids that commented on this business. Similar to case 1, the user ids within each basket are unique. The generated baskets are similar to:

business1: [user11, user12, user13, ...] 

business2: [user21, user22, user23, ...] 

business3: [user31, user32, user33, ...]

In [None]:
# Serialise `results` as JSON into the output file (the file is truncated on open).
# NOTE(review): `json`, `output_file` and `results` are not defined in the
# visible part of this notebook (the commented-out argv cell names the path
# `output_file_path`) — import/define them before running this cell.
with open(output_file, 'w+') as o:
    json.dump(results, o)