In [1]:
import re
import sys
import time
from pyspark import SparkConf, SparkContext

# Reuse the SparkContext that is already running when this cell is executed
# again in the same kernel; constructing a second context with
# SparkContext(conf=conf) raises an error instead.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
# Only log errors so that Spark's own messages do not flood the cell output.
sc.setLogLevel("ERROR")

In [2]:
fname = "data/browsing.txt"
# Every line of the input is split on whitespace into a list of item ids
# (one basket per line).
# The RDD is cached because it is traversed once per pass below (singletons,
# pairs, triples); without caching each pass re-reads and re-splits the file.
sessions = (sc
    .textFile(fname)
    # .sample(False, 0.1, 0) # uncomment this line to load a small sample only
    .map(lambda x : x.split())
    .cache())

#### Frequent Singletons

In [3]:
time_start = time.time()

# Minimum number of sessions an item must appear in to be kept.
MIN_SUPPORT = 100

# first pass: only keep singletons with support >= MIN_SUPPORT.
# set(x) counts an item at most once per session, so that an item repeated
# inside one session does not inflate its support.
# No sortByKey() here: the result is only used as a lookup dict, so ordering
# the RDD would be an extra shuffle with no effect on the results.
freq_items_support = (sessions
    .flatMap(lambda x : [(y, 1) for y in set(x)])
    .reduceByKey(lambda n1, n2 : n1 + n2)
    .filter(lambda x : x[1] >= MIN_SUPPORT)
    )

# item id -> support, used by build_pairs / build_triples for membership tests
freq_items = dict(freq_items_support.collect())

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))

processing completed, time elapsed: 2.64s



#### Frequent Pairs

In [4]:
def build_pairs(basket):
    """Emit one record per unordered pair of distinct frequent items in a basket.

    Reads the module-level ``freq_items`` dict (item -> singleton support).

    Parameters
    ----------
    basket : iterable of hashable item ids (one session).

    Returns
    -------
    list of ``((item_a, item_b), (support_a, support_b, 1))`` tuples with
    ``item_a < item_b``. The trailing ``1`` is the per-basket count that the
    caller sums with ``reduceByKey``; the two singleton supports are carried
    along so that confidences can be computed without another lookup.
    """
    # Drop repeated items (keeping first-seen order) and infrequent items up
    # front. Without the de-duplication a basket such as [a, b, a] would yield
    # the pair (a, b) twice plus a meaningless (a, a) pair.
    candidates = [item for item in dict.fromkeys(basket) if item in freq_items]

    pairs = []
    for i, item_1 in enumerate(candidates[:-1]):
        for item_2 in candidates[i + 1:]:
            # canonical key: smaller item first, so (a, b) and (b, a) merge
            key = (item_1, item_2) if item_1 < item_2 else (item_2, item_1)
            pairs.append((key, (freq_items[key[0]], freq_items[key[1]], 1)))
    return pairs

def pair_conf(rdd):
    """Turn one pair-support record into its two directed confidence rules.

    Despite its name, ``rdd`` is a single record of the form
    ``((i1, i2), (s1, s2, s12))`` where ``s1`` / ``s2`` are the supports of
    the individual items and ``s12`` is the support of the pair.

    Returns a two-element list ``[((i1, i2), s12 / s1), ((i2, i1), s12 / s2)]``;
    the first item of each key is the antecedent of the rule.
    """
    (left, right), (left_support, right_support, joint_support) = rdd

    # confidence is asymmetric: the antecedent's support is the denominator,
    # so every pair yields one rule per direction
    forward = ((left, right), joint_support / left_support)
    backward = ((right, left), joint_support / right_support)
    return [forward, backward]

In [5]:
time_start = time.time()

# second pass: count, for every pair of frequent items, the baskets containing both.
# The value tuple is (support of item 1, support of item 2, pair count); only the
# last component is summed. Cached because the triples pass filters this same RDD
# again and would otherwise recompute the whole pair pipeline.
pair_support = (sessions
    .flatMap(build_pairs)
    .reduceByKey(lambda x, y: (x[0], x[1], x[2] + y[2]))
    .cache())

# warning: use all pairs to find rules, not just frequent pairs
conf_pair = pair_support.flatMap(pair_conf)
# highest confidence first; ties are broken by the antecedent item id
sorted_pair = sorted(conf_pair.collect(), key=lambda rule : (-rule[1], rule[0][0]))
# `confidence` / `out_file` are used instead of `conf` / `file` so that the
# SparkConf object bound to `conf` in the first cell is not overwritten.
with open('hw1_q2d.txt', 'w') as out_file:
    for (a, b), confidence in sorted_pair[:5]:
        line = "Conf(%s -> %s) = %.10f"%(a, b, confidence)
        print(line)
        out_file.write(line + "\n")

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))

Conf(DAI93865 -> FRO40251) = 1.0000000000
Conf(GRO85051 -> FRO40251) = 0.9991762768
Conf(GRO38636 -> FRO40251) = 0.9906542056
Conf(ELE12951 -> FRO40251) = 0.9905660377
Conf(DAI88079 -> FRO40251) = 0.9867256637
processing completed, time elapsed: 6.18s



#### Triples

In [6]:
def build_triples(l):
    """Emit one record per candidate triple of frequent items in a basket.

    Reads the module-level dicts ``freq_items`` (item -> support) and
    ``freq_pairs`` (sorted 2-tuple -> pair support).

    Parameters
    ----------
    l : iterable of hashable item ids (one session).

    Returns
    -------
    list of ``((i1, i2, i3), (s12, s13, s23, 1))`` tuples with
    ``i1 < i2 < i3``, emitted only when all three contained pairs are in
    ``freq_pairs``. ``s12``, ``s13`` and ``s23`` are those pairs' supports and
    the trailing ``1`` is the per-basket count summed by ``reduceByKey``.
    """
    # De-duplicate, keep frequent items only, and sort once so that every
    # triple (and each pair inside it) comes out in alphabetic order.
    # Without the de-duplication a basket such as [a, b, a, c] would emit the
    # triple (a, b, c) twice and inflate its support.
    items = sorted({item for item in l if item in freq_items})

    triples = []
    for i, item_1 in enumerate(items):
        for j in range(i + 1, len(items)):
            item_2 = items[j]
            pair_12 = (item_1, item_2)
            # prune early: no triple can extend an infrequent pair
            if pair_12 not in freq_pairs:
                continue
            for item_3 in items[j + 1:]:
                pair_13 = (item_1, item_3)
                pair_23 = (item_2, item_3)
                # construct the triple only if all 2-item subsets are frequent
                if pair_13 in freq_pairs and pair_23 in freq_pairs:
                    val = (freq_pairs[pair_12],
                           freq_pairs[pair_13],
                           freq_pairs[pair_23],
                           1)
                    triples.append(((item_1, item_2, item_3), val))
    return triples

def triple_conf(rdd):
    """Turn one triple-support record into its three confidence rules.

    Despite its name, ``rdd`` is a single record of the form
    ``((i1, i2, i3), (s12, s13, s23, s123))`` where ``sXY`` is the support of
    the pair ``(iX, iY)`` and ``s123`` is the support of the whole triple.

    Returns a three-element list of ``((A, B, C), confidence)`` tuples, each
    standing for the rule ``A & B -> C``.
    """
    (first, second, third), (sup_12, sup_13, sup_23, sup_all) = rdd

    # the last element of each key is the consequent; the two antecedent items
    # keep the order in which they appear in the incoming key
    rule_to_third = ((first, second, third), sup_all / sup_12)
    rule_to_second = ((first, third, second), sup_all / sup_13)
    rule_to_first = ((second, third, first), sup_all / sup_23)
    return [rule_to_third, rule_to_second, rule_to_first]

In [7]:
time_start = time.time()

# Minimum pair count for a pair to be used when building candidate triples.
MIN_PAIR_SUPPORT = 100

# use frequent pairs to build candidate triples
freq_pair_support = pair_support.filter(lambda x : x[1][2] >= MIN_PAIR_SUPPORT)
# sorted pair -> pair count; read as a global by build_triples
freq_pairs = {x[0] : x[1][2] for x in freq_pair_support.collect()}

# third pass: use all candidate triples to find rules, not just frequent triples.
# The value tuple is (s12, s13, s23, triple count); only the last component is summed.
conf_triple = (sessions
    .flatMap(build_triples)
    .reduceByKey(lambda x, y: (x[0], x[1], x[2], x[3] + y[3]))
    .flatMap(triple_conf))

# highest confidence first; ties are broken by the two antecedent item ids
sorted_triple = sorted(conf_triple.collect(), key=lambda rule : (-rule[1], rule[0][0], rule[0][1]))
# `confidence` / `out_file` are used instead of `conf` / `file` so that the
# SparkConf object bound to `conf` in the first cell is not overwritten.
with open('hw1_q2e.txt', 'w') as out_file:
    for (a, b, c), confidence in sorted_triple[:5]:
        line = "Conf(%s, %s -> %s) = %.10f"%(a, b, c, confidence)
        print(line)
        out_file.write(line + "\n")

time_end = time.time()
print("processing completed, time elapsed: %.2fs\n"%(time_end - time_start))

Conf(DAI23334, ELE92920 -> DAI62779) = 1.0000000000
Conf(DAI31081, GRO85051 -> FRO40251) = 1.0000000000
Conf(DAI55911, GRO85051 -> FRO40251) = 1.0000000000
Conf(DAI62779, DAI88079 -> FRO40251) = 1.0000000000
Conf(DAI75645, GRO85051 -> FRO40251) = 1.0000000000
processing completed, time elapsed: 13.91s



In [8]:
# Shut down the SparkContext created in the first cell now that all passes are done.
sc.stop()