#### Libs

In [1]:
from pyspark import SparkContext
from datetime import datetime

### Spark


Create a new SparkContext

In [2]:
sc = SparkContext(appName="Assignment_conditions")

In [3]:
threshold=10
min_std_lift=0.2

#### Read data

Here we read the test data and map it by splitting by "," and ignoring the header row.

In [4]:
data = sc.textFile("testing.csv")
data = data.map(lambda line: line.split(",")).filter(lambda v: v[0]!="START")
data.take(1)


[['2017-01-14',
  '2017-03-30',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  '88e540ab-a7d7-47de-93c1-720a06f3d601',
  '65363002',
  'Otitis media']]

### Reorganize data

Next we map the unique conditions by their code and respective description

In [5]:
condition_map = data.map(lambda line: (int(line[4]),line[5])) \
.distinct()

condition_map.take(5)

[(65363002, 'Otitis media'),
 (444814009, 'Viral sinusitis (disorder)'),
 (233678006, 'Childhood asthma'),
 (232353008, 'Perennial allergic rhinitis with seasonal variation'),
 (446096008, 'Perennial allergic rhinitis')]

And save the data  to new file

In [6]:
format_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
condition_map.saveAsTextFile("{0}/{1}".format("condition_map", format_time))


Here we get all the unique patient-condition pairs to a map.

?
We map by patient-(condition) code, group it by key and 

In [7]:
#Map by PATIENT-(CONDITION)CODE, 
patient_rdd2 = data.map(lambda line: (line[2],line[4])) \
.groupByKey() \
.map(lambda tp: [int(j) for j in set(tp[1])]) 

patient_rdd2.cache()
n_buckets = patient_rdd2.count()
patient_rdd2.take(10)

[[446096008, 283371005, 284551006, 444814009, 72892002, 195662009],
 [44465007, 307731004, 10509002, 444814009, 72892002, 195662009],
 [65966004, 55822004, 162864005, 24079001],
 [64859006, 70704007, 40055000, 59621000, 444814009, 162864005],
 [156073000,
  19169002,
  55822004,
  82423001,
  15777000,
  10509002,
  40055000,
  72892002,
  444814009,
  75498004],
 [195662009, 444814009, 283385000],
 [195662009, 65363002, 43878008],
 [44465007, 10509002, 444814009, 58150001, 43878008, 195662009],
 [446096008,
  44465007,
  284549007,
  72892002,
  198992004,
  53741008,
  444814009,
  43878008],
 [64859006, 162864005, 59621000]]

### Most frequent

Here we map the most frequent conditions and their counts.

flatMap: We flatmap them by key, and set their value to 1;
reduceByKey: We reduce by key and make all keys unique and their values their respective counts;
sortBy: We sort it by decreasingly by their count;
filter: And finally we apply the threshold, keeping only the ones that have higher counts than it. 

In [8]:
frequent_singles = patient_rdd2 \
.flatMap(lambda l: [(i,1) for i in l]) \
.reduceByKey(lambda v1,v2:v1+v2) \
.sortBy(lambda a:a[1],False) \
.filter(lambda k: k[1]>threshold)

frequent_singles.take(10)

[(444814009, 95),
 (195662009, 69),
 (10509002, 58),
 (162864005, 43),
 (59621000, 34),
 (15777000, 32),
 (271737000, 32),
 (40055000, 30),
 (72892002, 25),
 (65363002, 22)]

Once we have the most frequent items we create a set with them

In [9]:
freq_single_item_dict = frequent_singles.collectAsMap()
print(freq_single_item_dict)

{444814009: 95, 195662009: 69, 10509002: 58, 162864005: 43, 59621000: 34, 15777000: 32, 271737000: 32, 40055000: 30, 72892002: 25, 65363002: 22, 43878008: 20, 19169002: 19, 44465007: 14, 75498004: 13, 55822004: 12}


### Build Pairs

In [10]:
def pair_forming(l):
    pairs=[]
    l.sort()
    for i in range(0,len(l)):
        if l[i] in freq_single_item_dict.keys():
            for j in range(i+1,len(l)):
                if  l[i]!=l[j] and l[j] in freq_single_item_dict.keys():
                    pairs.append((l[i],l[j]))
    return pairs

### Build Triples

In [11]:
def triple_forming(l):
    triples=[]
    pairs = pair_forming(l)
    for i,j in pairs:
        for k in l:
            if (i,j) in frequent_pairs_dict.keys() and k in freq_single_item_dict.keys() and k!=i and k!=j:#or (j,i) in frequent_pairs#prob unecessary
                  triples.append(tuple(sorted([i,j,k])))
    return triples

In [12]:
###COUNTING PAIRS
pair_count = patient_rdd2.flatMap(pair_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold)
pair_count.sortBy(lambda a: a[1],False).take(10)

[((195662009, 444814009), 44),
 ((10509002, 444814009), 39),
 ((10509002, 195662009), 31),
 ((15777000, 271737000), 27),
 ((162864005, 444814009), 23),
 ((59621000, 444814009), 23),
 ((15777000, 444814009), 23),
 ((271737000, 444814009), 22),
 ((10509002, 162864005), 18),
 ((162864005, 195662009), 17)]

In [13]:
###FREQUENT PAIR SET
frequent_pairs_dict = pair_count.collectAsMap()
print(frequent_pairs_dict.keys())

dict_keys([(195662009, 444814009), (10509002, 72892002), (40055000, 59621000), (162864005, 444814009), (10509002, 15777000), (10509002, 19169002), (10509002, 40055000), (15777000, 40055000), (15777000, 59621000), (15777000, 271737000), (40055000, 271737000), (59621000, 271737000), (162864005, 195662009), (10509002, 271737000), (10509002, 59621000), (72892002, 195662009), (72892002, 444814009), (10509002, 195662009), (10509002, 444814009), (40055000, 162864005), (40055000, 444814009), (59621000, 162864005), (59621000, 444814009), (15777000, 444814009), (19169002, 444814009), (43878008, 444814009), (65363002, 444814009), (15777000, 162864005), (162864005, 271737000), (271737000, 444814009), (59621000, 195662009), (15777000, 195662009), (195662009, 271737000), (10509002, 162864005), (19169002, 195662009), (40055000, 195662009)])


In [14]:
###COUNTING TRIPLES
triple_count = patient_rdd2.flatMap(triple_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold)
triple_count.sortBy(lambda a: a[1], False).take(10)

[((10509002, 195662009, 444814009), 63),
 ((15777000, 271737000, 444814009), 57),
 ((15777000, 195662009, 271737000), 42),
 ((59621000, 195662009, 444814009), 39),
 ((15777000, 195662009, 444814009), 36),
 ((15777000, 162864005, 271737000), 33),
 ((15777000, 59621000, 271737000), 33),
 ((195662009, 271737000, 444814009), 33),
 ((72892002, 195662009, 444814009), 30),
 ((10509002, 162864005, 444814009), 30)]

In [15]:
###FREQUENT TRIPLE SET (PROBABLY NOT NEEDED)
#frequent_triples_dict = triple_count.collectAsMap()
#print(frequent_triples_dict.keys())

In [16]:
min_std_lift = 0.2
x=195662009
# y=444814009

## Association Rules: Confidence


In [17]:
def confidence_pairs(single_1, single_2,count):
    den = freq_single_item_dict.get(single_1,0)
    if den==0:
        return 0
    else:
        return count/den

In [18]:
def confidence_triples(pair,single, count):
    den = frequent_pairs_dict.get(pair,0)
    if den==0:
        return 0
    else:
        return count/den

## Association Rules: Interest

    interestI -> j = p(j|I) − p(j)
    interestI -> j = confidence(I -> j)−baskets containing j / baskets

high positive interest: presence of I indicates the presence of j.

high negative interest: presence of I discourages the presence of j

In [19]:
def interest_pairs(single_1, single_2,count):
    return confidence_pairs(single_1, single_2,count) - (freq_single_item_dict[single_2]/n_buckets)

In [20]:
def interest_triples(pair,single,count):
    return confidence_triples(pair,single,count) - (freq_single_item_dict[single]/n_buckets)

## Association rules: Lift

    Lift(I) -> j = confidence(I -> j)/P(j) =P(I, j)/P(I)*P(j)

Lift (also known as the observed/expected ratio) is a measure of the degree of dependence between I and j. 

A lift of 1 indicates that I and j are independent

In [21]:
def lift_pairs(single_1, single_2,count):
    return confidence_pairs(single_1, single_2,count)/(freq_single_item_dict[single_2]/n_buckets)

In [22]:
def lift_triples(pair,single,count):
    return confidence_triples(pair,single,count)/(freq_single_item_dict[single]/n_buckets)

## Association rules: Standardised lift

Standardised lift ranges from 0 to 1.

This facilitates setting a fixed threshold for selecting the rules

In [23]:
def std_lift_pairs(single_1, single_2,count):
    p_single_1 = freq_single_item_dict.get(single_1,0)/n_buckets
    p_single_2 = freq_single_item_dict.get(single_2,0)/n_buckets
    
    if p_single_1!=0 and p_single_2!=0:
        num = lift_pairs(single_1,single_2, count) - (max(p_single_1+p_single_2-1,1/n_buckets)/(p_single_1*p_single_2))
        den = (1/p_single_1*p_single_2) - (max(p_single_1+p_single_2-1,1/n_buckets)/(p_single_1*p_single_2))
        if den!=0:
            return num/den
        else:
            return 0
    else:
        return 0

In [24]:
def std_lift_triples(pair,single,count):
    p_pair = frequent_pairs_dict.get(pair,0)/n_buckets
    p_single = freq_single_item_dict[single]/n_buckets
    if p_pair!=0 and p_single!=0:
        num = lift_triples(pair,single, count) - (max(p_pair+p_single-1,1/n_buckets)/(p_pair*p_single))
        den = (1/p_pair*p_single) - (max(p_pair+p_single-1,1/n_buckets)/(p_pair*p_single))
        if den!=0:
            return num/den
        else:
            return 0
    else:
        return 0

Breaking frequent itemsets into rules with respective counts

In [25]:
def break_doubles(pair):
    single_1 = pair[0][0]
    single_2 = pair[0][1]
    count = pair[1]
    return [((single_1,single_2),count),((single_2,single_1),count)]

In [26]:
def break_triples(triple_count):
    
    triple=triple_count[0]
    count = triple_count[1]
    first_pair = (triple[0],triple[1])
    second_pair = (triple[0],triple[2])
    third_pair = (triple[1],triple[2])
    pairs = [first_pair, second_pair, third_pair]
    singles = [2,1,0]
    candidate_rules=[]
    for i in range(3):
        candidate_rules.append(((pairs[i],triple[singles[i]],count)))
    
    return candidate_rules

Getting metrics for the rules in the format:
left side of the rule,right side,standard lift,lift,confidence,interest

In [27]:
def get_metrics_double(pair):
    single_1 = pair[0][0]
    single_2 = pair[0][1]
    count = pair[1]
    
    confidence = confidence_pairs(single_1, single_2,count)
    interest = interest_pairs(single_1, single_2,count)
    lift = lift_pairs(single_1, single_2,count)
    std_lift = std_lift_pairs(single_1, single_2,count)
    return ([single_1],single_2,std_lift,lift,confidence,interest)

In [28]:
def get_metrics_triple(triple):
    pair = triple[0]
    single = triple[1]
    count = triple[2]
    confidence = confidence_triples(pair, single,count)
    interest = interest_triples(pair, single,count)
    lift = lift_triples(pair, single,count)
    std_lift = std_lift_triples(pair, single,count)
    return (list(pair),single,std_lift,lift,confidence,interest)

In [29]:
possible_triple_rules = triple_count.flatMap(break_triples)
possible_triple_rules.take(10)

[((10509002, 72892002), 195662009, 18),
 ((10509002, 195662009), 72892002, 18),
 ((72892002, 195662009), 10509002, 18),
 ((10509002, 72892002), 444814009, 21),
 ((10509002, 444814009), 72892002, 21),
 ((72892002, 444814009), 10509002, 21),
 ((40055000, 59621000), 162864005, 18),
 ((40055000, 162864005), 59621000, 18),
 ((59621000, 162864005), 40055000, 18),
 ((40055000, 59621000), 444814009, 27)]

In [30]:
resulting_triples = possible_triple_rules.map(get_metrics_triple).filter(lambda a: a[2]>=min_std_lift).sortBy(lambda a: a[2], False)
resulting_triples.take(10)

[([10509002, 444814009],
  19169002,
  18.909090909090907,
  5.538461538461538,
  0.6923076923076923,
  0.5673076923076923),
 ([10509002, 195662009],
  19169002,
  16.727272727272727,
  6.193548387096774,
  0.7741935483870968,
  0.6491935483870968),
 ([195662009, 444814009],
  19169002,
  16.727272727272727,
  4.363636363636363,
  0.5454545454545454,
  0.4204545454545454),
 ([15777000, 444814009],
  271737000,
  9.761467889908257,
  11.771739130434783,
  2.4782608695652173,
  2.2677345537757434),
 ([271737000, 444814009],
  15777000,
  9.761467889908255,
  12.306818181818182,
  2.590909090909091,
  2.3803827751196174),
 ([15777000, 444814009],
  19169002,
  9.454545454545455,
  4.869565217391305,
  0.6086956521739131,
  0.4836956521739131),
 ([195662009, 444814009],
  72892002,
  9.319238900634247,
  4.145454545454545,
  0.6818181818181818,
  0.5173444976076554),
 ([10509002, 162864005],
  19169002,
  7.999999999999998,
  5.333333333333333,
  0.6666666666666666,
  0.5416666666666666),


In [31]:
candidate_double_rules = pair_count.flatMap(break_doubles)

In [32]:
resulting_pairs = candidate_double_rules.map(get_metrics_double).filter(lambda a: a[2]).sortBy(lambda a: a[2])
resulting_pairs.take(10)

[([19169002],
  444814009,
  0.18843683083511778,
  1.0105263157894737,
  0.631578947368421,
  0.006578947368421018),
 ([65363002],
  444814009,
  0.20556745182012848,
  0.9454545454545455,
  0.5909090909090909,
  -0.03409090909090906),
 ([72892002],
  444814009,
  0.23982869379014995,
  0.96,
  0.6,
  -0.025000000000000022),
 ([43878008], 444814009, 0.2569593147751606, 1.28, 0.8, 0.17500000000000004),
 ([40055000],
  444814009,
  0.27408993576017127,
  0.9066666666666666,
  0.5666666666666667,
  -0.05833333333333335),
 ([271737000], 444814009, 0.35974304068522484, 1.1, 0.6875, 0.0625),
 ([19169002],
  195662009,
  0.3627684964200477,
  1.391304347826087,
  0.631578947368421,
  0.17763157894736836),
 ([162864005],
  444814009,
  0.3768736616702355,
  0.855813953488372,
  0.5348837209302325,
  -0.09011627906976749),
 ([15777000], 444814009, 0.3768736616702355, 1.15, 0.71875, 0.09375),
 ([59621000],
  444814009,
  0.37687366167023556,
  1.0823529411764707,
  0.6764705882352942,
  0.05147

Uniting both types of rules X -> Y and X,Y -> Z to a single rdd

In [33]:
final_rules = resulting_triples.union(resulting_pairs).sortBy(lambda a: a[2])
final_rules.take(10)

[([19169002],
  444814009,
  0.18843683083511778,
  1.0105263157894737,
  0.631578947368421,
  0.006578947368421018),
 ([65363002],
  444814009,
  0.20556745182012848,
  0.9454545454545455,
  0.5909090909090909,
  -0.03409090909090906),
 ([72892002],
  444814009,
  0.23982869379014995,
  0.96,
  0.6,
  -0.025000000000000022),
 ([43878008], 444814009, 0.2569593147751606, 1.28, 0.8, 0.17500000000000004),
 ([40055000],
  444814009,
  0.27408993576017127,
  0.9066666666666666,
  0.5666666666666667,
  -0.05833333333333335),
 ([65363002, 444814009],
  195662009,
  0.296810587980039,
  1.6945373467112597,
  0.7692307692307693,
  0.3152834008097166),
 ([10509002, 72892002],
  444814009,
  0.34261241970021417,
  3.3600000000000003,
  2.1,
  1.475),
 ([40055000, 195662009],
  444814009,
  0.34261241970021417,
  2.5846153846153848,
  1.6153846153846154,
  0.9903846153846154),
 ([271737000], 444814009, 0.35974304068522484, 1.1, 0.6875, 0.0625),
 ([15777000, 59621000],
  195662009,
  0.362768496420

In [34]:
conds = sc.broadcast(condition_map.collectAsMap())

Function to translate condition ids to names

In [35]:
def translate_conditions(line):
    temp_list = list(line)
    left_side = line[0]
    right_side = line[1]
    translated_left = [conds.value[i] for i in left_side]
    translated_right = conds.value[right_side]
    temp_list[0] = translated_left
    temp_list[1] = translated_right
    return tuple(temp_list)
    

Translating condition ids to their name in the final rules rdd

In [36]:
final_rules = final_rules.map(translate_conditions)
final_rules.take(10)

[(['Miscarriage in first trimester'],
  'Viral sinusitis (disorder)',
  0.18843683083511778,
  1.0105263157894737,
  0.631578947368421,
  0.006578947368421018),
 (['Otitis media'],
  'Viral sinusitis (disorder)',
  0.20556745182012848,
  0.9454545454545455,
  0.5909090909090909,
  -0.03409090909090906),
 (['Normal pregnancy'],
  'Viral sinusitis (disorder)',
  0.23982869379014995,
  0.96,
  0.6,
  -0.025000000000000022),
 (['Streptococcal sore throat (disorder)'],
  'Viral sinusitis (disorder)',
  0.2569593147751606,
  1.28,
  0.8,
  0.17500000000000004),
 (['Chronic sinusitis (disorder)'],
  'Viral sinusitis (disorder)',
  0.27408993576017127,
  0.9066666666666666,
  0.5666666666666667,
  -0.05833333333333335),
 (['Otitis media', 'Viral sinusitis (disorder)'],
  'Acute viral pharyngitis (disorder)',
  0.296810587980039,
  1.6945373467112597,
  0.7692307692307693,
  0.3152834008097166),
 (['Acute bronchitis (disorder)', 'Normal pregnancy'],
  'Viral sinusitis (disorder)',
  0.342612419

Saving results to a folder and stopping spark context

In [37]:
#FILES ARE STRUCTURED AS left_rule_side,right_rule_side,std_lift,lift,confidence,interest
final_rules.saveAsTextFile("{0}/{1}".format("extracted_rules", format_time))
sc.stop()