#### Libs

In [1]:
from pyspark import SparkContext
from datetime import datetime

### Spark


Create a new SparkContext

In [2]:
sc = SparkContext(appName="Assignment_conditions")

In [3]:
threshold=1000
min_std_lift=0.2

#### Read data

Here we read the test data and map it by splitting by "," and ignoring the header row.

In [4]:
data = sc.textFile("conditions.csv")
data = data.map(lambda line: line.split(",")).filter(lambda v: v[0]!="START")
data.take(1)


[['2017-01-14',
  '2017-03-30',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  '88e540ab-a7d7-47de-93c1-720a06f3d601',
  '65363002',
  'Otitis media']]

### Reorganize data

Next we map the unique conditions by their code and respective description

In [5]:
condition_map = data.map(lambda line: (int(line[4]),line[5])) \
.distinct()

condition_map.take(5)

[(40275004, 'Contact dermatitis'),
 (126906006, 'Neoplasm of prostate'),
 (399211009, 'History of myocardial infarction (situation)'),
 (97331000119101,
  'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)'),
 (241929008, 'Acute allergic reaction')]

And save the data  to new file

In [6]:
format_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
condition_map.saveAsTextFile("{0}/{1}".format("condition_map", format_time))


Here we get all the unique patient-condition pairs to a map.

?
We map by patient-(condition) code, group it by key and 

In [7]:
#Map by PATIENT-(CONDITION)CODE, 
patient_rdd2 = data.map(lambda line: (line[2],line[4])) \
.groupByKey() \
.map(lambda tp: [int(j) for j in set(tp[1])]) 

patient_rdd2.cache()
n_buckets = patient_rdd2.count()
patient_rdd2.take(10)

[[65966004, 55822004, 162864005, 24079001],
 [446096008,
  44465007,
  284549007,
  72892002,
  198992004,
  53741008,
  444814009,
  43878008],
 [126906006,
  92691004,
  15777000,
  68496003,
  40055000,
  59621000,
  271737000,
  162573006,
  403190006,
  444814009,
  162864005,
  370143000,
  235919008,
  67811000119102,
  254632001,
  49436004,
  65275009],
 [444814009, 162864005, 10509002],
 [363406005,
  283371005,
  64859006,
  443165006,
  68496003,
  713197008,
  40055000,
  230690007,
  359817006,
  195662009,
  55680006],
 [44465007,
  82423001,
  15777000,
  449868002,
  40055000,
  90560007,
  59621000,
  271737000,
  162864005,
  87433001],
 [444814009, 110030002, 65363002, 10509002],
 [195662009, 444814009, 88805009, 162864005],
 [195662009, 444814009, 10509002, 43878008],
 [10509002, 162864005, 59621000, 284551006]]

### Most frequent

Here we map the most frequent conditions and their counts.

flatMap: We flatmap them by key, and set their value to 1;
reduceByKey: We reduce by key and make all keys unique and their values their respective counts;
sortBy: We sort it by decreasingly by their count;
filter: And finally we apply the threshold, keeping only the ones that have higher counts than it. 

In [8]:
frequent_singles = patient_rdd2 \
.flatMap(lambda l: [(i,1) for i in l]) \
.reduceByKey(lambda v1,v2:v1+v2) \
.sortBy(lambda a:a[1],False) \
.filter(lambda k: k[1]>threshold)

frequent_singles.take(10)

[(444814009, 751940),
 (195662009, 524692),
 (10509002, 461495),
 (162864005, 365567),
 (271737000, 355372),
 (15777000, 354315),
 (59621000, 305134),
 (40055000, 250239),
 (72892002, 205390),
 (19169002, 201894)]

Once we have the most frequent items we create a set with them

In [9]:
#freq_single_item_dict = frequent_singles.collectAsMap()
frequent_single_rdd = frequent_singles.map(lambda a: a[0])
frequent_single_set = sc.broadcast(set(frequent_single_rdd.collect()))
#for x in (frequent_single_set.value):
#    print(x)
print(frequent_single_set.value)
frequent_singles.saveAsTextFile("{0}/{1}".format("frequent_singles_count", format_time))
###### SAVE AS TEXT FILES 


#print(freq_single_item_dict)

{36971009, 196416002, 239872002, 399211009, 55680006, 90560007, 408512008, 84757009, 75498004, 201834006, 195967001, 192127007, 156073000, 263102004, 185086009, 301011002, 65363002, 92691004, 307731004, 40275004, 239720000, 254632001, 410429000, 39848009, 422034002, 197927001, 72892002, 47505003, 58150001, 38822007, 43878008, 65275009, 68496003, 198992004, 83664006, 59621000, 446096008, 128613002, 127013003, 88805009, 126906006, 429007001, 24079001, 235919008, 62564004, 271737000, 283385000, 230265002, 95417003, 110030002, 55822004, 363406005, 233678006, 109838007, 195662009, 444814009, 403192003, 6072007, 10509002, 47693006, 162573006, 79586000, 5602001, 237602007, 40055000, 230690007, 82423001, 403191005, 449868002, 15777000, 19169002, 232353008, 444470001, 403190006, 7200002, 444448004, 74400008, 254837009, 367498001, 80394007, 370143000, 35999006, 94260004, 87433001, 33737001, 359817006, 241929008, 97331000119101, 90781000119102, 67811000119102, 124171000119105, 368581000119106, 15

### Build Pairs

In [10]:
def pair_forming(l):
    pairs=[]
    l.sort()
    for i in range(0,len(l)):
        if l[i] in frequent_single_set.value:
            for j in range(i+1,len(l)):
                if  l[i]!=l[j] and l[j] in frequent_single_set.value:
                    pairs.append((l[i],l[j]))
    return pairs

### Build Triples

In [11]:
def triple_forming(l):
    triples=[]
    pairs = pair_forming(l)
    for i,j in pairs:
        for k in l:
            if (i,j) in frequent_pair_set.value and (i,k) in frequent_pair_set.value and (j,k) and k!=i and k!=j:#or (j,i) in frequent_pairs#prob unecessary
                  triples.append(tuple(sorted([i,j,k])))
    return triples

In [12]:
###COUNTING PAIRS
pair_count = patient_rdd2.flatMap(pair_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold).sortBy(lambda a: a[1],False)
frequent_pair_set = sc.broadcast(set(pair_count.map(lambda a: a[0]).collect()))
pair_count.saveAsTextFile("{0}/{1}".format("frequent_pairs_count", format_time))

pair_count.take(10)

[((195662009, 444814009), 343651),
 ((10509002, 444814009), 302516),
 ((15777000, 271737000), 289176),
 ((162864005, 444814009), 243812),
 ((271737000, 444814009), 236847),
 ((15777000, 444814009), 236320),
 ((10509002, 195662009), 211065),
 ((59621000, 444814009), 203450),
 ((162864005, 195662009), 167438),
 ((40055000, 444814009), 165530)]

In [13]:
###FREQUENT PAIR SET
#frequent_pairs_dict = pair_count.collectAsMap()
#print(frequent_pairs_dict.keys())

In [14]:
###COUNTING TRIPLES
triple_count = patient_rdd2.flatMap(triple_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold)
triple_count.sortBy(lambda a: a[1], False).take(10)

[((15777000, 271737000, 444814009), 578457),
 ((10509002, 195662009, 444814009), 417522),
 ((15777000, 195662009, 271737000), 397749),
 ((10509002, 15777000, 271737000), 346530),
 ((162864005, 195662009, 444814009), 335580),
 ((195662009, 271737000, 444814009), 325680),
 ((15777000, 195662009, 444814009), 324249),
 ((15777000, 59621000, 271737000), 299454),
 ((10509002, 162864005, 444814009), 292152),
 ((10509002, 271737000, 444814009), 284379)]

In [15]:
###FREQUENT TRIPLE SET (PROBABLY NOT NEEDED)
#frequent_triples_dict = triple_count.collectAsMap()
#print(frequent_triples_dict.keys())

In [16]:
min_std_lift = 0.2
x=195662009
# y=444814009

## Association Rules: Confidence


In [17]:
def confidence_pairs(single_1, single_2,count):
    den = single_counts_dict.value[single_1] #had  0 in case miss
    if den==0:
        return 0
    else:
        return count/den

In [18]:
def confidence_triples(pair,single, count):
    den = pair_counts_dict.value.get(pair,0)
    if den==0:
        return 0
    else:
        return count/den

## Association Rules: Interest

    interestI -> j = p(j|I) − p(j)
    interestI -> j = confidence(I -> j)−baskets containing j / baskets

high positive interest: presence of I indicates the presence of j.

high negative interest: presence of I discourages the presence of j

In [19]:
def interest_pairs(single_1, single_2,count):
    return confidence_pairs(single_1, single_2,count) - (single_counts_dict.value[single_2]/n_buckets)

In [20]:
def interest_triples(pair,single,count):
    return confidence_triples(pair,single,count) - (single_counts_dict.value[single]/n_buckets)

## Association rules: Lift

    Lift(I) -> j = confidence(I -> j)/P(j) =P(I, j)/P(I)*P(j)

Lift (also known as the observed/expected ratio) is a measure of the degree of dependence between I and j. 

A lift of 1 indicates that I and j are independent

In [21]:
def lift_pairs(single_1, single_2,count):
    return confidence_pairs(single_1, single_2,count)/(single_counts_dict.value[single_2]/n_buckets)

In [22]:
def lift_triples(pair,single,count):
    return confidence_triples(pair,single,count)/(single_counts_dict.value[single]/n_buckets)

## Association rules: Standardised lift

Standardised lift ranges from 0 to 1.

This facilitates setting a fixed threshold for selecting the rules

In [23]:
def std_lift_pairs(single_1, single_2,count):
    p_single_1 = single_counts_dict.value[single_1]/n_buckets
    p_single_2 = single_counts_dict.value[single_2]/n_buckets
    
    if p_single_1!=0 and p_single_2!=0:
        num = lift_pairs(single_1,single_2, count) - (max(p_single_1+p_single_2-1,1/n_buckets)/(p_single_1*p_single_2))
        den = (1/p_single_1*p_single_2) - (max(p_single_1+p_single_2-1,1/n_buckets)/(p_single_1*p_single_2))
        if den!=0:
            return num/den
        else:
            return 0
    else:
        return 0

In [24]:
def std_lift_triples(pair,single,count):
    p_pair = pair_counts_dict.value.get(pair,0)/n_buckets
    p_single = single_counts_dict.value[single]/n_buckets
    if p_pair!=0 and p_single!=0:
        num = lift_triples(pair,single, count) - (max(p_pair+p_single-1,1/n_buckets)/(p_pair*p_single))
        den = (1/p_pair*p_single) - (max(p_pair+p_single-1,1/n_buckets)/(p_pair*p_single))
        if den!=0:
            return num/den
        else:
            return 0
    else:
        return 0

Breaking frequent itemsets into rules with respective counts

In [25]:
def break_doubles(pair):
    single_1 = pair[0][0]
    single_2 = pair[0][1]
    count = pair[1]
    return [((single_1,single_2),count),((single_2,single_1),count)]

In [26]:
def break_triples(triple_count):
    
    triple=triple_count[0]
    count = triple_count[1]
    first_pair = (triple[0],triple[1])
    second_pair = (triple[0],triple[2])
    third_pair = (triple[1],triple[2])
    pairs = [first_pair, second_pair, third_pair]
    singles = [2,1,0]
    candidate_rules=[]
    for i in range(3):
        candidate_rules.append(((pairs[i],triple[singles[i]],count)))
    
    return candidate_rules

Getting metrics for the rules in the format:
left side of the rule,right side,standard lift,lift,confidence,interest

In [27]:
single_counts_dict = sc.broadcast(frequent_singles.collectAsMap())
pair_counts_dict = sc.broadcast(pair_count.collectAsMap())
pair_counts_dict.value


{(195662009, 444814009): 343651,
 (10509002, 444814009): 302516,
 (15777000, 271737000): 289176,
 (162864005, 444814009): 243812,
 (271737000, 444814009): 236847,
 (15777000, 444814009): 236320,
 (10509002, 195662009): 211065,
 (59621000, 444814009): 203450,
 (162864005, 195662009): 167438,
 (40055000, 444814009): 165530,
 (195662009, 271737000): 162925,
 (15777000, 195662009): 162227,
 (10509002, 162864005): 146077,
 (10509002, 271737000): 142085,
 (10509002, 15777000): 141106,
 (59621000, 195662009): 140139,
 (72892002, 444814009): 136986,
 (19169002, 444814009): 134867,
 (59621000, 271737000): 126551,
 (10509002, 59621000): 122910,
 (15777000, 59621000): 119842,
 (40055000, 195662009): 114470,
 (59621000, 162864005): 110177,
 (40055000, 162864005): 103517,
 (15777000, 162864005): 103371,
 (15777000, 40055000): 101659,
 (162864005, 271737000): 101419,
 (10509002, 40055000): 100115,
 (40055000, 271737000): 98993,
 (43878008, 444814009): 98873,
 (72892002, 195662009): 94716,
 (19169002

In [28]:
def get_metrics_double(pair):
    single_1 = pair[0][0]
    single_2 = pair[0][1]
    count = pair[1]
    
    confidence = confidence_pairs(single_1, single_2,count)
    interest = interest_pairs(single_1, single_2,count)
    lift = lift_pairs(single_1, single_2,count)
    std_lift = std_lift_pairs(single_1, single_2,count)
    return ([single_1],single_2,std_lift,lift,confidence,interest)

In [29]:
def get_metrics_triple(triple):
    pair = triple[0]
    single = triple[1]
    count = triple[2]
    confidence = confidence_triples(pair, single,count)
    interest = interest_triples(pair, single,count)
    lift = lift_triples(pair, single,count)
    std_lift = std_lift_triples(pair, single,count)
    return (list(pair),single,std_lift,lift,confidence,interest)

In [30]:
possible_triple_rules = triple_count.flatMap(break_triples)
possible_triple_rules.take(10)

[((15777000, 40055000), 444814009, 202134),
 ((15777000, 444814009), 40055000, 202134),
 ((40055000, 444814009), 15777000, 202134),
 ((15777000, 92691004), 162864005, 16533),
 ((15777000, 162864005), 92691004, 16533),
 ((92691004, 162864005), 15777000, 16533),
 ((15777000, 162864005), 67811000119102, 1395),
 ((15777000, 67811000119102), 162864005, 1395),
 ((162864005, 67811000119102), 15777000, 1395),
 ((15777000, 370143000), 444814009, 5562)]

In [31]:
resulting_triples = possible_triple_rules.map(get_metrics_triple).filter(lambda a: a[2]>=min_std_lift).sortBy(lambda a: a[2], False)
resulting_triples.take(10)

[([6072007, 236077008],
  94260004,
  2339.6542278192446,
  785.3310719131614,
  1.3900949796472184,
  1.3883249045421282),
 ([162573006, 254632001],
  67811000119102,
  2002.0959747186398,
  1543.4373333333335,
  2.998666666666667,
  2.996723817027161),
 ([162573006, 67811000119102],
  254632001,
  1999.7892884928303,
  1543.4373333333333,
  3.0,
  2.998056286487822),
 ([6072007, 271737000],
  94260004,
  1764.067310724761,
  827.6020725474916,
  1.4649178255372945,
  1.4631477504322044),
 ([236077008, 271737000],
  94260004,
  1764.067310724761,
  827.6020725474916,
  1.4649178255372945,
  1.4631477504322044),
 ([6072007, 444814009],
  94260004,
  1549.364254348247,
  782.0137402094784,
  1.384223053383203,
  1.3824529782781128),
 ([236077008, 444814009],
  94260004,
  1549.364254348247,
  782.0137402094784,
  1.384223053383203,
  1.3824529782781128),
 ([254632001, 271737000],
  67811000119102,
  1444.7380973807203,
  1543.1727954624607,
  2.998152709359606,
  2.9962098597201),
 ([16

In [32]:
candidate_double_rules = pair_count.flatMap(break_doubles)

In [33]:
resulting_pairs = candidate_double_rules.map(get_metrics_double).filter(lambda a: a[2]).sortBy(lambda a: a[2])
resulting_pairs.take(10)

[([195967001],
  444814009,
  0.002440400092434995,
  1.0124421961931973,
  0.6576626240352812,
  0.008082206992110041),
 ([403192003],
  444814009,
  0.0026205638576483167,
  1.013903448922831,
  0.658611825192802,
  0.009031408149630882),
 ([94260004],
  444814009,
  0.0027761598366961853,
  1.0195416345081063,
  0.662274280136652,
  0.012693863093480862),
 ([67811000119102],
  444814009,
  0.0030771152172229836,
  1.029497881117422,
  0.6687416629613161,
  0.019161245918144942),
 ([254632001],
  444814009,
  0.0030791625327367712,
  1.0297245288483419,
  0.6688888888888889,
  0.019308471845717712),
 ([235919008],
  444814009,
  0.0036708367162214305,
  1.0351509612377363,
  0.6724137931034483,
  0.022833376060277133),
 ([65275009],
  444814009,
  0.0036708367162214305,
  1.0351509612377363,
  0.6724137931034483,
  0.022833376060277133),
 ([1501000119109],
  444814009,
  0.004141719284392613,
  1.0266416731434154,
  0.6668863261943987,
  0.01730590915122754),
 ([67811000119102],
  19

Uniting both types of rules X -> Y and X,Y -> Z to a single rdd

In [34]:
final_rules = resulting_triples.union(resulting_pairs).sortBy(lambda a: a[2])
final_rules.take(10)

[([195967001],
  444814009,
  0.002440400092434995,
  1.0124421961931973,
  0.6576626240352812,
  0.008082206992110041),
 ([403192003],
  444814009,
  0.0026205638576483167,
  1.013903448922831,
  0.658611825192802,
  0.009031408149630882),
 ([94260004],
  444814009,
  0.0027761598366961853,
  1.0195416345081063,
  0.662274280136652,
  0.012693863093480862),
 ([67811000119102],
  444814009,
  0.0030771152172229836,
  1.029497881117422,
  0.6687416629613161,
  0.019161245918144942),
 ([254632001],
  444814009,
  0.0030791625327367712,
  1.0297245288483419,
  0.6688888888888889,
  0.019308471845717712),
 ([235919008],
  444814009,
  0.0036708367162214305,
  1.0351509612377363,
  0.6724137931034483,
  0.022833376060277133),
 ([65275009],
  444814009,
  0.0036708367162214305,
  1.0351509612377363,
  0.6724137931034483,
  0.022833376060277133),
 ([1501000119109],
  444814009,
  0.004141719284392613,
  1.0266416731434154,
  0.6668863261943987,
  0.01730590915122754),
 ([67811000119102],
  19

In [35]:
conds = sc.broadcast(condition_map.collectAsMap())

Function to translate condition ids to names

In [36]:
def translate_conditions(line):
    temp_list = list(line)
    left_side = line[0]
    right_side = line[1]
    translated_left = [conds.value[i] for i in left_side]
    translated_right = conds.value[right_side]
    temp_list[0] = translated_left
    temp_list[1] = translated_right
    return tuple(temp_list)
    

Translating condition ids to their name in the final rules rdd

In [37]:
final_rules = final_rules.map(translate_conditions).sortBy(lambda a: a[2],False)
final_rules.take(10)

[(['Bleeding from anus', 'Protracted diarrhea'],
  'Secondary malignant neoplasm of colon',
  2339.6542278192446,
  785.3310719131614,
  1.3900949796472184,
  1.3883249045421282),
 (['Suspected lung cancer (situation)',
   'Small cell carcinoma of lung (disorder)'],
  'Primary small cell malignant neoplasm of lung  TNM stage 1 (disorder)',
  2002.0959747186398,
  1543.4373333333335,
  2.998666666666667,
  2.996723817027161),
 (['Suspected lung cancer (situation)',
   'Primary small cell malignant neoplasm of lung  TNM stage 1 (disorder)'],
  'Small cell carcinoma of lung (disorder)',
  1999.7892884928303,
  1543.4373333333333,
  3.0,
  2.998056286487822),
 (['Bleeding from anus', 'Anemia (disorder)'],
  'Secondary malignant neoplasm of colon',
  1764.067310724761,
  827.6020725474916,
  1.4649178255372945,
  1.4631477504322044),
 (['Protracted diarrhea', 'Anemia (disorder)'],
  'Secondary malignant neoplasm of colon',
  1764.067310724761,
  827.6020725474916,
  1.4649178255372945,
  1.

Saving results to a folder and stopping spark context

In [38]:
#FILES ARE STRUCTURED AS left_rule_side,right_rule_side,std_lift,lift,confidence,interest
final_rules.saveAsTextFile("{0}/{1}".format("extracted_rules", format_time))
sc.stop()