#### Libs

In [1]:
from pyspark import SparkContext
from datetime import datetime

### Spark


Create a new SparkContext

In [2]:
sc = SparkContext(appName="Assignment_conditions")

In [3]:
threshold=10

#### Read data

Here we read the test data and map it by splitting by "," and ignoring the header row.

In [4]:
data = sc.textFile("testing.csv")
data = data.map(lambda line: line.split(",")).filter(lambda v: v[0]!="START")
data.take(1)


[['2017-01-14',
  '2017-03-30',
  '09e4e8cb-29c2-4ef4-86c0-a6ff0ba25d2a',
  '88e540ab-a7d7-47de-93c1-720a06f3d601',
  '65363002',
  'Otitis media']]

### Reorganize data

Next we map the unique conditions by their code and respective description

In [5]:
condition_map = data.map(lambda line: (int(line[4]),line[5])) \
.distinct()#reduceByKey(lambda val1, val2: val1)

condition_map.take(5)

[(65363002, 'Otitis media'),
 (444814009, 'Viral sinusitis (disorder)'),
 (233678006, 'Childhood asthma'),
 (232353008, 'Perennial allergic rhinitis with seasonal variation'),
 (446096008, 'Perennial allergic rhinitis')]

And save the data  to new file

In [6]:
format_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
condition_map.saveAsTextFile("{0}/{1}".format("condition_map", format_time))


Here we get all the unique patient-condition pairs to a map.

?
We map by patient-(condition) code, group it by key and 

In [16]:
#Map by PATIENT-(CONDITION)CODE, 
patient_rdd2 = data.map(lambda line: (line[2],line[4])) \
.groupByKey() \
.map(lambda tp: [int(j) for j in set(tp[1])]) 
# .map(lambda k: (k[0],[int[j] for j in k[1]]))

patient_rdd2.take(10)

[[446096008, 283371005, 284551006, 444814009, 72892002, 195662009],
 [44465007, 307731004, 10509002, 444814009, 72892002, 195662009],
 [65966004, 55822004, 162864005, 24079001],
 [64859006, 70704007, 40055000, 59621000, 444814009, 162864005],
 [156073000,
  19169002,
  55822004,
  82423001,
  15777000,
  10509002,
  40055000,
  72892002,
  444814009,
  75498004],
 [195662009, 444814009, 283385000],
 [195662009, 65363002, 43878008],
 [44465007, 10509002, 444814009, 58150001, 43878008, 195662009],
 [446096008,
  44465007,
  284549007,
  72892002,
  198992004,
  53741008,
  444814009,
  43878008],
 [64859006, 162864005, 59621000]]

### Most frequent

Here we map the most frequent conditions and their counts.

flatMap: We flatmap them by key, and set their value to 1;
reduceByKey: We reduce by key and make all keys unique and their values their respective counts;
sortBy: We sort it by decreasingly by their count;
filter: And finally we apply the threshold, keeping only the ones that have higher counts than it. 

In [8]:
frequent_singles = patient_rdd2 \
.flatMap(lambda l: [(i,1) for i in l]) \
.reduceByKey(lambda v1,v2:v1+v2) \
.sortBy(lambda a:a[1],False) \
.filter(lambda k: k[1]>threshold)

frequent_singles.take(10)

[(444814009, 95),
 (195662009, 69),
 (10509002, 58),
 (162864005, 43),
 (59621000, 34),
 (15777000, 32),
 (271737000, 32),
 (40055000, 30),
 (72892002, 25),
 (65363002, 22)]

Once we have the most frequent items we create a set with them

In [9]:
freq_single_item_set = set([k[0] for k in frequent_singles.collect()])
print(freq_single_item_set)

{72892002, 195662009, 162864005, 59621000, 15777000, 10509002, 271737000, 43878008, 19169002, 44465007, 75498004, 55822004, 40055000, 444814009, 65363002}


### Build Pairs

In [10]:
def pair_forming(l):
    pairs=[]
    list.sort()
    for i in range(0,len(l)):
        if l[i] in freq_single_item_set:
            for j in range(i+1,len(l)):
                if  l[i]!=l[j] and l[j] in freq_single_item_set:
                    pairs.append((l[i],l[j]))
    return pairs

### Build Triples

In [11]:
def triple_forming(list):
    triples=[]
    pairs = pair_forming(list)
    for i,j in pairs:
        for k in list:
            if (i,j) in frequent_pairs and k in freq_single_item_set and k!=i and k!=j:#or (j,i) in frequent_pairs#prob unecessary
                  triples.append(tuple(sorted([i,j,k])))
    return triples

In [12]:
###COUNTING PAIRS
pair_count = patient_rdd2.flatMap(pair_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold)
pair_count.sortBy(lambda a: a[1],False).take(10)

[((195662009, 444814009), 44),
 ((10509002, 444814009), 39),
 ((10509002, 195662009), 31),
 ((15777000, 271737000), 27),
 ((162864005, 444814009), 23),
 ((59621000, 444814009), 23),
 ((15777000, 444814009), 23),
 ((271737000, 444814009), 22),
 ((10509002, 162864005), 18),
 ((162864005, 195662009), 17)]

In [13]:
###FREQUENT PAIR SET
frequent_pairs = list(set(pair_count.map(lambda tp:tp[0]).collect()))

print(frequent_pairs[0:10])

[(195662009, 271737000), (10509002, 271737000), (10509002, 162864005), (10509002, 59621000), (10509002, 19169002), (195662009, 444814009), (10509002, 444814009), (162864005, 271737000), (10509002, 40055000), (72892002, 444814009)]


In [14]:
###COUNTING TRIPLES
triple_count = patient_rdd2.flatMap(triple_forming).map(lambda p:(p,1)).reduceByKey(lambda v1,v2: v1+v2).filter(lambda tp:tp[1]>=threshold)
triple_count.sortBy(lambda a: a[1], False).take(10)

[((10509002, 195662009, 444814009), 63),
 ((15777000, 271737000, 444814009), 57),
 ((15777000, 195662009, 271737000), 42),
 ((59621000, 195662009, 444814009), 39),
 ((15777000, 195662009, 444814009), 36),
 ((15777000, 162864005, 271737000), 33),
 ((15777000, 59621000, 271737000), 33),
 ((195662009, 271737000, 444814009), 33),
 ((72892002, 195662009, 444814009), 30),
 ((10509002, 162864005, 444814009), 30)]

In [15]:
###FREQUENT TRIPLE SET
frequent_triples = list(set(triple_count.map(lambda tp:tp[0]).collect()))
print(frequent_triples[0:10])

[(40055000, 59621000, 195662009), (59621000, 162864005, 444814009), (15777000, 162864005, 271737000), (10509002, 59621000, 162864005), (10509002, 162864005, 195662009), (10509002, 40055000, 444814009), (10509002, 19169002, 195662009), (19169002, 59621000, 195662009), (15777000, 162864005, 444814009), (15777000, 40055000, 162864005)]


In [19]:
min_std_lift = 0.2
x=195662009
# y=444814009

## Association Rules: Confidence


In [34]:
# buckets = data.map()
# buckets.take(5)
def measure_confidence_pairs(item):
    count =0
    for pair in frequent_pairs: #usar dataset inteiro ou so pareS?
        if pair[0] == item or pair[1] == item:
            count+=1
    return count, count/len(frequent_pairs)


In [43]:
c_p,conf_p = measure_confidence_pairs(x)
c_p,conf_p

(9, 0.25)

In [36]:
def measure_confidence_triples(item):
    count =0
    for pair in frequent_triples: #usar dataset inteiro ou so pareS?
        if pair[0] == item or pair[1] == item:
            count+=1
    return count, count/len(frequent_triples)

In [44]:
c_t,conf_t = measure_confidence_triples(x)
c_t,conf_t

(15, 0.2054794520547945)

## Association Rules: Interest

    interestI -> j = p(j|I) − p(j)
    interestI -> j = confidence(I -> j)−baskets containing j / baskets

high positive interest: presence of I indicates the presence of j.

high negative interest: presence of I discourages the presence of j

In [40]:
def measure_interest_pairs(count,conf):
    return conf-count/len(frequent_pairs)
    

In [41]:
measure_interest_pairs(c_p,conf_p)

0.0

## Association rules: Lift

    Lift(I) -> j = confidence(I -> j)/P(j) =P(I, j)/P(I)*P(j)

Lift (also known as the observed/expected ratio) is a measure of the degree of dependence between I and j. 

A lift of 1 indicates that I and j are independent

In [45]:
def measure_lift(count,conf):
    return conf/count
    

In [47]:
lift = measure_lift(c_p,conf_p)
lift

0.027777777777777776

## Association rules: Standardised lift

Standardised lift ranges from 0 to 1.

This facilitates setting a fixed threshold for selecting the rules

In [None]:
def measure_stand_lift(lift,p_i,p_j,n):
    common = (max(p_i+p_j-1,1/n) /(p_i*p_j))
    num = lift-common
    denom = 1/(p_i * p_j)-common
    return num/denom
    

In [None]:
#measure_stand_lift(lift,c_p,)