In [1]:
import random
import math
import json
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
from mlxtend.frequent_patterns import apriori
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the blackcellmagic IPython extension.
# NOTE(review): presumably this enables a cell magic for formatting code with black; no cell visible here uses it — confirm it is still needed.
%load_ext blackcellmagic

In [3]:
# Answers table: the first CSV column becomes the row index, and every
# column label is cut down to the text before its first underscore.
ANSWERS_CSV = "answers.csv"
adf = pd.read_csv(ANSWERS_CSV, index_col=0).rename(
    columns=lambda label: label.split("_")[0]
)
adf.head(3)

Unnamed: 0,j050601,j050701,j050702,j052201,j052501,j052601,j052602,j053801,j053802,s051542,...,v051601,v051701,v052401,v053001,v053201,v054501,v054601,v064101,v064201,v090201
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0


In [4]:
# Binary feature table: indexed by the second CSV column (shown as
# "filename" in the output); the leftover "Unnamed: 0" column is discarded.
BINARY_FEATURES_CSV = "binary.csv"
bdf = pd.read_csv(BINARY_FEATURES_CSV, index_col=1).drop(columns=["Unnamed: 0"])
bdf.head(3)

Unnamed: 0_level_0,shows_gpa,took_algorithms,took_data_science,took_security,took_data_structures,took_databases,took_oop,took_swe,knows_vcs,knows_android,...,has_awards,has_non_tech_exp,has_tech_exp,has_leader_role,has_member_role,is_transfer,has_volunteer_exp,has_github,has_errors,is_foreign
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s052220,1,1,0,0,1,1,1,1,1,1,...,1,0,1,1,1,0,0,1,0,1
v090201,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,1,0,1,1,0,0
s052922,1,0,1,0,1,0,1,0,0,0,...,0,0,1,1,1,0,0,0,0,0


In [5]:
# Mine itemsets present in at least half of the rows, record each itemset's
# size, and rank the multi-item ones by support.
fqis = apriori(bdf, min_support=0.5, use_colnames=True)
fqis["length"] = fqis["itemsets"].map(len)
top = fqis[fqis["length"] > 1].sort_values("support", ascending=False)
top.head(5)

Unnamed: 0,support,itemsets,length
33,0.72,"(knows_java, has_tech_exp)",2
19,0.68,"(knows_java, took_oop)",2
24,0.64,"(has_tech_exp, took_oop)",2
30,0.64,"(knows_java, knows_c)",2
42,0.64,"(knows_c, has_tech_exp)",2


In [6]:
def contingency_table(bdf, itemset):
    """Count the 2x2 co-occurrence table for the first two items of ``itemset``.

    Parameters
    ----------
    bdf : pandas.DataFrame
        Frame containing one column per item; cells are compared against
        1 (present) and 0 (absent). Rows where either column is neither
        1 nor 0 (e.g. NaN) are not counted in any cell.
    itemset : iterable
        Column labels. It is converted with ``tuple(itemset)`` and only
        elements 0 (item A) and 1 (item B) are used, so for a set-like
        input A and B follow the set's iteration order.

    Returns
    -------
    tuple of int
        ``(f11, f10, f01, f00)``: rows with A and B, A without B,
        B without A, and neither.
    """
    items = tuple(itemset)
    a = items[0]
    b = items[1]
    # Boolean masks instead of a string-built ``DataFrame.query``: the query
    # form fails for column labels that are not valid Python identifiers
    # (spaces, dashes, keywords, ...).
    a_present = bdf[a] == 1
    a_absent = bdf[a] == 0
    b_present = bdf[b] == 1
    b_absent = bdf[b] == 0
    f11 = int((a_present & b_present).sum())
    f10 = int((a_present & b_absent).sum())
    f01 = int((a_absent & b_present).sum())
    f00 = int((a_absent & b_absent).sum())
    return f11, f10, f01, f00


def show_contingency_table(itemset, f11, f10, f01, f00):
    """Print a 2x2 contingency table with its margins and return the counts.

    Parameters
    ----------
    itemset : iterable
        Item labels; elements 0 and 1 of ``tuple(itemset)`` are printed as
        A and B.
    f11, f10, f01, f00 : int
        Cell counts: A and B, A without B, B without A, neither.

    Returns
    -------
    tuple
        ``(f11, f10, f01, f00)`` unchanged, so the call can be chained.
    """
    iset = tuple(itemset)
    a = iset[0]
    b = iset[1]
    f1p = f11 + f10
    f0p = f01 + f00
    fp1 = f11 + f01
    fp0 = f10 + f00
    # The grand total comes from the counts themselves rather than from the
    # global ``bdf``; the printed total then always agrees with the margins
    # and the function works for any frame the counts were taken from.
    N = f1p + f0p
    print("A = {}".format(a))
    print("B = {}".format(b))
    print("  \t+B\t-B    ")
    print("+A\t{}\t{}\t{}".format(f11, f10, f1p))
    print("-A\t{}\t{}\t{}".format(f01, f00, f0p))
    print("  \t{}\t{}\t{}".format(fp1, fp0, N))
    return f11, f10, f01, f00


def support(f11, f10, f01, f00):
    """Return the fraction of all counted rows that contain both A and B."""
    return f11 / sum((f11, f10, f01, f00))

def support_a(f11, f10, f01, f00):
    """Return the fraction of all counted rows that contain A."""
    rows_with_a = f11 + f10
    return rows_with_a / sum((f11, f10, f01, f00))

def support_b(f11, f10, f01, f00):
    """Return the fraction of all counted rows that contain B."""
    rows_with_b = f11 + f01
    return rows_with_b / sum((f11, f10, f01, f00))

def confidence_ab(f11, f10, f01, f00):
    """Return confidence(A -> B): the share of rows with A that also have B."""
    return f11 / (f11 + f10)

def confidence_ba(f11, f10, f01, f00):
    """Return confidence(B -> A): the share of rows with B that also have A."""
    return f11 / (f11 + f01)

def interest_factor(f11, f10, f01, f00):
    """Return the interest factor N * f11 / (f1+ * f+1) of a 2x2 table.

    ``N`` is the sum of the four counts, ``f1+`` the number of rows with A
    and ``f+1`` the number of rows with B.
    """
    scaled_joint = sum((f11, f10, f01, f00)) * f11
    margin_product = (f11 + f10) * (f11 + f01)
    return scaled_joint / margin_product

def phi_correlation(f11, f10, f01, f00):
    """Return the phi coefficient of a 2x2 contingency table.

    Computed as ``(f11 * f00 - f01 * f10) / sqrt(f1+ * f+1 * f0+ * f+0)``,
    where the four factors under the root are the two row totals and the
    two column totals.

    Returns
    -------
    float
        The coefficient, or ``float("nan")`` when any row or column total
        is zero (for example an item that occurs in every row). The ratio
        is undefined in that case and previously raised ZeroDivisionError.
    """
    f1p = f11 + f10
    f0p = f01 + f00
    fp1 = f11 + f01
    fp0 = f10 + f00
    num = (f11 * f00) - (f01 * f10)
    denom = math.sqrt(f1p * fp1 * f0p * fp0)
    if denom == 0:
        # Degenerate table: at least one margin is empty.
        return float("nan")
    return num / denom

def is_score(f11, f10, f01, f00):
    """Return the IS score: sqrt(interest_factor * support) of a 2x2 table."""
    counts = (f11, f10, f01, f00)
    return math.sqrt(interest_factor(*counts) * support(*counts))

# Association Rule Evaluation Metrics

- Support(A, B) = What proportion of the data shows both A and B?
- Confidence(A -> B) = How often does B occur when A occurs?
- Interest(A, B) = 1 if A and B are statistically independent, > 1 if they are positively correlated, < 1 if they are negatively correlated.

## Considerations

- Asymmetric itemsets may indicate preferences: e.g., a good SWE knows Java, but someone who knows Java may not be a good SWE.
- Interest Factor is a correlation metric. Phi-correlation is not necessarily appropriate for asymmetric itemsets.
- IS Score should be preferred for asymmetric itemsets.

In [12]:
def itemset_metrics(bdf, its):
    """Print the contingency table and association metrics for an itemset.

    The first two elements of ``tuple(its)`` are treated as items A and B.
    Counts come from ``contingency_table(bdf, its)``; the table is printed
    first, followed by both single-item supports, the joint support, both
    confidences, a Symmetric/Asymmetric label (Symmetric only when the two
    confidences compare exactly equal), interest factor, phi and IS score.
    """
    items = tuple(its)
    a, b = items[0], items[1]
    counts = contingency_table(bdf, its)
    show_contingency_table(its, *counts)
    conf_ab = confidence_ab(*counts)
    conf_ba = confidence_ba(*counts)
    print("Support({0}) = {1:.3f}".format(a, support_a(*counts)))
    print("Support({0}) = {1:.3f}".format(b, support_b(*counts)))
    print("Support({0}, {1}) = {2:.3f}".format(a, b, support(*counts)))
    print("Confidence({0} -> {1}) = {2:.3f}".format(a, b, conf_ab))
    print("Confidence({0} -> {1}) = {2:.3f}".format(b, a, conf_ba))
    symmetry = "Asymmetric" if conf_ab != conf_ba else "Symmetric"
    print(symmetry)
    print("Interest({0}, {1}) = {2:.3f}".format(a, b, interest_factor(*counts)))
    print("Phi({0}, {1}) = {2:.3f}".format(a, b, phi_correlation(*counts)))
    print("IS({0}, {1}) = {2:.3f}".format(a, b, is_score(*counts)))
    
# Row label 19 of fqis is the (knows_java, took_oop) itemset in the recorded run.
itemset_metrics(bdf, fqis.loc[19, "itemsets"])

A = knows_java
B = took_oop
  	+B	-B    
+A	17	4	21
-A	1	3	4
  	18	7	25
Support(knows_java) = 0.840
Support(took_oop) = 0.720
Support(knows_java, took_oop) = 0.680
Confidence(knows_java -> took_oop) = 0.810
Confidence(took_oop -> knows_java) = 0.944
Asymmetric
Interest(knows_java, took_oop) = 1.124
Phi(knows_java, took_oop) = 0.457
IS(knows_java, took_oop) = 0.874


# Rules from Accepting Itemsets

In [13]:
# Take the answers row at position/label `rater_idx` as a Series keyed by
# adf's column labels, then attach it (and its inverse) to a copy of the
# feature table; values are aligned on the index.
rater_idx = 5
rates = adf.iloc[rater_idx : rater_idx + 1].T[rater_idx]
rates_inv = rates.map(lambda answer: 0 if answer else 1)
jdf = bdf.assign(accepted=rates, rejected=rates_inv)
jdf[["accepted", "rejected", "took_oop", "knows_java"]].head(3)

Unnamed: 0_level_0,accepted,rejected,took_oop,knows_java
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
s052220,1.0,0,1,1
v090201,0.0,1,1,1
s052922,0.0,1,1,1


In [14]:
# Mine itemsets of at most two items from the features plus the
# accepted/rejected columns, keep the pairs, and flag which pairs involve
# the rater's decision.
rq = apriori(jdf, min_support=0.5, max_len=2, use_colnames=True)
rq["length"] = [len(s) for s in rq["itemsets"]]
# .copy() makes the filtered frame independent of the query() result, so the
# column assignments below (and in later cells) do not raise
# SettingWithCopyWarning.
rq = rq.query("length == 2").copy()
rq["accepting"] = [1 if "accepted" in s else 0 for s in rq["itemsets"]]
rq["rejecting"] = [1 if "rejected" in s else 0 for s in rq["itemsets"]]
rq.query("accepting == 1").sort_values(by=["support"], ascending=False)

Unnamed: 0,support,itemsets,length,accepting,rejecting
57,0.64,"(accepted, has_tech_exp)",2,1,0
39,0.6,"(accepted, knows_java)",2,1,0
28,0.52,"(accepted, took_oop)",2,1,0
46,0.52,"(accepted, knows_c)",2,1,0
50,0.52,"(accepted, knows_web)",2,1,0
53,0.52,"(accepted, has_projects)",2,1,0


In [15]:
# Row label 57 of rq is the (accepted, has_tech_exp) itemset in the recorded run.
itemset_metrics(jdf, rq.loc[57, "itemsets"])

A = accepted
B = has_tech_exp
  	+B	-B    
+A	16	0	16
-A	5	4	9
  	21	4	25
Support(accepted) = 0.640
Support(has_tech_exp) = 0.840
Support(accepted, has_tech_exp) = 0.640
Confidence(accepted -> has_tech_exp) = 1.000
Confidence(has_tech_exp -> accepted) = 0.762
Asymmetric
Interest(accepted, has_tech_exp) = 1.190
Phi(accepted, has_tech_exp) = 0.582
IS(accepted, has_tech_exp) = 0.873


In [16]:
# Column labels, in display order, used to select the item names and the
# association metrics from the `rq` frame.
metrics_list = [
    "a",
    "b",
    "support",
    "support(a)",
    "support(b)",
    "confidence(a -> b)",
    "confidence(b -> a)",
    "interest(a, b)",
    "phi(a, b)",
    "is(a, b)",
]


def get_metric(df, metric):
    """Build a one-argument function that evaluates ``metric`` on an itemset.

    The returned callable takes an itemset, computes its counts with
    ``contingency_table(df, itemset)`` and passes them positionally to
    ``metric``, returning the result.
    """
    return lambda its: metric(*contingency_table(df, its))


# Name the two items of every pair, then add one column per association
# metric, each evaluated on the pair's contingency table in jdf.
rq["a"] = rq["itemsets"].map(lambda pair: tuple(pair)[0])
rq["b"] = rq["itemsets"].map(lambda pair: tuple(pair)[1])
pair_metrics = {
    "support(a)": support_a,
    "support(b)": support_b,
    "confidence(a -> b)": confidence_ab,
    "confidence(b -> a)": confidence_ba,
    "interest(a, b)": interest_factor,
    "phi(a, b)": phi_correlation,
    "is(a, b)": is_score,
}
for metric_column, metric_fn in pair_metrics.items():
    rq[metric_column] = rq["itemsets"].apply(get_metric(jdf, metric_fn))
rq[metrics_list].sort_values(by="is(a, b)", ascending=False).head(10)

Unnamed: 0,a,b,support,support(a),support(b),confidence(a -> b),confidence(b -> a),"interest(a, b)","phi(a, b)","is(a, b)"
20,knows_java,took_oop,0.68,0.84,0.72,0.809524,0.944444,1.124339,0.456849,0.874386
57,accepted,has_tech_exp,0.64,0.64,0.84,1.0,0.761905,1.190476,0.581914,0.872872
47,knows_web,knows_js,0.52,0.64,0.56,0.8125,0.928571,1.450893,0.678234,0.868599
35,knows_java,has_tech_exp,0.72,0.84,0.84,0.857143,0.857143,1.020408,0.107143,0.857143
32,knows_java,knows_c,0.64,0.84,0.68,0.761905,0.941176,1.120448,0.402309,0.84681
45,knows_c,has_tech_exp,0.64,0.68,0.84,0.941176,0.761905,1.120448,0.402309,0.84681
25,has_tech_exp,took_oop,0.64,0.84,0.72,0.761905,0.888889,1.058201,0.213844,0.822951
51,has_tech_exp,has_projects,0.64,0.84,0.72,0.761905,0.888889,1.058201,0.213844,0.822951
34,knows_java,has_projects,0.64,0.84,0.72,0.761905,0.888889,1.058201,0.213844,0.822951
49,knows_web,has_tech_exp,0.6,0.64,0.84,0.9375,0.714286,1.116071,0.354604,0.818317


# Rules from Rejecting Itemsets

In [20]:
# NOTE(review): this table is empty in the recorded run. `rq` was mined with
# min_support=0.5, while the accepted/has_tech_exp contingency table shows
# only 9 of 25 rows not accepted, so presumably no itemset containing
# "rejected" can reach that support. A lower min_support would be needed to
# surface rejecting rules.
rq.loc[rq["rejecting"] == 1, metrics_list].sort_values("support", ascending=False)

Unnamed: 0,a,b,support,support(a),support(b),confidence(a -> b),confidence(b -> a),"interest(a, b)","phi(a, b)","is(a, b)"
