In [34]:
import pandas as pd

# Pipe-delimited predictions file to analyse. The path does double duty: category() and the
# test cell further down look for "Coral" or "Skin" in fname to pick dataset-specific rules.
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
# Alternative input - presumably the Skin Cancer run; swap with the line above to analyse it.
#fname = "/Users/simon.hughes/Google Drive/PhD/Data/SkinCancer/Results/sc_cause_effect_labels_predictions.txt"
data = pd.read_csv(fname, sep="|")

In [35]:
def group_by(df, bycols, agg_map):
    """
    Aggregate several columns of a DataFrame, each with its own function,
    and return a single flat DataFrame keyed by the group-by column(s).

    Every (column, function) pair is aggregated separately; the per-column
    results are then inner-joined on the group-by column(s).

    @param df:      DataFrame
    @param bycols:  str, list or tuple
                        Column(s) to group by
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
                        Must contain at least one entry (an empty mapping raises IndexError).
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed
    """
    if isinstance(bycols, str):
        bycols = [bycols]
    else:
        # Accept tuples / other iterables, and work on a private copy of the caller's list
        bycols = list(bycols)

    pairs = agg_map.items() if isinstance(agg_map, dict) else agg_map

    grouped = []
    for col, func in pairs:
        # Aggregate one column at a time so each can use a different function
        per_col = df[bycols + [col]].groupby(bycols).agg(func).reset_index()
        grouped.append(per_col)

    merged = grouped[0]
    for other in grouped[1:]:
        merged = pd.merge(merged, other, on=bycols, how="inner")
    return merged

In [36]:
# Turn both label columns into plain strings, representing missing values
# (which stringify as "nan") by the empty string.
for label_col in ("Concept Codes", "Predictions"):
    as_text = data[label_col].astype("str")
    data[label_col] = as_text.apply(lambda s: "" if s == "nan" else s)
data.head(10)

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept Codes,Predictions
0,EBA1415_AEKD_4_CB_ES-05568.ann,1,What leads to differences in the rates of cora...,50,50
1,EBA1415_AEKD_4_CB_ES-05568.ann,2,Coral is often mistaken for a rock but it is m...,,
2,EBA1415_AEKD_4_CB_ES-05568.ann,3,Coral bleaching shows bleaching and healthy bl...,50,50
3,EBA1415_AEKD_4_CB_ES-05568.ann,4,Coral bleaching is almost noticeable in the pa...,50,50
4,EBA1415_AEKD_4_CB_ES-05572.ann,1,The part of coral called zooanthellae are not ...,5,
5,EBA1415_AEKD_4_CB_ES-05572.ann,2,And if they get or much sunlight they start to...,"5,50,_C->R,_CRel,_RRel,Causer,Result,Causer:5,...",50
6,EBA1415_AEKD_4_CB_ES-05572.ann,3,The reason why is because the zooanthellae if ...,4,Result
7,EBA1415_AEKD_4_CB_ES-05572.ann,4,The coral also need INFREQUENT water temperatu...,,
8,EBA1415_AEKD_4_CB_ES-05572.ann,5,Also its a threats for us because means that m...,11,"11,Causer:11"
9,EBA1415_AEKD_4_CB_ES-05572.ann,6,Also the water us getting to salty,13,


In [37]:
def concat(lst):
    """Join an iterable of strings into a single comma-separated string."""
    separator = ","
    return separator.join(lst)

def make_unique(s):
    """
    De-duplicate a comma-separated string of tags.

    Empty entries are dropped; the remaining distinct tags are returned
    comma-separated in sorted (lexicographic, not numeric) order.
    """
    # str.split always returns at least one element, so no empty-list guard is needed:
    # an empty input yields the single tag "" which is discarded just below.
    unique_tags = set(s.split(","))
    unique_tags.discard("")
    return ",".join(sorted(unique_tags))

def codes_only(s):
    """
    Keep only the tags of a comma-separated string that start with a digit
    (the plain concept codes), comma-separated in their original order.
    """
    kept = []
    for tag in s.split(","):
        # Blank entries are skipped; the digit test looks at the raw first character
        if tag.strip() and tag[0].isdigit():
            kept.append(tag)
    return ",".join(kept)

def causal_only(s):
    """
    Keep only the full causal relations ("Causer:X->Result:Y") of a comma-separated
    tag string and return them in the short "X->Y" form, comma-separated.
    """
    relations = []
    for tag in s.split(","):
        if not tag.strip():
            continue
        if "->" in tag and "Causer" in tag and "Result" in tag:
            relations.append(tag)
    joined = ",".join(relations)
    return joined.replace("Causer:", "").replace("Result:", "")

# One row per essay: concatenate the sentence-level tags, then de-duplicate them
grpd = group_by(data, "Essay", {"Concept Codes": concat, "Predictions": concat})
for tag_col in ("Concept Codes", "Predictions"):
    grpd[tag_col] = grpd[tag_col].apply(make_unique)

# Derive the plain-code and causal-relation views of each tag column
for tag_col, prefix in (("Concept Codes", "Ys"), ("Predictions", "Pred")):
    grpd[prefix + "_codes"] = grpd[tag_col].apply(codes_only)
    grpd[prefix + "_causal"] = grpd[tag_col].apply(causal_only)

# Re-order cols
ordered_cols = ["Essay", "Concept Codes", "Ys_codes", "Ys_causal", "Predictions", "Pred_codes", "Pred_causal"]
grpd = grpd[ordered_cols]
grpd.head(10)

Unnamed: 0,Essay,Concept Codes,Ys_codes,Ys_causal,Predictions,Pred_codes,Pred_causal
0,EBA1415_AEKD_4_CB_ES-05568.ann,50,50,,50,50,
1,EBA1415_AEKD_4_CB_ES-05572.ann,"11,13,4,5,50,Causer,Causer:5,Causer:5->Result:...",11134550,5->50,"11,50,Causer:11,Result",1150,
2,EBA1415_AEKD_4_CB_ES-05574.ann,1350,1350,,50,50,
3,EBA1415_AEKD_4_CB_ES-05902.ann,"3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...",34507,"3->4,7->50","3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...",34507,"3->4,7->50"
4,EBA1415_AEKD_5_CB-06232.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...",50,7->50,"50,Causer,Causer:7,Causer:7->Result:50,Result,...",50,7->50
5,EBA1415_AEKD_5_CB_ES-05575.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...",50,7->50,"50,Causer,Causer:7,Result,Result:50,_C->R,_CRe...",50,
6,EBA1415_AEKD_5_CB_ES-05579.ann,150,150,,"1,50,6,Causer,Causer:6,Result,Result:14,_C->R,...",1506,
7,EBA1415_AEKD_5_CB_ES-05582.ann,"50,7,Causer,Causer:7,Causer:7->Result:50,Resul...",507,7->50,"50,7,Causer,Causer:7,Causer:7->Result:50,Resul...",507,7->50
8,EBA1415_AEKD_5_CB_ES-05586.ann,50,50,,"50,Result",50,
9,EBA1415_BGJD_1_CB_ES-05725.ann,"3,50,Causer,Causer:3,Causer:3->Result:50,Cause...",350,"3->50,5->7","3,50,Causer,Causer:3,Causer:3->Result:50,Resul...",350,3->50


In [38]:
def friendly_tag(tag):
    """
    Strip the "Causer:" and "Result:" role prefixes from a tag,
    e.g. "Causer:3->Result:50" becomes "3->50".
    """
    return tag.replace("Causer:", "").replace("Result:", "")

def _code_value(code):
    """
    Return a code as a float when it is a plain non-negative number
    (e.g. "7" or "5.5"); otherwise return it unchanged.
    """
    text = code.strip()
    # str.isdigit() alone rejects "5.5", so allow a single decimal point
    if text.replace(".", "", 1).isdigit():
        return float(text)
    return code

def sort_key(cr):
    """
    Build a sort key (a 4-tuple) for a tag so that tags order as:
    plain tags (numbers ahead of words), then "Causer:" tags, then "Result:" tags,
    then causal relations ("X->Y", ordered by cause then effect), and finally
    tags starting with an underscore.

    The code "5b" is treated as 5.5, so it sorts between 5 and 6.

    Raises ValueError if a relation does not contain exactly one "->", or if a
    "Causer:"/"Result:" tag or digit-leading tag is not numeric.
    """
    cr = cr.replace("5b", "5.5")
    # _'s last
    if cr.startswith("_"):
        return (99999999, cr, cr, cr)
    # Causal second to last, ordered by the cause then the effect
    if "->" in cr:
        cr = friendly_tag(cr)
        cause, effect = cr.split("->")
        # Parse via _code_value so the "5.5" produced above is compared numerically
        return (9000, _code_value(cause), _code_value(effect), cr)
    # Causers (-2) sort before results (-1)
    if "Result:" in cr:
        cr = friendly_tag(cr)
        return (-1, float(cr), -1, cr)
    if "Causer:" in cr:
        cr = friendly_tag(cr)
        return (-2, float(cr), -1, cr)
    # Regular tags first, numbers ahead of words
    if cr[:1].isdigit():
        return (-10, float(cr), -1, cr)
    return (-10, 9999.9, -1, cr.lower())

def category(s):
    """
    Assign a comma-separated tag string (all tags of one essay) to a category 1-5.

        1 - no numeric concept codes and no causal tags
        2 - concept codes only, no causal tags
        3 - causal tag(s), but no pair of relations chaining together (X->Y, Y->Z)
            in which a relation ends in 50
        4 - chained relations reaching 50 that consist solely of the dataset's
            designated chain (Coral: exactly 6->7->50; Skin: only codes 4, 5, 6, 50)
        5 - any other chained relations reaching 50

    Two relations only count as a chain when the first sorts before the second
    under sort_key (cause, then effect), i.e. chains must run in increasing order.

    The dataset is chosen by looking for "Coral" / "Skin" in the module-level
    `fname`; this is only consulted once two or more causal tags are present.

    @param s:   str, comma-separated tags e.g. "3,50,Causer:3->Result:50"
    @return:    int between 1 and 5
    @raise Exception: if chains must be evaluated and fname names neither dataset
    """
    if not s or s == "nan":
        return 1

    # Materialise the non-blank tokens once, already stripped: the list is scanned
    # several times below (a lazy filter() would be exhausted after the first scan on
    # Python 3) and the digit test must not be defeated by leading whitespace.
    tokens = [t.strip() for t in s.split(",")]
    tokens = [t for t in tokens if t]

    regular = [t for t in tokens if t[0].isdigit()]
    any_causal = [t for t in tokens if "->" in t and (("Causer" in t and "Result" in t) or "C->R" in t)]
    causal = [t for t in tokens if "->" in t and "Causer" in t and "Result" in t]

    if not regular and not any_causal:
        return 1
    if not any_causal:  # i.e. by this point regular must have some
        return 2  # no causal
    # if only one causal then must be 3
    if len(any_causal) == 1 or len(causal) == 1:
        return 3

    # Map to Num->Num, e.g. Causer:3->Result:50 becomes 3->50
    # Also remap selected codes (e.g. 6 to 16 and 7 to 17) to enforce the relative size relationship.
    # NOTE(review): these are plain substring replacements - they assume no other code
    # contains the replaced digits; verify against the code book.
    def map_cb(code):
        return code.replace("6", "16").replace("7", "17")

    def map_sc(code):
        return code.replace("4", "14").replace("5", "15").replace("6", "16").replace("150", "50")

    if "Coral" in fname:
        remap = map_cb
    elif "Skin" in fname:
        remap = map_sc
    else:
        raise Exception("Unrecognized filename")

    crels = sorted((remap(friendly_tag(t)).strip() for t in causal), key=sort_key)
    # Compute each key once rather than on every inner-loop comparison
    keyed = [(sort_key(rel), rel) for rel in crels]

    un_results = set()
    # For each unique pairwise combination
    for a_key, a in keyed:
        for b_key, b in keyed:
            if b_key >= a_key:  # don't compare each pair twice (a,b) == (b,a)
                break
            # b is always the smaller of the two
            bc, br = b.split("->")
            ac, ar = a.split("->")
            # if result from b is causer for a
            if br.strip() == ac.strip():
                un_results.add((b, a))

    if not un_results:
        return 3

    # To be a 4 or a 5, at least one chained relation needs to end in a 50
    if not any("->50" in rel for pair in un_results for rel in pair):
        return 3

    # CB and 6->7->50 ONLY
    if len(un_results) == 1 and "Coral" in fname and ("16->17", "17->50") in un_results:
        return 4
    if len(un_results) <= 2 and "Skin" in fname:
        # 4->5->6->50: every code in every chained relation must belong to this chain
        codes = {"14", "15", "16", "50"}
        if all(part in codes
               for pair in un_results
               for rel in pair
               for part in rel.split("->")):
            return 4
    return 5

In [39]:
def check(inp, expected):
    """
    Assert that category(inp) equals expected; on failure the assertion
    message reports the input together with the actual and expected categories.
    """
    got = category(inp)
    failure_msg_args = (inp, got, expected)
    assert got == expected, "Inp: %s Actual:%i Expected:%i" % failure_msg_args

# Regression checks for category(). The first group applies to either dataset; the
# branches at the bottom add checks specific to the dataset named in fname.

# Category 1: nothing but blanks
check("" , 1)
check(" ", 1)

# Category 2: concept codes only, no causal tags
check("1", 2)
check("1,3", 2)
check("1,,,,4,7,50,1289", 2)

# Category 3: causal relations that do not chain together
check("Causer:1->Result:3", 3)
check("Causer:1->Result:3,Causer:4->Result:5", 3)
# Several independent relations ending in 50 (no chaining) are still a 3
check("Causer:1->Result:50,Causer:3->Result:50,Causer:5->Result:50", 3)
check("Causer:1->Result:3, Causer:4->Result:5", 3)

# The generic _C->R tag on its own counts as a single causal tag
check("_C->R", 3)
check("3,4,5,_C->R", 3)

check("Causer:6->Result:7", 3)
check("Causer:5->Result:5b", 3)

check("Causer:3->Result:4, Causer:1->Result:4, Causer:3->Result:50", 3)
# No relation whose result is the causer of another (no intervening link)
check("Causer:1->Result:4,Causer:6->Result:7", 3)
check("Causer:4->Result:5,Causer:1->Result:3", 3)
check("Causer:1->Result:50, Causer:2->Result:50, Causer:6->Result:50, Causer:7->Result:50", 3)

# Chains not going to 50
check("Causer:1->Result:3, Causer:5->Result:7, Causer:3->Result:6", 3)
check("Causer:1->Result:3, Causer:3->Result:4, Causer:4->Result:6", 3)

# Chains that are a 5 for both datasets
check("Causer:5b->Result:50,Causer:5->Result:5b", 5) # 5b must chain like any other code
check("Causer:1->Result:3,Causer:3->Result:50", 5)
check("Causer:1->Result:3, Causer:5->Result:7, Causer:3->Result:50", 5)


if "Skin" in fname:
    print "Skin Cancer"
    check("Causer:11->Result:4,Causer:4->Result:6", 3)
    
    # Category 4: chains built only from codes 4, 5, 6 and 50
    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:50", 4)
    check("Causer:5->Result:50,Causer:4->Result:5", 4)
    check("Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:6,Causer:6->Result:50,", 4)
    check("1,Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,Causer:1->Result:2", 4)

    # Category 5: a chain involving any other code
    check("Causer:1->Result:2,Causer:2->Result:50", 5)
    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,Causer:2->Result:4", 5)
    
elif "Coral" in fname:
    print "Coral Bleaching"
    check("Causer:6->Result:7", 3)
    check("Causer:7->Result:50", 3)

    # Category 4: the 6->7->50 chain and no other chain (order of tags is irrelevant)
    check("Causer:6->Result:7,Causer:7->Result:50", 4)
    check("Causer:7->Result:50,Causer:6->Result:7", 4)
    check("Causer:7->Result:50,Causer:6->Result:7,Causer:3->Result:4", 4)

    # Category 5: additional or different chained links
    check("Causer:7->Result:50,Causer:6->Result:7,Causer:3->Result:6", 5)
    check("Causer:1->Result:6,Causer:6->Result:7,Causer:7->Result:50", 5)
    check("Causer:13->Result:6,Causer:6->Result:50", 5)

print "Tests passed!"

Coral Bleaching
Tests passed!


In [40]:
# Categorise every essay twice: once from its concept codes (Ys_cat)
# and once from its predictions (Pred_cat).
grpd["Ys_cat"] = grpd["Concept Codes"].apply(category)
grpd["Pred_cat"] = grpd["Predictions"].apply(category)

In [41]:
# Absolute distance between the category from the concept codes and from the predictions
grpd["Diff"] = (grpd["Ys_cat"] - grpd["Pred_cat"]).abs()
abbrev_cols = ["Ys_codes", "Ys_causal", "Ys_cat", "Pred_codes", "Pred_causal", "Pred_cat", "Diff"]
abbrev = grpd[abbrev_cols]
abbrev.head(20)

Unnamed: 0,Ys_codes,Ys_causal,Ys_cat,Pred_codes,Pred_causal,Pred_cat,Diff
0,50.0,,2,50.0,,2,0
1,11134550.0,5->50,3,1150.0,,2,1
2,1350.0,,2,50.0,,2,0
3,34507.0,"3->4,7->50",3,34507.0,"3->4,7->50",3,0
4,50.0,7->50,3,50.0,7->50,3,0
5,50.0,7->50,3,50.0,,3,0
6,150.0,,2,1506.0,,3,1
7,507.0,7->50,3,507.0,7->50,3,0
8,50.0,,2,50.0,,2,0
9,350.0,"3->50,5->7",3,350.0,3->50,3,0


In [42]:
# Inspect the first essays whose concept codes put them in category 4
abbrev[abbrev["Ys_cat"] == 4][["Ys_codes","Ys_causal", "Ys_cat"]].head(20)

Unnamed: 0,Ys_codes,Ys_causal,Ys_cat
60,5067,"6->7,7->50",4
87,135067,"1->3,6->7,7->50",4
94,235067,"3->50,6->50,6->7,7->50",4
135,1111314345067,"1->3,11->14,13->14,4->50,6->7,7->50",4
142,145067,"14->50,6->14,6->7,7->50",4
151,145067,"14->50,6->14,6->7,7->50",4
162,25067,"6->7,7->50",4
215,111314345067,"11->14,13->14,4->14,6->7,7->50",4
237,1213145067,"6->7,7->50",4
263,145067,"14->50,6->14,6->7,7->50",4


In [43]:
def is_inverted(s):
    """
    Tell whether a short-form causal relation "X->Y" runs from a larger code
    to a smaller one (X > Y numerically).

    The code "5b" is treated as 5.5, matching sort_key. A string without "->"
    is never inverted.

    @param s:   str e.g. "3->1"
    @return:    bool
    @raise ValueError: if a side is not numeric, or there is more than one "->"
    """
    if "->" not in s:
        return False
    # int() would choke on the "5b" code, so map it to 5.5 and compare as floats
    cause, effect = [float(part.strip().replace("5b", "5.5")) for part in s.split("->")]
    return cause > effect

def any_inverted(s):
    """
    Return the inverted relations (see is_inverted) found in a comma-separated
    list of short-form relations, comma-separated in their original order.
    Returns "" when the input is blank or nothing is inverted.
    """
    if s.strip() == "":
        return ""
    # Joining an empty selection already yields "", so no separate empty check is needed
    return ",".join([rel for rel in s.split(",") if is_inverted(rel)])

# Record, per essay, which of its annotated causal relations are inverted,
# then show the first essays that have at least one.
grpd["Ys_inverted"] = grpd["Ys_causal"].apply(any_inverted)
grpd[grpd["Ys_inverted"].str.strip() != ""][["Essay","Ys_causal","Ys_inverted"]].head(10)

Unnamed: 0,Essay,Ys_causal,Ys_inverted
19,EBA1415_BGJD_2_CB_ES-05740.ann,"1->3,1->50,13->50,3->1,3->50",3->1
70,EBA1415_BLRW_5_CB_ES-05192.ann,"1->50,3->1",3->1
154,EBA1415_KNKC_1_CB_ES-05410.ann,"1->50,3->1,7->50",3->1
217,EBA1415_KYLS_5_CB_ES-05662.ann,"1->2,1->50,11->13,13->14,3->1,3->50,6->7,7->50",3->1
297,EBA1415_LRBL_4_CB-05167.ann,"3->1,6->7",3->1
315,EBA1415_LRJE_7_CB_ES-05135.ann,"1->50,14->50,3->1,3->4,3->5,4->14,7->50",3->1
354,EBA1415_RCGJ_4a_CB_ES-04678.ann,"3->1,4->5,7->50",3->1
371,EBA1415_RDCS_2_CB-04715.ann,"3->1,3->50",3->1
490,EBA1415_SDMK_7_CB_ES-04782.ann,"1->50,14->50,3->1,6->14,7->50",3->1
527,EBA1415_SEKL_1_CB_ES-04818.ann,"1->50,3->1,4->5",3->1


In [44]:
# Which distinct inverted-relation strings occur across all essays?
grpd["Ys_inverted"].unique()

array(['', '3->1'], dtype=object)

In [45]:
# Locate the individual sentences whose concept codes contain the relation Causer:3->Result:1
data[data["Concept Codes"].str.contains("Causer:3->Result:1")][["Essay", "Sent Number"]]

Unnamed: 0,Essay,Sent Number
129,EBA1415_BGJD_2_CB_ES-05740.ann,1
702,EBA1415_BLRW_5_CB_ES-05192.ann,1
1458,EBA1415_KNKC_1_CB_ES-05410.ann,5
2094,EBA1415_KYLS_5_CB_ES-05662.ann,2
2864,EBA1415_LRBL_4_CB-05167.ann,6
3025,EBA1415_LRJE_7_CB_ES-05135.ann,4
3419,EBA1415_RCGJ_4a_CB_ES-04678.ann,6
3549,EBA1415_RDCS_2_CB-04715.ann,3
4735,EBA1415_SDMK_7_CB_ES-04782.ann,10
5140,EBA1415_SEKL_1_CB_ES-04818.ann,4


## Accuracy

In [46]:
# "Accuracy": share of essays where the category from the predictions equals the category
#             from the concept codes.
# "Adj":      share of essays where the two categories differ by at most one (Diff <= 1).
# float(...) forces true division under Python 2.
print "Accuracy", round(len(grpd[grpd["Ys_cat"] == grpd["Pred_cat"]]) / float(len(grpd)),4)
print "Adj     ", round(len(grpd[grpd["Diff"] <=1]) / float(len(grpd)),4)
print len(grpd), "essays"

Accuracy 0.7713
Adj      0.8679
1128 essays
