In [6]:
import pandas as pd
from EssayCategory import *

# NOTE(review): absolute path on one user's machine; the notebook only runs
# where this exact directory exists.
root = "/Users/simon.hughes/Google Drive/PhD/Data/ActiveLearning/"
out_predictions_file        = root + "output/predictions.txt"

# The predictions file is read as pipe-delimited text.
data = pd.read_csv(out_predictions_file, sep="|")

In [7]:
def group_by(df, bycols, agg_map):
    """
    Group `df` by `bycols`, aggregate one column at a time and return a flat frame.

    Every (column, aggregate) pair is aggregated in its own groupby; the
    per-column results are then inner-merged on the grouping column(s), so the
    result has ordinary columns instead of a multi-level index.

    @param df:      DataFrame
    @param bycols:  str, list or tuple
                        Column(s) to group by
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed
    @raise ValueError:  if agg_map is empty (there is nothing to aggregate)
    """
    # isinstance (rather than type(...) ==) also accepts subclasses.
    if isinstance(bycols, str):
        bycols = [bycols]
    else:
        # Copy into a list so tuples work with the list concatenation below.
        bycols = list(bycols)

    if isinstance(agg_map, dict):
        agg_map = agg_map.items()
    agg_pairs = list(agg_map)
    if not agg_pairs:
        raise ValueError("agg_map must contain at least one (column, aggregate) pair")

    # One small frame per aggregated column, each with the group keys as columns.
    grps = [
        df[bycols + [col]].groupby(bycols).agg(func).reset_index()
        for col, func in agg_pairs
    ]

    m = grps[0]
    for grp in grps[1:]:
        m = pd.merge(m, grp, on=bycols, how="inner")
    return m

In [8]:
# Turn both label columns into text; cells whose text form is "nan"
# (how a missing value comes out of astype) are blanked.
for label_col in ("Concept Codes", "Predictions"):
    data[label_col] = data[label_col].astype("str").apply(lambda text: "" if text == "nan" else text)
data.head(10)

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept Codes,Predictions
0,EBA1415_TFMV_3_CB_ES-05847.ann,1,"Coral bleaching , or a phenomenon where health...",50,"50,_C->R,Result:50"
1,EBA1415_TFMV_3_CB_ES-05847.ann,2,The rates of coral bleaching differ because of...,50,50
2,EBA1415_TFMV_3_CB_ES-05847.ann,3,"INFREQUENT , such as heavy storms and hurrican...","11,50,_C->R,_CRel,_RRel,Causer,Result,explicit...","11,50,_C->R,_CRel,_RRel,Causer,Result,explicit..."
3,EBA1415_TFMV_3_CB_ES-05847.ann,4,The massive amount of rain INFREQUENT sets the...,13,
4,EBA1415_TFMV_3_CB_ES-05847.ann,5,Corals are highly sensitive to the amount of s...,13,13
5,EBA1415_TFMV_3_CB_ES-05847.ann,6,"The storms cause a decrease , which is INFREQU...",11,"11,14,_C->R,_CRel,_RRel,Causer,Result,explicit..."
6,EBA1415_TFMV_3_CB_ES-05847.ann,7,Another reason are trade winds .,1,"1,_C->R,_CRel,_RRel,Causer,Result,explicit"
7,EBA1415_TFMV_3_CB_ES-05847.ann,8,"According to data collected from 0000 - 0000 ,...","1,50,_C->R,_CRel,_RRel,Causer,Result,Causer:1,...","1,50,_C->R,_CRel,_RRel,Causer,Result,explicit,..."
8,EBA1415_TFMV_3_CB_ES-05847.ann,9,"From 0000 - 00 , wind trade was almost at - 00...",1,
9,EBA1415_TFMV_3_CB_ES-05847.ann,10,Coral bleaching reports that year were over 00...,50,50


In [49]:
def concat(lst):
    """Join the items of `lst` into a single comma-separated string."""
    separator = ","
    return separator.join(lst)

def make_unique(s):
    """
    De-duplicate the comma-separated tokens in `s`.

    @param s: str, comma-separated tokens
    @return:  str, the distinct non-empty tokens, sorted as strings
              (so "12" comes before "2") and joined with commas
    """
    # str.split always yields at least one item, so empty input needs no special case.
    distinct = set(s.split(","))
    distinct.discard("")
    return ",".join(sorted(distinct))

def codes_only(s):
    """
    Keep only the tokens of the comma-separated string `s` whose first
    character is a digit, preserving their order.
    """
    kept = []
    for token in s.split(","):
        # Blank tokens are skipped; the digit test looks at the raw first character.
        if len(token.strip()) == 0:
            continue
        if token[0].isdigit():
            kept.append(token)
    return ",".join(kept)

def causal_only(s):
    """
    Keep only the full causal relations in the comma-separated string `s`
    (tokens containing "->", "Causer" and "Result") and return them with the
    "Causer:" / "Result:" labels removed, e.g. "Causer:1->Result:50" -> "1->50".
    """
    relations = []
    for token in s.split(","):
        if len(token.strip()) == 0:
            continue
        if "->" in token and "Causer" in token and "Result" in token:
            relations.append(token)

    text = ",".join(relations)
    for label in ("Causer:", "Result:"):
        text = text.replace(label, "")
    return text

# One row per essay: concatenate every sentence's labels for that essay.
grpd = group_by(data, "Essay", {"Concept Codes": concat, "Predictions": concat})

# For each label source: de-duplicate the tokens, then derive the numeric
# codes and the causal relations into "<prefix>_codes" / "<prefix>_causal".
for source_col, prefix in (("Concept Codes", "Ys"), ("Predictions", "Pred")):
    grpd[source_col] = grpd[source_col].apply(make_unique)
    grpd[prefix + "_codes"] = grpd[source_col].apply(codes_only)
    grpd[prefix + "_causal"] = grpd[source_col].apply(causal_only)

#Re-order cols
column_order = [
    "Essay",
    "Concept Codes", "Ys_codes", "Ys_causal",
    "Predictions", "Pred_codes", "Pred_causal",
]
grpd = grpd[column_order]
grpd.head(10)

Unnamed: 0,Essay,Concept Codes,Ys_codes,Ys_causal,Predictions,Pred_codes,Pred_causal
0,EBA1415_AEKD_4_SC_ES-05566.ann,"1,12,2,3,4,5,50,Causer,Causer:1,Causer:1->Resu...",112234550.0,"1->2,1->50,4->5","1,12,2,3,4,5,50,Causer,Causer:1,Causer:1->Resu...",112234550.0,"1->2,12->3,4->5"
1,EBA1415_AEKD_4_SC_ES-05567.ann,"2,4,5,50,6,Causer,Causer:2,Causer:2->Result:50...",245506.0,"2->50,4->5","2,4,5,50,6,Causer,Causer:2,Causer:2->Result:50...",245506.0,"2->50,4->5"
2,EBA1415_AEKD_4_SC_ES-05568_9.ann,550,550.0,,"5,50,Causer,Causer:5->Result:50,Result,_C->R,_...",550.0,5->50
3,EBA1415_AEKD_4_SC_ES-05569.ann,"1,3,5,50,Causer,Causer:12,Causer:12->Result:2,...",13550.0,"12->2,12->3,12->50,3->50","3,5,50,Causer,Causer:12,Causer:12->Result:3,Ca...",3550.0,"12->3,3->50,5->50"
4,EBA1415_AEKD_4_SC_ES-05570.ann,"1,2,4,5,50,6,Causer,Causer:1,Causer:1->Result:...",1245506.0,"1->50,2->5,2->50,5->50,5->6,6->50","1,2,5,50,6,Causer,Causer:1,Causer:2,Causer:2->...",125506.0,"2->5,2->50,5->50,5->6,6->50"
5,EBA1415_AEKD_4_SC_ES-05571_9.ann,,,,,,
6,EBA1415_AEKD_4_SC_ES-05573.ann,"1,2,5,50,Causer,Causer:1,Causer:1->Result:2,Ca...",12550.0,"1->2,2->50,5->50","1,5,50,Causer,Causer:1,Causer:5,Causer:5->Resu...",1550.0,5->50
7,EBA1415_AEKD_4_SC_ES-05574_9.ann,"1,2,50,Causer,Causer:1,Causer:1->Result:50,Cau...",1250.0,"1->50,2->50","1,2,50,Causer,Causer:1,Causer:2,Causer:2->Resu...",1250.0,2->50
8,EBA1415_AEKD_4_SC_ES-05901.ann,50,50.0,,50,50.0,
9,EBA1415_AEKD_4_SC_ES-05902_9.ann,"5,50,Causer,Causer:5,Causer:5->Result:50,Resul...",550.0,5->50,"5,50,Causer,Causer:5->Result:50,Result,Result:...",550.0,5->50


In [50]:
def friendly_tag(tag):
    """Remove the "Causer:" / "Result:" labels, e.g. "Causer:3->Result:50" -> "3->50"."""
    return tag.replace("Causer:", "").replace("Result:", "")

def _code_rank(code):
    """
    Numeric rank of one side of a causal relation.

    Plain or decimal digit strings ("12", "5.5") become floats; anything else
    ranks as +infinity so that it sorts after every numeric code while staying
    comparable with floats.
    """
    text = code.strip()
    # Accept at most one decimal point; str.isdigit alone rejects "5.5".
    if text.replace(".", "", 1).isdigit():
        return float(text)
    return float("inf")

def sort_key(cr):
    """
    Sort key for a tag, as a 4-tuple compared left to right.

    Ordering produced by the first element:
      -10  regular tags (numbers by value, then words alphabetically)
      -2   "Causer:<code>" tags
      -1   "Result:<code>" tags
      9000 causal relations "<cause>-><effect>", ordered by cause then effect
      99999999 tags starting with "_"

    The code "5b" is ranked as 5.5, i.e. between 5 and 6.

    @param cr: str, a tag
    @return:   tuple usable as a `key=` for sorted()
    @raise ValueError: if a "Causer:"/"Result:" tag or a digit-leading regular
                       tag is not numeric, or a relation has more than one "->"
    """
    cr = cr.replace("5b", "5.5")
    # _'s last
    if cr.startswith("_"):
        return (99999999, cr, cr, cr)
    # Causal second to last, ordered by the order of the cause then the effect
    if "->" in cr:
        cr = friendly_tag(cr)
        cause, effect = cr.split("->")
        return (9000, _code_rank(cause), _code_rank(effect), cr)
    # order causers before results
    if "Result:" in cr:
        cr = friendly_tag(cr)
        return (-1, float(cr), -1, cr)
    if "Causer:" in cr:
        cr = friendly_tag(cr)
        return (-2, float(cr), -1, cr)
    # place regular tags first, numbers ahead of words
    # (cr[:1] instead of cr[0] so an empty tag does not raise)
    if cr[:1].isdigit():
        return (-10, float(cr), -1, cr)
    return (-10, 9999.9, -1, cr.lower())

def category(s, filename=None):
    """
    Assign an essay category (1-5) from its comma-separated tag string.

      1  no numeric codes and no causal tags (also empty / "nan" input)
      2  numeric codes but no causal tags
      3  causal tag(s) present, but no causal chain that reaches code 50
      4  the only chains are the essay-set's core chain:
           Coral: exactly 6->7->50
           Skin:  at most two chain links, all among codes 4, 5, 6 and 50
      5  any other chain in which a relation ends in 50

    A chain link is a pair of relations (b, a) where b's result is a's causer
    and b sorts before a under `sort_key`.

    @param s:        str, comma-separated tags, e.g. "1,Causer:1->Result:50"
    @param filename: str or None, name identifying the essay set; must contain
                     "Coral" or "Skin". When None, the notebook-level global
                     `fname` is used. Only consulted when there are at least
                     two causal tags.
    @return:         int, the category
    @raise Exception: "Unrecognized filename" if the name contains neither
                      "Coral" nor "Skin"
    """
    if not s or s == "nan":
        return 1

    # Build a real list: it is scanned three times below, and a lazy filter()
    # would be exhausted after the first scan on Python 3.
    # Tokens are stripped up front so " 3" is recognised as the code 3.
    tokens = [t.strip() for t in s.strip().split(",")]
    tokens = [t for t in tokens if len(t) > 0]

    regular = [t for t in tokens if t[0].isdigit()]
    any_causal = [t for t in tokens if "->" in t and (("Causer" in t and "Result" in t) or "C->R" in t)]
    causal = [t for t in tokens if "->" in t and "Causer" in t and "Result" in t]

    if len(regular) == 0 and len(any_causal) == 0:
        return 1
    if len(any_causal) == 0: # i.e. by this point regular must have some
        return 2 # no causal
    # if only one causal then must be 3
    if len(any_causal) == 1 or len(causal) == 1:
        return 3

    # Map to Num->Num, e.g. Causer:3->Result:50 becomes 3->50
    # Also map 6 to 16 and 7 to 17 to enforce the relative size relationship

    def map_cb(code):
        # Plain substring replacement: every "6"/"7" character is rewritten.
        return code.replace("6","16").replace("7","17")

    def map_sc(code):
        # "50" becomes "150" via the 5 -> 15 step; the final replace restores it.
        return code.replace("4","14").replace("5","15").replace("6","16").replace("150","50")

    if filename is None:
        # Fall back to the notebook-level global, as the single-argument form always did.
        filename = fname

    if "Coral" in filename:
        remap = map_cb
    elif "Skin" in filename:
        remap = map_sc
    else:
        raise Exception("Unrecognized filename")

    crels = sorted(
        [remap(t.replace("Causer:","").replace("Result:","")).strip() for t in causal],
        key=sort_key)

    # Compute each key once instead of on every inner-loop comparison.
    keyed = [(sort_key(rel), rel) for rel in crels]

    un_results = set()
    # Only pairs where b sorts strictly before a are examined, so a link is
    # recorded only when its first relation sorts before its second.
    for a_key, a in keyed:
        for b_key, b in keyed:
            if b_key >= a_key:
                break
            bc, br = b.split("->")
            ac, ar = a.split("->")
            # if result from b is causer for a
            if br.strip() == ac.strip():
                un_results.add((b, a))

    if len(un_results) == 0:
        return 3

    # To be a 4 or a 5, at least one relation needs to end in a 50
    joined = ",".join(map(str, un_results))
    if "->50" not in joined:
        return 3

    #CB and 6->7->50 ONLY
    if len(un_results) == 1 and "Coral" in filename and ("16->17", "17->50") in un_results:
        return 4
    if len(un_results) <= 2 and "Skin" in filename:
        #4->5->6->50
        codes = set("14,15,16,50".split(","))
        only_core_links = all(
            code in codes
            for first, second in un_results
            for code in first.split("->") + second.split("->"))
        if only_core_links:
            return 4
    return 5

In [51]:
def check(inp, expected):
    """Assert that category(inp) equals `expected`, reporting both values on failure."""
    got = category(inp)
    assert got == expected, "Inp: %s Actual:%i Expected:%i" % (inp, got, expected)

# Checks that hold for both essay sets.
check("" , 1)
check(" ", 1)

check("1", 2)
check("1,3", 2)
check("1,,,,4,7,50,1289", 2)

check("Causer:1->Result:3", 3)
check("Causer:1->Result:3,Causer:4->Result:5", 3)
#Multiple going to 50 is a 3
check("Causer:1->Result:50,Causer:3->Result:50,Causer:5->Result:50", 3)
check("Causer:1->Result:3, Causer:4->Result:5", 3)

check("_C->R", 3)
check("3,4,5,_C->R", 3)

check("Causer:6->Result:7", 3)
check("Causer:5->Result:5b", 3)

check("Causer:3->Result:4, Causer:1->Result:4, Causer:3->Result:50", 3)
#no intervening
check("Causer:1->Result:4,Causer:6->Result:7", 3)
check("Causer:4->Result:5,Causer:1->Result:3", 3)
check("Causer:1->Result:50, Causer:2->Result:50, Causer:6->Result:50, Causer:7->Result:50", 3)

#Chains not going to 50
check("Causer:1->Result:3, Causer:5->Result:7, Causer:3->Result:6", 3)
check("Causer:1->Result:3, Causer:3->Result:4, Causer:4->Result:6", 3)

#universal 5's
check("Causer:5b->Result:50,Causer:5->Result:5b", 5) #Test 5b's
check("Causer:1->Result:3,Causer:3->Result:50", 5)
check("Causer:1->Result:3, Causer:5->Result:7, Causer:3->Result:50", 5)


# Essay-set specific checks, selected by the notebook-level `fname`.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
if "Skin" in fname:
    print("Skin Cancer")
    check("Causer:11->Result:4,Causer:4->Result:6", 3)

    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:50", 4)
    check("Causer:5->Result:50,Causer:4->Result:5", 4)
    check("Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:6,Causer:6->Result:50,", 4)
    check("1,Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,", 4)
    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,Causer:1->Result:2", 4)

    check("Causer:1->Result:2,Causer:2->Result:50", 5)
    check("Causer:4->Result:5,Causer:5->Result:6,Causer:6->Result:50,Causer:2->Result:4", 5)

elif "Coral" in fname:
    print("Coral Bleaching")
    check("Causer:6->Result:7", 3)
    check("Causer:7->Result:50", 3)

    check("Causer:6->Result:7,Causer:7->Result:50", 4)
    check("Causer:7->Result:50,Causer:6->Result:7", 4)
    check("Causer:7->Result:50,Causer:6->Result:7,Causer:3->Result:4", 4)

    check("Causer:7->Result:50,Causer:6->Result:7,Causer:3->Result:6", 5)
    check("Causer:1->Result:6,Causer:6->Result:7,Causer:7->Result:50", 5)
    check("Causer:13->Result:6,Causer:6->Result:50", 5)

print("Tests passed!")

Skin Cancer
Tests passed!


In [52]:
# Categorise the manual codes ("Ys_cat") and the predictions ("Pred_cat") per essay.
for cat_col, tags_col in (("Ys_cat", "Concept Codes"), ("Pred_cat", "Predictions")):
    grpd[cat_col] = grpd[tags_col].apply(category)

In [53]:
# Absolute distance between the manual and the predicted category.
grpd["Diff"] = (grpd["Ys_cat"] - grpd["Pred_cat"]).abs()

abbrev_cols = ["Ys_codes","Ys_causal", "Ys_cat", "Pred_codes", "Pred_causal", "Pred_cat", "Diff"]
abbrev = grpd[abbrev_cols]
abbrev.head(20)

Unnamed: 0,Ys_codes,Ys_causal,Ys_cat,Pred_codes,Pred_causal,Pred_cat,Diff
0,112234550.0,"1->2,1->50,4->5",3,112234550.0,"1->2,12->3,4->5",3,0
1,245506.0,"2->50,4->5",3,245506.0,"2->50,4->5",3,0
2,550.0,,2,550.0,5->50,3,1
3,13550.0,"12->2,12->3,12->50,3->50",3,3550.0,"12->3,3->50,5->50",3,0
4,1245506.0,"1->50,2->5,2->50,5->50,5->6,6->50",5,125506.0,"2->5,2->50,5->50,5->6,6->50",5,0
5,,,1,,,1,0
6,12550.0,"1->2,2->50,5->50",5,1550.0,5->50,3,2
7,1250.0,"1->50,2->50",3,1250.0,2->50,3,0
8,50.0,,2,50.0,,2,0
9,550.0,5->50,3,550.0,5->50,3,0


In [54]:
# Inspect the essays whose manual category is 4.
abbrev.loc[abbrev["Ys_cat"] == 4, ["Ys_codes","Ys_causal", "Ys_cat"]].head(20)

Unnamed: 0,Ys_codes,Ys_causal,Ys_cat
12,345506,"4->5,5->6,6->50",4
14,345506,"3->4,3->50,5->50,5->6,6->50",4
19,234550,"2->50,3->50,4->5,5->50,5->6",4
22,5506,"5->50,5->6,6->50",4
26,4550,"5->50,5->6,6->50",4
28,124550,"2->50,4->5,4->50,5->50",4
29,4550,"12->3,4->5,5->50",4
41,1225506,"2->50,5->50,5->6,6->50",4
51,245506,"2->50,5->4,5->6,6->50",4
71,235506,"12->3,2->50,5->50,5->6,6->50",4


In [55]:
def is_inverted(s):
    """
    Tell whether a causal relation "<cause>-><effect>" runs from a larger code
    to a smaller one.

    The code "5b" is ranked as 5.5 (the same convention as sort_key), so
    relations involving it can be compared instead of raising.

    @param s: str, e.g. "12->3"; a string without "->" is never inverted
    @return:  bool
    @raise ValueError: if a side is not numeric, or there is more than one "->"
    """
    if "->" not in s:
        return False

    def rank(code):
        return float(code.strip().replace("5b", "5.5"))

    cause, effect = s.split("->")
    return rank(cause) > rank(effect)

def any_inverted(s):
    """
    Return the relations of the comma-separated string `s` for which
    is_inverted() is true, comma-joined in their original order; "" when
    there are none or `s` is blank.
    """
    if len(s.strip()) == 0:
        return ""
    inverted = [relation for relation in s.split(",") if is_inverted(relation)]
    return ",".join(inverted)

# Record the inverted relations found in each essay's manual causal relations,
# then show the essays that have at least one.
grpd["Ys_inverted"] = grpd["Ys_causal"].apply(any_inverted)
has_inverted = grpd["Ys_inverted"].str.strip() != ""
grpd.loc[has_inverted, ["Essay","Ys_causal","Ys_inverted"]].head(10)

Unnamed: 0,Essay,Ys_causal,Ys_inverted
3,EBA1415_AEKD_4_SC_ES-05569.ann,"12->2,12->3,12->50,3->50","12->2,12->3"
17,EBA1415_BGJD_1_SC_ES-05732.ann,"2->5,2->50,4->6,5->4,6->50",5->4
24,EBA1415_BGJD_2_SC_ES-05752.ann,"1->2,12->2,12->3,3->4,4->5,5->50","12->2,12->3"
27,EBA1415_BGJD_2_SC_ES-5751_9.ann,"11->12,12->3,4->5,5->6,6->50",12->3
29,EBA1415_BGJD_2_SC_ES-5976_9.ann,"12->3,4->5,5->50",12->3
31,EBA1415_BLHT_5_SC_ES-05204.ann,"1->2,1->3,1->50,12->3,2->3,3->4",12->3
33,EBA1415_BLHT_5_SC_ES-05206.ann,"1->50,11->12,12->3,4->5,5->50",12->3
34,EBA1415_BLHT_5_SC_ES-05209.ann,"12->3,3->4,3->50,5->50",12->3
39,EBA1415_BLHT_6_SC_ES-05306.ann,"1->2,1->50,12->3,2->50,4->5,5->6",12->3
46,EBA1415_BLRW_3_SC_ES-05170.ann,"1->2,1->3,1->50,11->12,11->3,11->50,12->3,2->5...","11->3,12->3"


In [56]:
# Distinct values of the "Ys_inverted" column across all essays.
grpd["Ys_inverted"].unique()

array(['', '12->2,12->3', '5->4', '12->3', '11->3,12->3', '12->3,5->4',
       '12->2', '11->3', '12->2,5->4', '12->2,12->3,5->4'], dtype=object)

In [57]:
# Locate the sentences whose manual codes contain "Causer:3->Result:1".
# NOTE(review): str.contains matches anywhere in the text, so longer codes such
# as "Causer:13->Result:12" would match too; confirm that is acceptable.
data[data["Concept Codes"].str.contains("Causer:3->Result:1")][["Essay", "Sent Number"]]

Unnamed: 0,Essay,Sent Number


## Accuracy

In [58]:
# Exact agreement, and agreement within one category ("Adj"), over all essays.
# Single-argument print(...) with % formatting gives the same text on
# Python 2 and Python 3.
n_essays = len(grpd)
n_exact = len(grpd[grpd["Ys_cat"] == grpd["Pred_cat"]])
n_within_one = len(grpd[grpd["Diff"] <=1])

print("Accuracy %s" % round(n_exact / float(n_essays),4))
print("Adj      %s" % round(n_within_one / float(n_essays),4))
print("%s essays" % n_essays)

Accuracy 0.7583
Adj      0.8621
1088 essays


## Output Processed Results

In [59]:
outp = grpd.copy(deep=True)

# Strip the ".ann" suffix; names without it are left untouched
outp["Essay"] = outp["Essay"].apply(lambda s: s[:-4] if s.endswith(".ann") else s)

# Working column name -> column name used in the exported file
outp = outp.rename(columns={
    "Ys_codes": "Manual_Codes",
    "Ys_causal": "Manual_Causal",
    "Ys_cat": "Manual_Category",
    "Pred_codes": "Predicted_Codes",
    "Pred_causal": "Predicted_Causal",
    "Pred_cat": "Predicted_Category",
    "Concept Codes": "All_Manual_Codes",
    "Predictions": "All_Predictions",
})

# Keep only the exported columns, in their final order
outp = outp[["Essay", "Manual_Codes", "Manual_Causal", "Manual_Category", "Predicted_Codes", "Predicted_Causal", "Predicted_Category", "All_Manual_Codes", "All_Predictions"]]

outp.head()

Unnamed: 0,Essay,Manual_Codes,Manual_Causal,Manual_Category,Predicted_Codes,Predicted_Causal,Predicted_Category,All_Manual_Codes,All_Predictions
0,EBA1415_AEKD_4_SC_ES-05566,112234550,"1->2,1->50,4->5",3,112234550,"1->2,12->3,4->5",3,"1,12,2,3,4,5,50,Causer,Causer:1,Causer:1->Resu...","1,12,2,3,4,5,50,Causer,Causer:1,Causer:1->Resu..."
1,EBA1415_AEKD_4_SC_ES-05567,245506,"2->50,4->5",3,245506,"2->50,4->5",3,"2,4,5,50,6,Causer,Causer:2,Causer:2->Result:50...","2,4,5,50,6,Causer,Causer:2,Causer:2->Result:50..."
2,EBA1415_AEKD_4_SC_ES-05568_9,550,,2,550,5->50,3,550,"5,50,Causer,Causer:5->Result:50,Result,_C->R,_..."
3,EBA1415_AEKD_4_SC_ES-05569,13550,"12->2,12->3,12->50,3->50",3,3550,"12->3,3->50,5->50",3,"1,3,5,50,Causer,Causer:12,Causer:12->Result:2,...","3,5,50,Causer,Causer:12,Causer:12->Result:3,Ca..."
4,EBA1415_AEKD_4_SC_ES-05570,1245506,"1->50,2->5,2->50,5->50,5->6,6->50",5,125506,"2->5,2->50,5->50,5->6,6->50",5,"1,2,4,5,50,6,Causer,Causer:1,Causer:1->Result:...","1,2,5,50,6,Causer,Causer:1,Causer:2,Causer:2->..."


In [60]:
# Write the processed results as CSV to `fout`.
# NOTE(review): `fout` is not defined in any cell shown here. Presumably it
# comes from `from EssayCategory import *` or an earlier kernel session;
# confirm before running on a fresh kernel.
outp.to_csv(fout)