In [43]:
import pandas as pd

def group_by(df, bycols, agg_map):
    """
    Group a DataFrame and apply a separate aggregate function to each listed column,
    returning a single flat frame with one row per group.

    @param df:      DataFrame
    @param bycols:  str or list
                        Column(s) to group by (any iterable of column names is accepted)
    @param agg_map: dictionary or list of 2-tuples
                        Mapping from column to aggregate function e.g. [("city", "count"), ("salary", "mean")]
    @return:        DataFrame
                        Flattened dataframe, with multi-level index removed
    @raise ValueError:  if agg_map contains no (column, aggregate) pairs
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    # Any other iterable of names (tuple, Index, ...) is copied into a list so it
    # can be concatenated with the aggregated column name below.
    if isinstance(bycols, str):
        bycols = [bycols]
    else:
        bycols = list(bycols)

    if isinstance(agg_map, dict):
        agg_map = agg_map.items()
    agg_pairs = list(agg_map)
    if not agg_pairs:
        raise ValueError("agg_map must contain at least one (column, aggregate) pair")

    # Aggregate each column on its own so every column can use a different function,
    # turning the group keys back into ordinary columns each time.
    grps = [
        df[bycols + [col]].groupby(bycols).agg(func).reset_index()
        for col, func in agg_pairs
    ]

    # Stitch the per-column results back together on the group keys.
    m = grps[0]
    for grp in grps[1:]:
        m = pd.merge(m, grp, on=bycols, how="inner")
    return m

In [44]:
# Location of the pipe-delimited file that is read into `data` below.
# NOTE(review): absolute, user-specific path - adjust before running on another machine.
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
data = pd.read_csv(fname, sep="|")

In [46]:
# Normalise both code columns to plain strings: cast to str, then blank out the
# literal "nan" values so that entries with no codes become "".
for col in ("Concept Codes", "Predictions"):
    data[col] = data[col].astype("str").apply(lambda s: "" if s == "nan" else s)
data.head(10)

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept Codes,Predictions
0,EBA1415_AEKD_4_CB_ES-05568.ann,1,What leads to differences in the rates of cora...,50,50
1,EBA1415_AEKD_4_CB_ES-05568.ann,2,Coral is often mistaken for a rock but it is m...,,
2,EBA1415_AEKD_4_CB_ES-05568.ann,3,Coral bleaching shows bleaching and healthy bl...,50,50
3,EBA1415_AEKD_4_CB_ES-05568.ann,4,Coral bleaching is almost noticeable in the pa...,50,50
4,EBA1415_AEKD_4_CB_ES-05572.ann,1,The part of coral called zooanthellae are not ...,5,
5,EBA1415_AEKD_4_CB_ES-05572.ann,2,And if they get or much sunlight they start to...,"5,50,_C->R,_CRel,_RRel,Causer,Result,Causer:5,...",50
6,EBA1415_AEKD_4_CB_ES-05572.ann,3,The reason why is because the zooanthellae if ...,4,Result
7,EBA1415_AEKD_4_CB_ES-05572.ann,4,The coral also need INFREQUENT water temperatu...,,
8,EBA1415_AEKD_4_CB_ES-05572.ann,5,Also its a threats for us because means that m...,11,"11,Causer:11"
9,EBA1415_AEKD_4_CB_ES-05572.ann,6,Also the water us getting to salty,13,


In [61]:
def concat(lst):
    """Join the strings in lst into a single comma-separated string."""
    separator = ","
    return separator.join(lst)

def make_unique(s):
    """
    De-duplicate the comma-separated tokens in a string.

    @param s:   str
                    Comma-separated tokens, e.g. "50,,4,50"
    @return:    str
                    The distinct non-empty tokens, sorted as strings (so "11" sorts
                    before "4") and re-joined with commas; "" if there are none
    """
    # str.split always yields at least one element ("" for an empty string), so
    # there is no separate empty case to handle: blank tokens are simply filtered out.
    unique_tokens = set(token for token in s.split(",") if token)
    return ",".join(sorted(unique_tokens))

# Collapse the sentence-level rows to one row per essay, comma-joining each column's values
grpd = group_by(data, "Essay", {"Concept Codes": concat, "Predictions": concat})
# De-duplicate and sort the joined codes within each essay
for code_col in ("Concept Codes", "Predictions"):
    grpd[code_col] = grpd[code_col].apply(make_unique)
# Re-order cols
grpd = grpd[["Essay", "Concept Codes", "Predictions"]]
grpd.head(20)

Unnamed: 0,Essay,Concept Codes,Predictions
0,EBA1415_AEKD_4_CB_ES-05568.ann,50,50
1,EBA1415_AEKD_4_CB_ES-05572.ann,"11,13,4,5,50,Causer,Causer:5,Causer:5->Result:...","11,50,Causer:11,Result"
2,EBA1415_AEKD_4_CB_ES-05574.ann,1350,50
3,EBA1415_AEKD_4_CB_ES-05902.ann,"3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...","3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca..."
4,EBA1415_AEKD_5_CB-06232.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...","50,Causer,Causer:7,Causer:7->Result:50,Result,..."
5,EBA1415_AEKD_5_CB_ES-05575.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...","50,Causer,Causer:7,Result,Result:50,_C->R,_CRe..."
6,EBA1415_AEKD_5_CB_ES-05579.ann,150,"1,50,6,Causer,Causer:6,Result,Result:14,_C->R,..."
7,EBA1415_AEKD_5_CB_ES-05582.ann,"50,7,Causer,Causer:7,Causer:7->Result:50,Resul...","50,7,Causer,Causer:7,Causer:7->Result:50,Resul..."
8,EBA1415_AEKD_5_CB_ES-05586.ann,50,"50,Result"
9,EBA1415_BGJD_1_CB_ES-05725.ann,"3,50,Causer,Causer:3,Causer:3->Result:50,Cause...","3,50,Causer,Causer:3,Causer:3->Result:50,Resul..."


In [None]:
def category(s):
    """
    Bucket a comma-separated string of codes into a coarse category.

    @param s:   str
                    Comma-separated codes, e.g. "3,50,Causer:3->Result:50"
    @return:    int
                    1 - empty / "nan" input, or neither numeric nor causal-relation codes present
                    2 - at least one code starting with a digit, but no causal relation
                    3 - at least one causal relation (a code containing "->", "Causer" and "Result")
    """
    if not s or s == "nan":
        return 1
    # Drop empty tokens (e.g. from "5,,50" or a trailing comma) so t[0] cannot fail
    tokens = [t for t in s.split(",") if t]
    has_regular = any(t[0].isdigit() for t in tokens)
    has_causal = any("->" in t and "Causer" in t and "Result" in t for t in tokens)
    # NOTE(review): every input with a causal relation maps to 3 regardless of how
    # many relations it has; finer-grained causal categories are not implemented.
    if has_causal:
        return 3
    if has_regular:
        return 2  # no causal
    return 1

In [62]:
# Derive a category column from each code column using category()
for source_col, target_col in (("Concept Codes", "Ys_cat"), ("Predictions", "Pred_cat")):
    grpd[target_col] = grpd[source_col].apply(category)
grpd.head()

Unnamed: 0,Essay,Concept Codes,Predictions,Ys_cat,Pred_cat
0,EBA1415_AEKD_4_CB_ES-05568.ann,50,50,2,2
1,EBA1415_AEKD_4_CB_ES-05572.ann,"11,13,4,5,50,Causer,Causer:5,Causer:5->Result:...","11,50,Causer:11,Result",3,2
2,EBA1415_AEKD_4_CB_ES-05574.ann,1350,50,2,2
3,EBA1415_AEKD_4_CB_ES-05902.ann,"3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...","3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...",3,3
4,EBA1415_AEKD_5_CB-06232.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...","50,Causer,Causer:7,Causer:7->Result:50,Result,...",3,3


In [65]:
# Absolute distance between the two category columns
grpd["Diff"] = (grpd["Ys_cat"] - grpd["Pred_cat"]).abs()
grpd.head(10)

Unnamed: 0,Essay,Concept Codes,Predictions,Ys_cat,Pred_cat,Diff
0,EBA1415_AEKD_4_CB_ES-05568.ann,50,50,2,2,0
1,EBA1415_AEKD_4_CB_ES-05572.ann,"11,13,4,5,50,Causer,Causer:5,Causer:5->Result:...","11,50,Causer:11,Result",3,2,1
2,EBA1415_AEKD_4_CB_ES-05574.ann,1350,50,2,2,0
3,EBA1415_AEKD_4_CB_ES-05902.ann,"3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...","3,4,50,7,Causer,Causer:3,Causer:3->Result:4,Ca...",3,3,0
4,EBA1415_AEKD_5_CB-06232.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...","50,Causer,Causer:7,Causer:7->Result:50,Result,...",3,3,0
5,EBA1415_AEKD_5_CB_ES-05575.ann,"50,Causer,Causer:7,Causer:7->Result:50,Result,...","50,Causer,Causer:7,Result,Result:50,_C->R,_CRe...",3,2,1
6,EBA1415_AEKD_5_CB_ES-05579.ann,150,"1,50,6,Causer,Causer:6,Result,Result:14,_C->R,...",2,2,0
7,EBA1415_AEKD_5_CB_ES-05582.ann,"50,7,Causer,Causer:7,Causer:7->Result:50,Resul...","50,7,Causer,Causer:7,Causer:7->Result:50,Resul...",3,3,0
8,EBA1415_AEKD_5_CB_ES-05586.ann,50,"50,Result",2,2,0
9,EBA1415_BGJD_1_CB_ES-05725.ann,"3,50,Causer,Causer:3,Causer:3->Result:50,Cause...","3,50,Causer,Causer:3,Causer:3->Result:50,Resul...",3,3,0


In [68]:
# Single-argument print(...) calls produce the same text under the Python 2 print
# statement and the Python 3 print function.
# Fraction of rows in grpd whose two category columns agree exactly
print("Accuracy %s" % round(len(grpd[grpd["Ys_cat"] == grpd["Pred_cat"]]) / float(len(grpd)), 4))
# Fraction of rows in grpd whose categories differ by at most one
print("Adj      %s" % round(len(grpd[grpd["Diff"] <= 1]) / float(len(grpd)), 4))
print("%d sentences" % len(data))

Accuracy 0.9176
Adj      1.0
10155 sentences
