In [2]:
# Folder holding the phrase-extraction artefacts; every path below is built from it.
ROOT_FOLDER = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/PhraseExtractionAnalysis"

# "Ap_*" files
AP_SYNONYMS_FILE = "%s/Ap_Synonyms.txt" % (ROOT_FOLDER,)
AP_CLUSTERS_FILE = "%s/Ap_Clusters.txt" % (ROOT_FOLDER,)

# "KM_*" / "KMeans_*" files
KM_SYNONYMS_FILE = "%s/KM_Synonyms.txt" % (ROOT_FOLDER,)
KM_CLUSTERS_FILE = "%s/KMeans_Clusters.txt" % (ROOT_FOLDER,)

In [3]:
import re
from collections import defaultdict

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
re_collapse_spaces = re.compile(r"\s+")
def collapse_spaces(s):
    """Replace every run of whitespace in `s` with a single space.

    Leading/trailing whitespace runs are collapsed to one space, not removed.
    """
    return re_collapse_spaces.sub(" ", s)

# Separator characters (and whitespace) that are turned into a single space.
# Raw triple-quoted string so the regex escapes reach `re` unchanged.
re1 = re.compile(r"""[;:'"*/),(|\s]+""")
# Possessive suffix only: the word boundary keeps a quoted word that starts
# with "s" (e.g. 'sun') from losing its first letter.
_re_possessive = re.compile(r"'s\b")
def clean_str(s):
    """Normalise `s` into space-separated text.

    Steps: coerce to str, drop possessive "'s", turn hyphens and backslashes
    into spaces, replace each run of separator characters / whitespace with
    one space, and strip the ends.

    Returns the cleaned string; it contains no consecutive whitespace because
    the final substitution already collapses every whitespace run.
    """
    s = _re_possessive.sub(" ", str(s))
    # handled with plain replaces rather than inside the character class
    s = s.replace("-", " ").replace("\\", " ")
    return re1.sub(" ", s).strip()

In [4]:
from collections import defaultdict

class SynonymMapper(object):
    """Replaces the longest matching phrase in a token list with its synonyms.

    mapper         -- dict: phrase (tokens joined by single spaces) -> set of
                      replacement tokens.
    nested         -- trie of nested dicts keyed by token; used to decide
                      whether a phrase can be extended by the next token.
    case_sensitive -- when False, input tokens are lower-cased before lookup
                      (the keys of `mapper`/`nested` are used as given).
    """

    def __init__(self, mapper, nested, case_sensitive=False):
        self.case_sensitive = case_sensitive
        self.mapper = mapper
        self.nested = nested
        # every replacement token that appears on any right-hand side
        self.synonyms = set()
        for rhs in self.mapper.values():
            self.synonyms.update(rhs)

    def is_synonym(self, term):
        """Return True when `term` is one of the replacement tokens."""
        return term in self.synonyms

    def map_synonyms(self, tokens, debug=False):
        """Return a new token list with mapped phrases replaced.

        tokens -- any iterable of string tokens (materialised to a list, so
                  iterators such as Python 3 `map` objects are accepted).
        debug  -- when True, print the scan position and each match found.

        At each position the longest phrase present in `mapper` wins; its
        replacement tokens are emitted in sorted order. Unmatched tokens are
        emitted with their original casing.
        """
        tokens = list(tokens)
        if self.case_sensitive:
            lookup_tokens = tokens
        else:
            lookup_tokens = [tok.lower() for tok in tokens]
        size = len(tokens)
        mapped = []
        ix = 0
        while ix < size:
            if debug:
                print("ix %s" % ix)
            best = None
            max_ix = ix
            tmp_ix = ix
            phrase = []
            d = self.nested
            # walk the trie as far as the upcoming tokens allow
            while tmp_ix < size and lookup_tokens[tmp_ix] in d:
                tok = lookup_tokens[tmp_ix]
                phrase.append(tok)
                key = " ".join(phrase).strip()
                if key in self.mapper:
                    if debug:
                        label = "best" if best is None else "new best"
                        print("%s %s %s: %s => %s"
                              % (ix, tmp_ix, label, key, sorted(self.mapper[key])))
                    best = self.mapper[key]
                    max_ix = tmp_ix
                d = d[tok]
                tmp_ix += 1
            if not best:
                # retain original casing
                mapped.append(tokens[ix])
            else:
                # resume after the last token of the matched phrase
                ix = max_ix
                # `best` is a set; sort for a deterministic order
                mapped.extend(sorted(best))
            ix += 1
        return mapped

    def __repr__(self):
        return "Synonym Mapper: %i synonyms mapped" % len(self.mapper)

def build_synonym_filter(files, case_sensitive=False):
    """Build a SynonymMapper from one or more synonym files.

    files          -- a single path or an iterable of paths.
    case_sensitive -- passed to SynonymMapper; when False the left-hand
                      phrases are stored lower-cased so that they can match
                      the lower-cased tokens used at lookup time.

    File format (one rule per line; blank lines and lines starting with "#"
    are ignored):
        a, b => x, y    each of a, b maps to {x, y}
        a, b, c         each term maps to {a, b, c}
    Whitespace around entries is ignored and empty entries are skipped.
    """
    # recursively defined defaultdict: a trie keyed by token
    def dd():
        return defaultdict(dd)

    mapper = defaultdict(set)
    nested_map = defaultdict(dd)
    # accept a bare path, including unicode paths on Python 2
    if isinstance(files, (str, type(u""))):
        files = [files]
    for f in files:
        # read-only: the file is never written to
        with open(f, "r") as fin:
            for line in fin:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                if "=>" in line:
                    # split on the first arrow only
                    left, right = line.split("=>", 1)
                else:
                    left = right = line
                left_parts = [p.strip() for p in left.split(",") if p.strip()]
                targets = set(p.strip() for p in right.split(",") if p.strip())

                for syn in left_parts:
                    # single-space the phrase so it equals the lookup key
                    syn = " ".join(syn.split())
                    if not case_sensitive:
                        syn = syn.lower()
                    mapper[syn].update(targets)

                    # register the phrase's token path in the trie
                    d = nested_map
                    for token in syn.split(" "):
                        d = d[token]
    return SynonymMapper(mapper, nested_map, case_sensitive)

In [5]:
#String processing
def white_space_tokenize(s):
    """Split `s` on single space characters (consecutive spaces yield empty tokens)."""
    tokens = s.split(" ")
    return tokens

__punct__ = set(".?!,;:")
def remove_punct_at_end(s):
    """Strip trailing punctuation from `s`, always keeping at least one character."""
    end = len(s)
    # move the cut point left over trailing punctuation, stopping at index 1
    while end > 1 and s[end - 1] in __punct__:
        end -= 1
    return s[:end]

#Token Filters
def fact_len_filter(max_len):
    """Create a token filter that keeps tokens whose length is >= `max_len`.

    Note: despite its name, `max_len` acts as a minimum length.
    """
    def len_filter(tokens):
        def is_long_enough(token):
            return len(token) >= max_len
        return filter(is_long_enough, tokens)
    return len_filter

# keeps tokens with at least one character
remove_empty_tokens_filter = fact_len_filter(1)

def lower_case_filter(tokens):
    """Lower-case a single `str`, or every token of a token collection."""
    def to_lower(token):
        return token.lower()

    if type(tokens) != str:
        return map(to_lower, tokens)
    return tokens.lower()

def remove_punct_at_end_filter(tokens):
    """Apply `remove_punct_at_end` to every token."""
    return map(remove_punct_at_end, tokens)

def fact_is_synonym_filter(syn_mapper):
    """Create a token filter keeping only tokens for which `syn_mapper.is_synonym` is true."""
    def is_synonym_filter(tokens):
        keep = syn_mapper.is_synonym
        return filter(keep, tokens)
    return is_synonym_filter

def is_cluster_filter(tokens):
    """Keep only the tokens that start with "cluster_"."""
    def is_cluster_label(token):
        return token.startswith("cluster_")
    return filter(is_cluster_label, tokens)

def remove_cluster_filter(tokens):
    """Convert "cluster_<n>" labels to integers by deleting "cluster_" and parsing the rest."""
    def to_cluster_id(label):
        digits = label.replace("cluster_", "")
        return int(digits)
    return map(to_cluster_id, tokens)

def to_set_filter(tokens):
    """Return the distinct tokens as a set."""
    distinct = set()
    distinct.update(tokens)
    return distinct

def to_distinct_sequential_items(tokens):
    """Collapse runs of consecutive equal tokens into one, preserving order.

    The comparison starts against None, so a leading None token is dropped.
    """
    deduped = []
    previous = None
    for current in tokens:
        changed = current != previous
        if changed:
            deduped.append(current)
            previous = current
    return deduped

In [6]:
def analyze(s, filters):
    """Run `s` through each callable in `filters` in order and return the final value."""
    result = s
    for apply_step in filters:
        result = apply_step(result)
    return result

def debug_analyze(s, filters):
    """Like `analyze`, but print the intermediate value after every step.

    One line is printed per step: the step's name left-justified to 30
    columns, a space, then the value (list items joined with "|").
    Returns the final value.
    """
    temp = s
    pad = 30
    print("%s %s" % ("START".ljust(pad), temp))
    for f in filters:
        temp = f(temp)
        if isinstance(temp, list):
            s_temp = "|".join(map(str, temp))
        else:
            s_temp = str(temp)
        # `__name__` exists on functions, bound methods and builtins;
        # fall back to repr for callables without one (e.g. functools.partial).
        step_name = getattr(f, "__name__", None) or repr(f)
        print("%s %s" % (step_name.ljust(pad), s_temp))
    return temp

In [7]:
# Case-insensitive synonym mapper built from the AP synonyms file only.
syn_mapper = build_synonym_filter([AP_SYNONYMS_FILE], case_sensitive=False)

In [8]:
# Steps shared by both chains: sentence string -> integer cluster ids, in
# sentence order. Defined once so the two chains cannot drift apart.
_shared_analysis_steps = [clean_str,
                  white_space_tokenize,
                  remove_punct_at_end_filter,
                  lower_case_filter,
                  syn_mapper.map_synonyms,
                  is_cluster_filter,
                  remove_cluster_filter]

# Distinct cluster ids of a sentence (a set).
set_analysis_chain = _shared_analysis_steps + [to_set_filter]

# Cluster ids in order of appearance, with consecutive repeats collapsed.
sequence_analysis_chain = _shared_analysis_steps + [to_distinct_sequential_items]

In [9]:
# Trace every step of the set chain on a sample sentence.
# (presumably an alternative sample input: "coral bleaching can be caused by raising co2 levels")
debug_analyze("some algae called zooxanthellae live in the ocean.", set_analysis_chain)

START                          some algae called zooxanthellae live in the ocean.
clean_str                      some algae called zooxanthellae live in the ocean.
white_space_tokenize           some|algae|called|zooxanthellae|live|in|the|ocean.
remove_punct_at_end_filter     some|algae|called|zooxanthellae|live|in|the|ocean
lower_case_filter              some|algae|called|zooxanthellae|live|in|the|ocean
map_synonyms                   some|cluster_80|in|the|cluster_19
is_cluster_filter              cluster_80|cluster_19
remove_cluster_filter          80|19
to_set_filter                  set([80, 19])


{19, 80}

In [10]:
# Trace the sequence chain on an input with repeated words, to show that the
# final step collapses consecutive duplicate cluster ids.
debug_analyze(" algae zooxanthellae the ocean ocean is ocean coral bleaching coral coral wind.", sequence_analysis_chain)

START                           algae zooxanthellae the ocean ocean is ocean coral bleaching coral coral wind.
clean_str                      algae zooxanthellae the ocean ocean is ocean coral bleaching coral coral wind.
white_space_tokenize           algae|zooxanthellae|the|ocean|ocean|is|ocean|coral|bleaching|coral|coral|wind.
remove_punct_at_end_filter     algae|zooxanthellae|the|ocean|ocean|is|ocean|coral|bleaching|coral|coral|wind
lower_case_filter              algae|zooxanthellae|the|ocean|ocean|is|ocean|coral|bleaching|coral|coral|wind
map_synonyms                   cluster_80|the|cluster_19|cluster_19|is|cluster_19|cluster_17|cluster_55|cluster_55|cluster_51
is_cluster_filter              cluster_80|cluster_19|cluster_19|cluster_19|cluster_17|cluster_55|cluster_55|cluster_51
remove_cluster_filter          80|19|19|19|17|55|55|51
to_distinct_sequential_items   80|19|17|55|51


[80, 19, 17, 55, 51]

In [11]:
import pandas as pd

def to_concepts_only(s):
    """Extract the concept codes from a comma-separated annotation string.

    An entry is kept when it is non-empty, starts with a digit and contains
    no "-". Returns the distinct kept entries as a list sorted as strings
    (so "14" sorts before "6"); an empty input gives [].
    """
    def is_concept_code(entry):
        return bool(entry) and entry[0].isdigit() and "-" not in entry

    return sorted(set(entry for entry in s.split(",") if is_concept_code(entry)))

def apply_cluster_extraction(s):
    """Return the sorted list of distinct cluster ids found in sentence `s`."""
    cluster_ids = analyze(s, set_analysis_chain)
    return sorted(cluster_ids)

def apply_cluster_sequence_extraction(s):
    """Return the result of running sentence `s` through `sequence_analysis_chain`."""
    cluster_sequence = analyze(s, sequence_analysis_chain)
    return cluster_sequence

fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
# Explicit .copy(): columns are assigned below, and assigning onto a
# column-subset of another frame can raise SettingWithCopyWarning.
data = pd.read_csv(fname, sep="|")[["Essay", "Sent Number", "Processed Sentence", "Concept Codes"]].copy()

# Missing values stringify as "nan"; map those to "" so they yield no codes.
raw_codes = data["Concept Codes"].astype(str).apply(lambda s: "" if s == "nan" else s)
data["Concept_Codes"] = raw_codes.apply(to_concepts_only)
del data["Concept Codes"]

# Distinct cluster ids per sentence, and the ordered cluster sequence.
data["Clusters"] = data["Processed Sentence"].apply(apply_cluster_extraction)
data["Seq_Clusters"] = data["Processed Sentence"].apply(apply_cluster_sequence_extraction)

data.head(30)

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept_Codes,Clusters,Seq_Clusters
0,EBA1415_AEKD_4_CB_ES-05568.ann,1,What leads to differences in the rates of cora...,[50],[17],[17]
1,EBA1415_AEKD_4_CB_ES-05568.ann,2,Coral is often mistaken for a rock but it is m...,[],"[12, 55]","[55, 12]"
2,EBA1415_AEKD_4_CB_ES-05568.ann,3,Coral bleaching shows bleaching and healthy bl...,[50],"[5, 6, 17, 39, 41]","[17, 39, 5, 6, 41]"
3,EBA1415_AEKD_4_CB_ES-05568.ann,4,Coral bleaching is almost noticeable in the pa...,[50],"[17, 64, 69]","[17, 69, 64]"
4,EBA1415_AEKD_4_CB_ES-05572.ann,1,The part of coral called zooanthellae are not ...,[5],"[12, 37, 38, 55, 68]","[38, 55, 12, 37, 55, 68]"
5,EBA1415_AEKD_4_CB_ES-05572.ann,2,And if they get or much sunlight they start to...,"[5, 50]","[8, 46, 55, 60, 62, 68]","[55, 8, 68, 46, 60, 62]"
6,EBA1415_AEKD_4_CB_ES-05572.ann,3,The reason why is because the zooanthellae if ...,[4],"[3, 32, 37, 55, 67]","[67, 37, 55, 3, 32]"
7,EBA1415_AEKD_4_CB_ES-05572.ann,4,The coral also need INFREQUENT water temperatu...,[],"[6, 23, 45, 55]","[55, 6, 23, 45]"
8,EBA1415_AEKD_4_CB_ES-05572.ann,5,Also its a threats for us because means that m...,[11],"[2, 8, 18, 19, 22, 55, 81]","[2, 18, 55, 22, 81, 8, 19]"
9,EBA1415_AEKD_4_CB_ES-05572.ann,6,Also the water us getting to salty,[13],"[55, 70]","[55, 70]"


In [152]:
# Co-occurrence counts between concept codes and clusters (per sentence).
code2clusters  = defaultdict(lambda : defaultdict(int))
clusters2codes = defaultdict(lambda : defaultdict(int))
# Number of sentences (having both codes and clusters) containing each code / cluster.
codeSentFreq   = defaultdict(int)
clusterSentFreq = defaultdict(int)

# Counts over the ordered cluster sequences.
clusterUnigramFreq = defaultdict(int)
clusterBigramFreq  = defaultdict(int)

sentence_rows = zip(data["Concept_Codes"], data["Clusters"], data["Seq_Clusters"])
for codes, clusters, cluster_seq in sentence_rows:
    # NOTE(review): sentences without concept codes are skipped entirely, so
    # they do not contribute to the unigram/bigram counts either - confirm
    # that this is intended.
    if not codes:
        continue
    if clusters:
        # `clusters` holds distinct ids, so this counts each sentence once.
        # NOTE(review): counted on the same sentences as codeSentFreq (codes
        # and clusters both present) - confirm this is the intended denominator.
        for cluster in clusters:
            clusterSentFreq[cluster] += 1
        for code in codes:
            codeSentFreq[code] += 1
            for cluster in clusters:
                code2clusters[code][cluster] += 1
                clusters2codes[cluster][code] += 1
    for cluster in cluster_seq:
        clusterUnigramFreq[cluster] += 1
    # ordered pairs of adjacent clusters: (a, b) and (b, a) are kept distinct
    for a, b in zip(cluster_seq, cluster_seq[1:]):
        clusterBigramFreq[(a, b)] += 1

#compute LIFT: P(a followed by b) / (P(a) * P(b))
priors = dict()
unigram_total = float(sum(clusterUnigramFreq.values()))
bigram_total  = float(sum(clusterBigramFreq.values()))
for cluster, freq in clusterUnigramFreq.items():
    priors[cluster] = freq / unigram_total

joints = dict()
lifts = dict()
for (a, b), freq in clusterBigramFreq.items():
    # joint probability is relative to all bigrams, priors to all unigrams
    pAandB = freq / bigram_total
    joints[(a, b)] = pAandB
    lifts[(a, b)] = pAandB / (priors[a] * priors[b])

In [153]:
# Sanity check: the joint and prior probabilities should each sum to ~1;
# the last value is the number of distinct ordered cluster pairs.
sum(joints.values()), sum(priors.values()), len(lifts)

(0.999999999999959, 1.0, 4013)

In [155]:
# Ordered cluster pairs whose lift is at least 2.0.
strong_pairs = [pair for pair, lift in lifts.items() if lift >= 2.0]
# first member of a strong pair / second member of a strong pair
causes = set(pair[0] for pair in strong_pairs)
effects = set(pair[1] for pair in strong_pairs)
# clusters appearing on either side
best = causes | effects
len(best), len(priors), len(causes), len(effects)

(82, 82, 82, 82)

In [151]:
# Cluster pairs ordered by decreasing lift. `lambda (k,v): ...` is Python 2
# only; indexing the (pair, lift) item works on Python 2 and 3.
sorted(lifts.items(), key=lambda item: -item[1])

[((43, 63), 172.92962786762848),
 ((12, 63), 114.61614870296307),
 ((14, 65), 64.72219990367381),
 ((24, 42), 55.326324167456114),
 ((46, 63), 51.60727114374254),
 ((27, 37), 43.185054932989374),
 ((11, 65), 42.53285345611575),
 ((48, 49), 35.87802400125107),
 ((11, 14), 33.57856851798612),
 ((19, 40), 33.34287962268017),
 ((14, 68), 32.5514593633183),
 ((51, 74), 27.730658995652956),
 ((21, 78), 27.440414205282867),
 ((32, 58), 27.115320877184494),
 ((7, 43), 26.10258533850996),
 ((57, 70), 24.701137171920365),
 ((12, 27), 24.31251639153762),
 ((43, 64), 24.238114957187825),
 ((40, 48), 23.737480526080255),
 ((12, 37), 22.094679268041073),
 ((1, 16), 21.932270031526947),
 ((43, 77), 21.691521265625877),
 ((16, 74), 21.468820277884557),
 ((69, 73), 21.224777449864785),
 ((37, 59), 20.95745312924484),
 ((4, 66), 20.873623316727866),
 ((49, 51), 20.783561218116024),
 ((63, 79), 20.535393309280884),
 ((3, 21), 20.133423908333256),
 ((3, 56), 19.933836062897488),
 ((34, 71), 19.91310866354

In [117]:
# For each concept code: the share of its sentences that contain each cluster,
# in decreasing order. Printing stops after the first cluster below 50%
# (that cluster is still shown).
for code in code2clusters.keys():
    tally = code2clusters[code]
    total = float(codeSentFreq[code])
    proportions = [(cl, cnt / total) for cl, cnt in tally.items()]
    print(code)
    for cl, prop in sorted(proportions, key=lambda pair: -pair[1]):
        print("%s \t%s%%" % (str(cl).rjust(5), str(round(prop * 100, 2)).rjust(5)))
        if prop < .5:
            break

11
   28 	66.67%
   70 	49.73%
13
   70 	59.63%
   28 	49.48%
12
   28 	 95.9%
   70 	68.03%
   19 	63.93%
   32 	 58.2%
   57 	 58.2%
   55 	33.61%
14
   55 	42.51%
50
   17 	64.74%
   55 	34.48%
1
   23 	64.65%
   17 	36.38%
3
   23 	 64.5%
   55 	44.48%
2
   77 	38.61%
5
   45 	81.76%
   55 	50.59%
   23 	35.88%
4
   32 	 73.6%
   58 	52.28%
   55 	50.25%
   45 	31.47%
7
   55 	 57.2%
   61 	38.82%
6
   55 	 74.2%
   54 	66.34%
    2 	40.05%


In [95]:
# For each cluster: the share of its sentences that carry each concept code,
# in decreasing order. Printing stops after the first code below 50%
# (that code is still shown).
for cluster in clusters2codes.keys():
    tally = clusters2codes[cluster]
    # .get() so that a missing count is not inserted into the defaultdict
    total = float(clusterSentFreq.get(cluster, 0))
    print(cluster)
    if total == 0:
        # no sentence count recorded: report it instead of dividing by zero
        print("  n/a (no sentence count in clusterSentFreq)")
        continue
    proportions = [(code, cnt / total) for code, cnt in tally.items()]
    for code, prop in sorted(proportions, key=lambda pair: -pair[1]):
        print("%s %s%%" % (str(code).rjust(5), str(round(prop * 100, 2)).rjust(5)))
        if prop < 0.5:
            break

0
   50  62.8%
    1  27.9%
1
    1 40.94%
2
   50  52.8%
    6  32.6%
3
   50 54.92%
    3 47.54%
4
    3 62.84%
   50 37.84%
5
   50 89.16%
    3 16.75%
6
   50 58.79%
    3 26.06%
7
   50 67.74%
    3 16.13%
8
   50 68.57%
    1 25.71%
9
   50 65.82%
    7 25.51%
10
   50  65.9%
    7 42.47%
11
   50  66.0%
    3  28.0%
12
   50  64.1%
    1 28.85%
13
    3 56.03%
    1 43.97%
14
   50 60.38%
    3 32.08%
15
   50 80.25%
    3 22.02%
16
    3  48.0%
17
   50 96.92%
    1 16.58%
18
   50 65.75%
    7 34.25%
19
   50  54.7%
   11 26.24%
20
   50 86.51%
    1 16.92%
21
    3 60.92%
    1 43.68%
22
   50 40.64%
23
    3 60.15%
    1 56.07%
   50 47.41%
24
   50 80.74%
    3 13.52%
25
   50 68.31%
    1 21.54%
26
   50 91.67%
    3  15.0%
27
   50  65.0%
    3 23.33%
28
   11 56.49%
   13 54.44%
   50 29.84%
29
   50 45.21%
30
   50 65.53%
    3 17.95%
31
   50 83.71%
    7 78.57%
    6  7.14%
32
    4  46.1%
33
   50 81.79%
    1 22.39%
34
   50 72.92%
    3 20.83%
35
   50 91.95%
    1

# Could we Use the Powerset Method?

In [12]:
# One string per sentence: its concept codes joined by single spaces.
joined_codes = data["Concept_Codes"].map(" ".join)
data["Str_Concept_codes"] = joined_codes
data.head()

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept_Codes,Clusters,Seq_Clusters,Str_Concept_codes
0,EBA1415_AEKD_4_CB_ES-05568.ann,1,What leads to differences in the rates of cora...,[50],[17],[17],50.0
1,EBA1415_AEKD_4_CB_ES-05568.ann,2,Coral is often mistaken for a rock but it is m...,[],"[12, 55]","[55, 12]",
2,EBA1415_AEKD_4_CB_ES-05568.ann,3,Coral bleaching shows bleaching and healthy bl...,[50],"[5, 6, 17, 39, 41]","[17, 39, 5, 6, 41]",50.0
3,EBA1415_AEKD_4_CB_ES-05568.ann,4,Coral bleaching is almost noticeable in the pa...,[50],"[17, 64, 69]","[17, 69, 64]",50.0
4,EBA1415_AEKD_4_CB_ES-05572.ann,1,The part of coral called zooanthellae are not ...,[5],"[12, 37, 38, 55, 68]","[38, 55, 12, 37, 55, 68]",5.0


In [15]:
# Number of distinct space-joined code combinations across all sentences.
len(data["Str_Concept_codes"].unique())

185

In [17]:
# NOTE(review): this re-reads the same file and rebinds `data` to the raw
# frame, replacing the frame that carried the derived columns
# (Concept_Codes, Clusters, Seq_Clusters, Str_Concept_codes). Cells that use
# those columns must be run before this one.
fname = "/Users/simon.hughes/Google Drive/PhD/Data/CoralBleaching/Results/predictions_causal_and_codes.txt"
data = pd.read_csv(fname, sep="|")
data.tail()

Unnamed: 0,Essay,Sent Number,Processed Sentence,Concept Codes,Predictions
10150,EBA1415post_WSKT_6_CB_ES-05347.ann,6,The coral provides a protected nutrient rich e...,,
10151,EBA1415post_WSKT_6_CB_ES-05347.ann,7,This relationship also benefits the coral,,
10152,EBA1415post_WSKT_6_CB_ES-05347.ann,8,The zooxanthellae pass some of the food they m...,,
10153,EBA1415post_WSKT_6_CB_ES-05347.ann,9,It also states coral bleaching is one example ...,"6,14,50,_C->R,_CRel,_RRel,Causer,Result,Causer...","6,14,50,_C->R,_CRel,_RRel,Causer,Result,Causer..."
10154,EBA1415post_WSKT_6_CB_ES-05347.ann,10,During bleaching corals turn white due to the ...,"7,50,_C->R,_CRel,_RRel,Causer,Result,Causer:7,...","7,50,_C->R,_CRel,_RRel,Causer,Result,Causer:7,..."


In [18]:
# Number of distinct raw "Concept Codes" values in the frame currently bound to `data`.
len(data["Concept Codes"].unique())

630