In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import itertools
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import networkx as nx

nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')
stopwords = stopwords.words('english')

np.set_printoptions(suppress=True)

[nltk_data] Downloading package brown to /Users/r0g06z5/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/r0g06z5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/r0g06z5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Word Sense Disambiguation - ML approaches

In this notebook, we will look into following approaches for WSD: 
1. **Supervised:** Decision List algorithm
2. **Semi-Supervised:** Yarowsky's Method
3. **Unsupervised:** HyperLex

#### Supervised approach - Decision List Classifier

We are interested in finding appropriate sense given a sentence. In this approach, we are given with labelled dataset with (sentence, sense) pairs. 

The steps for Decision List Classifier are as follows:
1. We split the data into train and validation sets. 
2. Learn the Decision List on training set. We do this by computing log-likelihood ratio between two senses of ambiguous word given a collocation.
3. Evaluate it on validation set.

Here as an example, we are considering adjacent words (bigrams) as collocations. Data can be downloaded from the below link.

In [2]:
# !rm -rf public
# !git clone https://github.com/myTomorrows-research/public.git

In [3]:
def make_data(path):
    df = pd.read_csv(path, sep='\t')
    df['Term'].replace({'Albumin':'Albumine', 'BBC':'BCC'}, inplace=True)
    idx_map = {'ACS':1, 'Albumine':2, 'BCC':0}

    for i in range(len(df)):
        study = df.loc[i, 'ID_study']
        term = df.loc[i, 'Term']
        idx = idx_map[term]
        study_path = f'public/HealthInf2021/{term}/{idx}_{study}.txt'
        with open(study_path) as f:
            data = f.read().splitlines()
        try:
            summary = data[data.index('brief_summary:') + 1].strip()
            df.loc[i, 'summary'] = summary
        except:
            pass

    # preprocessing
    df.dropna(inplace=True)
    df.drop(columns='ID_study', inplace=True)
    df.rename(columns={'Final sense':'sense'}, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['Term'] = list(map(lambda x: x.strip(), df['Term']))
    df['sense'] = list(map(lambda x: x.strip(), df['sense']))
    return df

def get_collocations(sentence, tokenizer, n_gram):
    words = tokenizer.tokenize(sentence)
    words = [word.lower() for word in words if word.isalnum() and word not in stopwords]
    collocations = list(ngrams(words, n_gram))
    return collocations    

In [4]:
n_gram = 2
path = 'public/HealthInf2021/benchmark_myT_WSD.txt'
tokenizer = RegexpTokenizer(r'\w+')

# making data
df = make_data(path)

# splitting into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
len(train_df), len(val_df)

(100, 25)

In [5]:
def get_all_collocations(df):
    corpus = df['summary'].values.tolist()
    all_collocations = list(map(lambda sent: get_collocations(sent, tokenizer, n_gram), corpus))
    flattened_all_collocations = list(itertools.chain(*all_collocations))
    unique_collocations = list(set(flattened_all_collocations))
    return unique_collocations, all_collocations

In [6]:
# get collocation_sense_mapping
def get_collocation_sense_map(df, unique_collocations, all_collocations):
    collocation_sense_map = {}
    for c in unique_collocations:
        collocation_sense_map[c] = []
        for idx in range(len(df)):
            if c in all_collocations[idx]:
                term = df.loc[idx,'Term']
                sense = df.loc[idx,'sense']
                collocation_sense_map[c].append(term + '-' + sense)
    return collocation_sense_map

# compute log-likelihood ratio and find sense for given collocation
def get_sense_given_collocation(test_collocation, collocation_sense_map):
    senses = collocation_sense_map[test_collocation]
    senses = dict(Counter(senses))
    cond_probs = {k:v/sum(senses.values()) for k,v in senses.items()}

    max_llr = -float('inf')
    sense = None
    for iter_senseA, probA in cond_probs.items():
        for iter_senseB, probB in cond_probs.items():
            # log-likelihood ratio
            llr = np.log(probA / probB)
            if llr > max_llr:
                max_llr = llr
                sense = iter_senseA
    return sense, max_llr

# get final decision list
def get_decision_list(collocation_sense_map):
    decision_list = []
    for test_collocation in collocation_sense_map.keys():
        sense, max_llr = get_sense_given_collocation(test_collocation, collocation_sense_map)
        decision_list.append([test_collocation, sense, max_llr])

    decision_list = sorted(decision_list, key=lambda x: x[2], reverse=True)    
    decision_list = pd.DataFrame(decision_list, columns=['collocations','sense','LLR'])
    return decision_list

In [7]:
unique_collocations, all_collocations = get_all_collocations(train_df)
print('Number of unique collocations:', len(unique_collocations))

collocation_sense_map = get_collocation_sense_map(train_df, unique_collocations, all_collocations)
decision_list = get_decision_list(collocation_sense_map)
decision_list.head(10)

Number of unique collocations: 3501


Unnamed: 0,collocations,sense,LLR
0,"(syndrome, acs)",ACS-Sense6,2.197225
1,"(study, evaluate)",BCC-Sense3,1.609438
2,"(the, aim)",Albumine-Sense2,1.609438
3,"(the, purpose)",BCC-Sense3,1.203973
4,"(purpose, study)",BCC-Sense3,1.203973
5,"(septic, shock)",Albumine-Sense2,1.098612
6,"(the, study)",ACS-Sense6,1.098612
7,"(this, study)",Albumine-Sense2,1.098612
8,"(standard, care)",Albumine-Sense2,1.098612
9,"(coronary, artery)",ACS-Sense6,1.098612


In [8]:
def get_true_labels(val_df):
    y_true = (val_df['Term'] + '-' + val_df['sense']).values
    return np.array(y_true)

def get_predictions(val_df, decision_list):
    y_pred = []
    val_corpus = val_df['summary'].values.tolist()
    val_all_collocations = list(map(lambda sent: get_collocations(sent, tokenizer, n_gram), val_corpus))    
    for val_collocations in val_all_collocations:
        bool_found = 0
        for idx in range(len(decision_list)):
            colloc = decision_list.loc[idx,'collocations']
            sense = decision_list.loc[idx,'sense']
            if colloc in val_collocations:
                bool_found = 1
                break
        if not bool_found:
            sense = ''
        y_pred.append(sense)
    return np.array(y_pred)

In [9]:
# true labels
y_val = get_true_labels(val_df)

# prediction labels
y_pred = get_predictions(val_df, decision_list)

assert len(y_val) == len(y_pred)
accuracy_score(y_val, y_pred)

0.64

### Semi-Supervised approaches

The problem with Supervised approaches is that you need labelled data. However, you may not have it all the time or you have only a small sample of it. Now, we will look into **Yarowsky's method** which is semi-supervised. 

It's based on the idea of bootstrapping. You start with some small sample of data with labels and progressively label remaining examples.

In [10]:
def single_pass(bootstrap_df, oob_df):
    unique_collocations, all_collocations = get_all_collocations(bootstrap_df)
    collocation_sense_map = get_collocation_sense_map(bootstrap_df, unique_collocations, all_collocations)
    decision_list = get_decision_list(collocation_sense_map)
    decision_list = decision_list[decision_list['LLR']>0].reset_index(drop=True)    

    y_val = get_true_labels(oob_df)
    y_pred = get_predictions(oob_df, decision_list)
    assert len(y_val) == len(y_pred)
    acc_score = accuracy_score(y_val, y_pred)      

    bootstrap_idx = np.where(y_pred!='')[0]
    oob_idx = np.where(y_pred=='')[0]
    bootstrap_df = pd.concat([bootstrap_df, oob_df.loc[bootstrap_idx]]).reset_index(drop=True)  
    oob_df = oob_df.loc[oob_idx].reset_index(drop=True)
    return bootstrap_df, oob_df, decision_list, acc_score

def run_Yarowsky_algorithm(df, bootstrap_size, random_state, n_iters):
    # splitting into two sets
    oob_df, bootstrap_df = train_test_split(df, test_size=bootstrap_size, random_state = random_state)
    oob_df.reset_index(drop=True, inplace=True)
    bootstrap_df.reset_index(drop=True, inplace=True)

    for iter in range(n_iters):
        bootstrap_df, oob_df, decision_list, acc_score = single_pass(bootstrap_df, oob_df)
        if acc_score==0:
            print('\nNo collocations matched between decision list and OOB sample..')
            print('Terminating..')
            break
        else:
            print(f'Iteration:{iter+1} | Decision List Length: {len(decision_list)} | Accuracy Score: {acc_score.round(3)}')

In [11]:
n_gram = 2
path = 'public/HealthInf2021/benchmark_myT_WSD.txt'
tokenizer = RegexpTokenizer(r'\w+')
bootstrap_size = 0.1
random_state = 0
n_iters = 10

# making data
df = make_data(path)

# running algorithm
run_Yarowsky_algorithm(df, bootstrap_size, random_state, n_iters)

Iteration:1 | Decision List Length: 1 | Accuracy Score: 0.045
Iteration:2 | Decision List Length: 10 | Accuracy Score: 0.202
Iteration:3 | Decision List Length: 23 | Accuracy Score: 0.158
Iteration:4 | Decision List Length: 34 | Accuracy Score: 0.103

No collocations matched between decision list and OOB sample..
Terminating..


### Unsupervised approach

The limitation of the above semi-supervised approach is that the accuracy depends on how good your initial seed set is. 

#### Hyperlex
Now, we will look into an approach which requires no labels at all. The idea is that for an ambiguous word, there will be many connections among context words within different senses but not many among context words between different senses. 

For example - context words for word run can be (government, politics, authority) and (walk, fast, speed). So, there will be connections among (government, politics, authority) and among (walk, fast, speed) but not between say government and fast.

The steps are as follows:

**Training:**

1. Build the co-occurence graph $G$ based on statistics on your data 
2. Given a target ambiguous word and context, we build a spanning tree with:
    1. Target word as the root node 
    2. Hubs as neighbours to target word in $G$

**Run-time:**

1. Now, we compute score vector of length k (one for each hub) for all context words
2. Sum hub scores for all context words 
3. The hub with maximum score is the sense of ambiguous word being used in given context

In [12]:
n_sents = 500
window_size = 5
brown = nltk.corpus.brown
sents = brown.sents()[:n_sents]

print('Processing sentences..\n')
processed_sents = []
for sent in sents:
    processed_sents.append([word.lower() for word in sent if word.isalnum() and word not in stopwords])

tokens = list(set(list(itertools.chain(*processed_sents))))   
n_tokens = len(tokens)
print('Number of Sentences:', len(sents)) 
print('Number of Tokens:', n_tokens)

Processing sentences..

Number of Sentences: 500
Number of Tokens: 2493


In [13]:
def get_co_occurences(token, processed_sents, window_size):
    co_occurences = []
    for sent in processed_sents:
        if len(sent):
            for idx in (np.array(sent)==token).nonzero()[0]:
                co_occurences.append(sent[max(0, idx-window_size):min(idx+window_size+1, len(sent))])

    co_occurences = list(itertools.chain(*co_occurences))
    co_occurence_idxs = list(map(lambda x: token2int[x], co_occurences))
    co_occurence_dict = Counter(co_occurence_idxs)
    co_occurence_dict = dict(sorted(co_occurence_dict.items()))
    return co_occurence_dict

def get_co_occurence_matrix(tokens, processed_sents, window_size):
    for token in tokens:
        token_idx = token2int[token]
        co_occurence_dict = get_co_occurences(token, processed_sents, window_size)
        co_occurence_matrix[token_idx, list(co_occurence_dict.keys())] = list(co_occurence_dict.values())
        
    np.fill_diagonal(co_occurence_matrix, 0)    
    return co_occurence_matrix

In [14]:
token2int = dict(zip(tokens, range(len(tokens))))
int2token = {v:k for k,v in token2int.items()} 

print('Building co-occurence matrix..')
co_occurence_matrix = np.zeros(shape=(len(tokens), len(tokens)), dtype='int')
co_occurence_matrix = get_co_occurence_matrix(tokens, processed_sents, window_size)
print('Co-occurence matrix shape:', co_occurence_matrix.shape)
assert co_occurence_matrix.shape == (n_tokens, n_tokens)

# co-occurence matrix is similar
assert np.all(co_occurence_matrix.T == co_occurence_matrix)

Building co-occurence matrix..
Co-occurence matrix shape: (2493, 2493)


In [15]:
# adding nodes
def add_nodes(G, n_tokens):
    nodes = range(n_tokens)
    G.add_nodes_from(nodes)
    return G

def make_edges(co_occurence_matrix, row_idx):
    row = co_occurence_matrix[row_idx]
    idxs = np.where(row!=0)[0]

    # weights as distance
    weights = 1 - row[idxs]
    edge_weights = list(zip(['weight'] * len(idxs), weights))
    edge_weights = list(map(lambda x: {x[0]:x[1]}, edge_weights))
    edges = list(tuple(zip([row_idx] * len(idxs), idxs, edge_weights)))
    return edges

def add_edges(co_occurence_matrix, n_tokens):
    all_edges = list(map(lambda row_idx: make_edges(co_occurence_matrix, row_idx), range(n_tokens)))
    all_edges = list(itertools.chain(*all_edges))
    G.add_edges_from(all_edges)
    return G

In [16]:
# removing isolated nodes
# edges to/from removed nodes are removed as well
def remove_isolated_nodes(G):
    nodes = G.nodes()
    for node in nodes:
        if len(G.successors(node))==0 and len(G.predecessors(node))==0:
            G.remove_node(node)   
    return G 

# normalzing edge weights
def normalize_edge_weights(G):
    for node in G.nodes():
        if len(G.successors(node)) > 0:
            total_weight = 0
            for neighbor in G.successors(node):
                total_weight += G[node][neighbor]['weight']
            if total_weight:
                for neighbor in G.successors(node):
                    G[node][neighbor]['weight']/= total_weight  
    return G

In [17]:
G = nx.DiGraph()
G = add_nodes(G, n_tokens)
G = add_edges(co_occurence_matrix, n_tokens)
G = remove_isolated_nodes(G)
G = normalize_edge_weights(G)
G_orig = G.copy()

print('Final number of nodes: ', len(G.nodes()))
print('Final number of edges: ', len(G.edges()))

Final number of nodes:  2492
Final number of edges:  42648


In [18]:
def step(node, G, G_new, dist_threshold, start_flag):
    # filter edges based on threshold 
    edges = np.array(G.edges(node))
    edge_weights_dict = np.array(list(G[node].values()))
    if start_flag:
        edge_weights = np.array([0] * len(edge_weights_dict))
    else:
        edge_weights = np.array(list(map(lambda x: list(x.values())[0], edge_weights_dict)))
    chosen_idxs = np.where(edge_weights <= dist_threshold)[0]

    if len(chosen_idxs):
        # add edges in new graph
        # nodes are automatically added 
        edges = edges[chosen_idxs]
        edge_weights_dict = edge_weights_dict[chosen_idxs]
        assert len(edges) == len(edge_weights_dict)
        
        from_nodes = np.array(edges)[:,0]
        to_nodes = np.array(edges)[:,1]
        new_edges = list(zip(from_nodes, to_nodes, edge_weights_dict))
        G_new.add_edges_from(new_edges)

        # remove edges from original graph
        G.remove_edges_from(new_edges)
    return G, G_new

In [19]:
num_senses = list(map(lambda x: len(wn.synsets(x)), tokens))
token_senses = list(zip(tokens, num_senses))
token_senses = list(map(lambda x: list(x), token_senses))
token_senses = sorted(token_senses, key= lambda x: x[1], reverse=True)
print('Tokens with most number of senses:')
print(np.array(token_senses[:5]))

Tokens with most number of senses:
[['cut' '70']
 ['run' '57']
 ['making' '52']
 ['running' '52']
 ['made' '52']]


In [20]:
def training(target_node, G, dist_threshold):
    G_new = nx.DiGraph()
    n_iter = 0

    hubs = np.array(G.successors(target_node))
    degrees = np.array(list(map(lambda x: G.degree(x), hubs)))
    hubs = hubs[degrees.argsort()[::-1]].tolist()
    nodes = [target_node] + hubs
    
    for node in nodes:
        if node==target_node:
            start_flag = 1
        else:
            start_flag = 0
        G, G_new = step(node, G, G_new, dist_threshold, start_flag)  
        n_iter += 1

        if n_iter%5==0:
            print(f'Iteration: {n_iter}')
            print(f'Nodes in Original Tree: {len(G.nodes())} | Edges in Original Tree: {len(G.edges())}')
            print(f'Nodes in Spanning Tree: {len(G_new.nodes())} | Edges in Spanning Tree: {len(G_new.edges())}')
            print()
    return G_new

In [21]:
target_word = 'run'
dist_threshold = 0

target_node = token2int[target_word]
senses = wn.synsets(target_word)
print('Number of senses:',len(senses),'\n')
G_new = training(target_node, G, dist_threshold)

Number of senses: 57 

Iteration: 5
Nodes in Original Tree: 2492 | Edges in Original Tree: 42010
Nodes in Spanning Tree: 544 | Edges in Spanning Tree: 638

Iteration: 10
Nodes in Original Tree: 2492 | Edges in Original Tree: 41814
Nodes in Spanning Tree: 634 | Edges in Spanning Tree: 834

Iteration: 15
Nodes in Original Tree: 2492 | Edges in Original Tree: 41725
Nodes in Spanning Tree: 655 | Edges in Spanning Tree: 923



In [22]:
def get_path_length(G, path):
    path_length = 0
    for path_idx in range(len(path)-1):
        start = path[path_idx]
        end = path[path_idx+1]
        path_length += list(G[start][end].values())[0]
    return path_length

def evaluation(G, hubs, present_context_nodes):
    context_scores = []
    for context_node in present_context_nodes:
        scores = []
        if context_node in G.nodes():
            for hub in hubs:
                if nx.has_path(G, hub, context_node):
                    if hub==context_node:
                        score = 1
                    else:
                        path = nx.shortest_path(G, hub, context_node)
                        path_length = get_path_length(G, path)
                        score = 1 / (1 + path_length)
                else:
                    score = 0
                scores.append(score)        
        else:
            scores = [0] * len(hubs)
        context_scores.append(scores)

    context_scores = np.array(context_scores)
    context_scores = np.sum(context_scores, axis=0)
    return context_scores

In [23]:
context = 'authorities political decisions government mayor votes gas firms federal'
context_words = context.split(' ')

present_context_words = []
present_context_nodes = []
for word in context_words:
    if word in tokens:
        node = token2int[word]
        present_context_words.append(word)
        present_context_nodes.append(node)
hubs = G_new.successors(target_node)

print('Context words:', present_context_words)
print('Number of hubs:', len(hubs))

context_scores = evaluation(G_new, hubs, present_context_nodes)
sense_idxs = np.where(context_scores == max(context_scores))[0]
senses = list(map(lambda x: int2token[hubs[x]], sense_idxs))
print('Plausible senses:', senses)

Context words: ['authorities', 'political', 'decisions', 'government', 'mayor', 'votes', 'gas', 'firms', 'federal']
Number of hubs: 15
Plausible senses: ['primary', 'announced', '1', 'hartsfield', 'mayor', 'reelection', 'would']
