After learning the embeddings, obtain a _common_ vocabulary. This vocabulary should:

- be common to both the early and the late documents
- not contain any names

The input embeddings are ```model.wv.vectors``` and the output vectors are ```model.trainables.syn1neg```.

In [1]:
import os
from random import shuffle
import gensim
import sys
if "../" not in sys.path: sys.path.append ("../")

from modules.propernames import classifier

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from collections import Counter

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix the seed of torch's global random number generator.
# NOTE(review): only torch is seeded here; the `shuffle` applied to the
# annotated sample uses Python's `random` module — confirm whether that
# should be seeded as well.
torch.manual_seed (1)

<torch._C.Generator at 0x7f7cd9d38690>

In [4]:
# Directories holding the saved embedding models and the corpus statistics.
MODELS_DIR = "/hg191/corpora/legaldata/models"
STATS_DIR = "/hg191/corpora/legaldata/data/stats"
# One saved model per period: the early documents and the later documents.
EARLY_FILE = os.path.join (MODELS_DIR, "sgns.500K.early.100.model")
LATER_FILE = os.path.join (MODELS_DIR, "sgns.500K.later.100.model")

In [5]:
# Load the two saved gensim Word2Vec models from disk.
# NOTE(review): gensim's load is pickle-based, so these files should only
# come from a trusted source — confirm.
early_model = gensim.models.Word2Vec.load(EARLY_FILE)
later_model = gensim.models.Word2Vec.load(LATER_FILE)

**Common vocabulary**

In [6]:
# Words known to each model, and the words known to both of them.
early_vocab = set (early_model.wv.vocab)
later_vocab = set (later_model.wv.vocab)
common_vocab = early_vocab & later_vocab

**Name removal**

In [10]:
def readExamples (filename):
    """Read annotated candidates from `filename`, one candidate per line.

    Each line is stripped of surrounding whitespace. A line starting with
    "+" is a positive example and is returned without its leading "+"
    marker; every other non-blank line is a negative example.

    Returns a `(positives, negatives)` tuple of lists, both in file order.
    """
    positives, negatives = [], []
    with open (filename) as fin:
        for line in fin:
            candidate = line.strip ()
            if not candidate:
                # A blank line carries no candidate.
                continue
            if candidate.startswith ("+"):
                # Remove the marker from the front only, so a candidate
                # that itself ends in "+" is kept intact.
                positives.append (candidate.lstrip ("+"))
            else:
                negatives.append (candidate)

    return positives, negatives

# Read the annotated candidates and merge positives and negatives into a
# single list in random order.
positives, negatives = readExamples ("../data/names/annotated.txt")
sample = positives + negatives
# NOTE(review): `shuffle` comes from Python's `random` module, which is not
# seeded in the code shown here, so the order (and any split derived from
# it) may differ between runs — confirm whether that is intended.
shuffle (sample)

In [11]:
# Hyper-parameters and label encodings shared by the cells below.
embeds_dim, hidden_dim = 300, 300
label_size = 2
labels_map = {"Name": 1, "NonName": 0}
# Inverse lookup: class index -> label name.
iLabels_map = {index: name for name, index in labels_map.items ()}
nEpochs = 10
train_size = 1000
dev_size = 200

In [12]:
# Pair every annotated candidate that occurs in both vocabularies with its
# label name ("Name" for positives, "NonName" otherwise).
# The positives are put in a set first so that each membership test is a
# hash lookup instead of a scan over the whole list.
_positive_set = set (positives)
data = [(candidate, iLabels_map[int (candidate in _positive_set)])
        for candidate in sample
        if candidate in early_vocab and candidate in later_vocab]

In [13]:
def use_embeddings (instance, emds):
    """Look up `instance` in `emds.wv` and return it as a tensor with a single row."""
    row = emds.wv[instance]
    return torch.Tensor (row).view (1, -1)

def make_target (label, mapping):
    """Encode `label` as a one-element LongTensor holding `mapping[label]`."""
    class_index = mapping[label]
    return torch.LongTensor ([class_index])

In [14]:
# Consecutive slices of `data`: the first `train_size` items, the next
# `dev_size` items, and whatever is left over.
train_data = data[:train_size]
dev_data = data[train_size:train_size + dev_size]
test_data = data[train_size + dev_size:]
print (len (train_data), len (dev_data), len (test_data))

1000 200 381


Name removal using word embeddings approach!

In [15]:
def train (instances, emds, input_dim, hidden_dim, output_dim, iters, verbose=False):
    """Fit a fresh `classifier.TwoLayerNet` on `instances`, one SGD step per instance.

    instances: sequence of (word, label) pairs; each label must be a key of
        the module-level `labels_map`.
    emds: embedding model handed to `use_embeddings` for the word lookup.
    input_dim, hidden_dim, output_dim: sizes forwarded to `TwoLayerNet`.
    iters: number of passes over `instances`.
    verbose: when true, print the mean training loss after every pass.

    Returns the trained network.
    """
    net = classifier.TwoLayerNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    criterion = nn.NLLLoss()
    sgd = optim.SGD(net.parameters(), lr=0.1)

    for epoch_number in range (1, iters + 1):
        running_loss = 0
        for word, label in instances:
            # Gradients accumulate across backward() calls, so clear them
            # before handling each instance.
            net.zero_grad()

            features = use_embeddings(word, emds)
            target = make_target (label, labels_map)

            # Forward pass, loss, backward pass, parameter update.
            loss = criterion(net.forward (features, F.relu), target)
            loss.backward ()
            sgd.step ()

            running_loss += loss.item()

        if verbose:
            mean_loss = running_loss/len(instances)
            print ("Loss after epoch {0}:{1}".format (epoch_number, mean_loss))
    return net

def evaluate (model, instances, verbose=False, emds=None):
    """Predict a label for every instance and optionally report accuracy and AUC.

    model: network exposing `forward(src, activation)`.
    instances: sequence of (word, label) pairs; each label must be a key of
        the module-level `labels_map` when `verbose` is true.
    verbose: when true, print accuracy and AUC for the predictions.
    emds: embedding model handed to `use_embeddings`. Optional so existing
        three-argument calls keep working: when omitted, a module-level
        `emds` is used if one exists.

    Returns `(labels, predictions)`, two parallel lists of label names.

    Raises ValueError when no embedding model is available.
    """
    if emds is None:
        # Fall back to a module-level `emds`, which is the name this function
        # read before it accepted the embedding model as a parameter.
        emds = globals ().get ("emds")
        if emds is None:
            raise ValueError ("evaluate() needs an embedding model: pass emds=...")

    predictions = list ()
    labels = list ()
    with torch.no_grad():
        for instance, label in instances:
            src = use_embeddings(instance, emds)
            log_probs = model.forward (src, F.relu)
            prediction = torch.argmax (log_probs, dim=1)[0].item()
            predictions.append (iLabels_map[prediction])
            labels.append (label)

    if verbose:
        # Metrics are computed only when they are reported. There is no epoch
        # in this function, so the messages do not mention one.
        print ("Accuracy:{0}".format (accuracy_score (labels, predictions)))
        # roc_auc_score needs numeric scores, so AUC is computed on the
        # integer class ids rather than on the label names.
        label_ids = [labels_map[label] for label in labels]
        prediction_ids = [labels_map[prediction] for prediction in predictions]
        print ("AUC:{0}".format (roc_auc_score (label_ids, prediction_ids)))

    return labels, predictions

def train_and_evaluate (train_instances, test_instances, emds, input_dim, hidden_dim, output_dim, iters, verbose=False):
    """Train a `classifier.TwoLayerNet` and score it on `test_instances` after every epoch.

    train_instances, test_instances: sequences of (word, label) pairs; each
        label must be a key of the module-level `labels_map`.
    emds: embedding model handed to `use_embeddings` for the word lookup.
    input_dim, hidden_dim, output_dim: sizes forwarded to `TwoLayerNet`.
    iters: number of passes over `train_instances`.
    verbose: when true, print the mean training loss, test accuracy and test
        AUC after every epoch.

    Returns `(model, test_labels, test_predictions)`; the two lists hold the
    integer class ids from the last epoch and are empty when `iters` is 0.
    """
    model = classifier.TwoLayerNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # Defined before the loop so the return below is valid even if the loop
    # body never runs (iters == 0).
    test_predictions = list ()
    test_labels = list ()

    for epoch in range (iters):
        training_loss = 0
        for instance, label in train_instances:
            # Step 1. Torch always accumulates the gradient by
            # default, so make it zero if we want to backprop
            # for every instance.
            model.zero_grad()

            # Step 2. Create the input and output
            src = use_embeddings(instance, emds)
            tgt = make_target (label, labels_map)

            # Step 3. Run the forward pass
            log_probs = model.forward (src, F.relu)

            # Step 4. Calculate the loss; run the backward pass
            # and make the gradient update.
            loss = loss_function(log_probs, tgt)
            loss.backward ()
            optimizer.step ()

            training_loss += loss.item()

        if verbose:
            # max(..., 1) keeps an empty training set from dividing by zero.
            mean_loss = training_loss / max (len (train_instances), 1)
            print ("Loss after epoch {0}:{1}".format (epoch+1, mean_loss))

        # Re-score the test instances with the current weights.
        test_predictions = list ()
        test_labels = list ()
        with torch.no_grad():
            for instance, label in test_instances:
                src = use_embeddings(instance, emds)
                log_probs = model.forward (src, F.relu)
                prediction = torch.argmax (log_probs, dim=1)[0].item()
                test_predictions.append (prediction)
                test_labels.append (labels_map[label])

        if verbose:
            # Metrics are computed only when they are reported.
            acc = accuracy_score (test_labels, test_predictions)
            auc = roc_auc_score (test_labels, test_predictions)
            print ("Accuracy after epoch {0}:{1}".format (epoch+1, acc))
            print ("AUC after epoch {0}:{1}".format (epoch+1, auc))

    return model, test_labels, test_predictions

In [16]:
# Standalone network sized from the shared constants instead of repeating
# the literal sizes. It is not referenced by the cells shown here; the
# training helpers build their own networks.
# NOTE(review): constructing it presumably draws from torch's RNG, so
# removing this cell could change later weight initialisations — confirm.
twoLayerNet = classifier.TwoLayerNet (input_dim=embeds_dim, hidden_dim=hidden_dim, output_dim=label_size)

In [17]:
# Train a name classifier on the early-period embeddings using the train and
# dev portions together, reporting metrics on the test portion after every
# epoch. Only the trained model is kept.
m1, _, _ = train_and_evaluate (train_data+dev_data, test_data, early_model, embeds_dim, hidden_dim, label_size, nEpochs, verbose=True)

Loss after epoch 1:0.41055406073729195
Accuracy after epoch 1:0.8923884514435696
AUC after epoch 1:0.8985346826902805
Loss after epoch 2:0.14180787801742553
Accuracy after epoch 2:0.8740157480314961
AUC after epoch 2:0.8393421884882984
Loss after epoch 3:0.07544833421707153
Accuracy after epoch 3:0.89501312335958
AUC after epoch 3:0.8692283364958886
Loss after epoch 4:0.04563492377599081
Accuracy after epoch 4:0.8792650918635171
AUC after epoch 4:0.8304870335230867
Loss after epoch 5:0.02474735418955485
Accuracy after epoch 5:0.8792650918635171
AUC after epoch 5:0.821157495256167
Loss after epoch 6:0.018852357069651285
Accuracy after epoch 6:0.89501312335958
AUC after epoch 6:0.844349567784103
Loss after epoch 7:0.007827977339426676
Accuracy after epoch 7:0.8766404199475065
AUC after epoch 7:0.8038161501159603
Loss after epoch 8:0.002113466262817383
Accuracy after epoch 8:0.9028871391076115
AUC after epoch 8:0.8746046805819102
Loss after epoch 9:0.0011298807462056478
Accuracy after epo

In [18]:
# Same procedure with the later-period embeddings: train on train + dev,
# report metrics on the test portion after every epoch, keep only the model.
m2, _, _ = train_and_evaluate (train_data+dev_data, test_data, later_model, embeds_dim, hidden_dim, label_size, nEpochs, verbose=True)

Loss after epoch 1:0.49724227527777354
Accuracy after epoch 1:0.863517060367454
AUC after epoch 1:0.8694918827746152
Loss after epoch 2:0.18090183138847352
Accuracy after epoch 2:0.8451443569553806
AUC after epoch 2:0.8569470799072317
Loss after epoch 3:0.08653055707613627
Accuracy after epoch 3:0.8792650918635171
AUC after epoch 3:0.8553658022348725
Loss after epoch 4:0.039121803442637125
Accuracy after epoch 4:0.8740157480314961
AUC after epoch 4:0.8517815728441914
Loss after epoch 5:0.03561392863591512
Accuracy after epoch 5:0.89501312335958
AUC after epoch 5:0.8630086443179421
Loss after epoch 6:0.00785568634668986
Accuracy after epoch 6:0.8871391076115486
AUC after epoch 6:0.8576323002319206
Loss after epoch 7:0.002132910887400309
Accuracy after epoch 7:0.8818897637795275
AUC after epoch 7:0.8447185325743201
Loss after epoch 8:0.000983403523763021
Accuracy after epoch 8:0.8871391076115486
AUC after epoch 8:0.8545224541429474
Loss after epoch 9:0.0006149959564208984
Accuracy after 

In [19]:
# Constant baseline: predict class 0 for every test instance.
print ("Baseline")
baseline_truth = [labels_map[label] for _, label in test_data]
baseline_predictions = [0] * len (test_data)
print (accuracy_score (baseline_truth, baseline_predictions))
print (roc_auc_score (baseline_truth, baseline_predictions))

Baseline
0.7322834645669292
0.5


In [20]:
def apply2vocab (model, emds):
    """Classify every word in `emds.wv.vocab` with `model`.

    Returns a dict mapping each word to its predicted label name, looked up
    in the module-level `iLabels_map`.
    """
    def predict (word):
        # Forward pass for one word, then the name of the highest-scoring class.
        scores = model.forward (use_embeddings (word, emds), F.relu)
        return iLabels_map[torch.argmax (scores, dim=1)[0].item ()]

    with torch.no_grad():
        return {word: predict (word) for word in emds.wv.vocab}

In [21]:
# Label every vocabulary word of each period with the classifier that was
# trained on that period's embeddings.
V1 = apply2vocab (m1, early_model)
V2 = apply2vocab (m2, later_model)

In [54]:
# Words labelled by both classifiers; a word counts as a name as soon as
# either classifier calls it "Name".
neural_candidates = V1.keys () & V2.keys ()
neural_names = {word for word in neural_candidates if "Name" in (V1[word], V2[word])}

print (len (V1))
print (len (V2))
print (len (neural_names))

159639
247005
46512


Name removal using part of speech tagging heuristic!

In [55]:
def _words_at_or_above (path, threshold):
    """Return the words in a tab-separated file whose score is >= `threshold`.

    Each non-blank line is stripped and split on tabs; the first field is
    the word and the second is parsed as a float score.
    """
    selected = set ()
    with open (path) as fin:
        for line in fin:
            fields = line.strip ().split ("\t")
            if fields == [""]:
                # Blank line: nothing to parse.
                continue
            if float (fields[1]) >= threshold:
                selected.add (fields[0])
    return selected

# Words whose score in the per-period "nounprob" files reaches the threshold.
thresh = 0.5
P1 = _words_at_or_above (os.path.join (STATS_DIR, "ops.500K.early.nounprob"), thresh)
P2 = _words_at_or_above (os.path.join (STATS_DIR, "ops.500K.later.nounprob"), thresh)

In [56]:
# A word is flagged by the tagging heuristic if it passed the threshold in
# either period.
pos_names = P1 | P2

In [57]:
# Number of words flagged by the tagging heuristic.
print (len (pos_names))

789657


In [58]:
# Save both name lists under STATS_DIR, one word per line.
with open (os.path.join (STATS_DIR, "names.neural"), "w") as fout:
    fout.writelines ("{0}\n".format (name) for name in neural_names)

with open (os.path.join (STATS_DIR, "names.tagging"), "w") as fout:
    fout.writelines ("{0}\n".format (name) for name in pos_names)

Finally, eliminate all words in the vocabulary that are either identified as names by the embedding approach or by part of speech tagging.

In [59]:
# Size of the common vocabulary once every word flagged by either approach is removed.
len (common_vocab.difference (pos_names, neural_names))

34769