Two heuristics are used to identify names:

* Word classification: Use word embeddings learned from the two skipgram models to train separate neural classifiers. A word is considered as a name if both the classifiers classify it as a name.
* PoS tagging: A word is considered as a name if more than 90% times it is tagged as NNP or NNPS.

### 1. Name Classification

In [1]:
import numpy as np
from random import shuffle
from sklearn.metrics import accuracy_score, roc_auc_score
from collections import Counter

In [2]:
import sys
if "../modules/" not in sys.path: sys.path.append ("../modules/")

In [3]:
from propernames.classifier import TwoLayerNet
from semshift.embeddings import TrainedModel    

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed (1)

<torch._C.Generator at 0x7f66a29bd110>

In [5]:
M1 = TrainedModel ("/hg191/corpora/legaldata/sc-docs/0.model")
M2 = TrainedModel ("/hg191/corpora/legaldata/sc-docs/7.model")

In [6]:
def readExamples (filename):
    with open (filename) as fin:
        sample = [line.strip() for line in fin]
    positives = [candidate.strip("+") for candidate in sample if candidate.startswith ("+")]
    negatives = [candidate for candidate in sample if not candidate.startswith ("+")]
    
    return positives, negatives

positives, negatives = readExamples ("../data/names/annotated.txt")
sample = positives + negatives
shuffle (sample)

In [19]:
# All the constants!
embeds_dim = 100
hidden_dim = 100
label_size = 2
labels_map = {"Name":1, "NonName":0}
iLabels_map = {1:"Name", 0:"NonName"}
nEpochs = 15
train_size, dev_size = 1000, 200

In [8]:
data = [(candidate, iLabels_map[int (candidate in positives)]) 
        for candidate in sample 
        if candidate in M1.m.wv.vocab and candidate in M2.m.wv.vocab]

In [9]:
def use_embeddings (instance, emds):
    """Takes an instance and coverts into a vector"""
    vec = torch.Tensor (emds.m.wv[instance])
    return vec.view(1,-1)

def make_target (label, mapping):
    """Takes a label and coverts it into a categorical variable"""
    return torch.LongTensor([mapping[label]])    

In [10]:
train_data, dev_data, test_data = data[:train_size], data[train_size:train_size+dev_size], data[train_size+dev_size:]
print (len (train_data), len (dev_data), len (test_data))

1000 200 389


In [17]:
def train (instances, emds, input_dim, hidden_dim, output_dim, iters, verbose=False):
    global labels_map
    model = TwoLayerNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    
    for epoch in range (iters):
        training_loss = 0
        for instance, label in instances:
            # Step 1. Torch always accumulates the gradient by 
            # default, so make it zero if we want to backprop 
            # for every instance.
            model.zero_grad()
            
            # Step2. Create the input and output
            src = use_embeddings(instance, emds)
            tgt = make_target (label, labels_map)
            
            # Step 3. Run the forward pass
            log_probs = model.forward (src, F.relu)
            
            # Step 4. Calculate the loss; run the backward pass
            # and make the gradient update.
            loss = loss_function(log_probs, tgt)
            loss.backward ()
            optimizer.step ()
        
            training_loss += loss.item()
            
        if verbose:
            print ("Loss after epoch {0}:{1}".format (epoch+1, training_loss/len(instances)))
    return model

def evaluate (model, instances, verbose=False):
    global iLabels_map
    predictions = list ()
    labels = list ()
    with torch.no_grad():
        for instance, label in instances:
            src = use_embeddings(instance, emds)
            log_probs = model.forward (src, F.relu)
            #prediction = F.softmax (log_probs, dim=1)
            prediction = torch.argmax (log_probs, dim=1)[0].item()
            predictions.append (iLabels_map[prediction])
            labels.append (label)

    acc = accuracy_score (labels, predictions)
    auc = roc_auc_score (labels, predictions)
    if verbose:
        print ("Accuracy after epoch {0}:{1}".format (epoch+1, acc))
        print ("AUC after epoch {0}:{1}".format (epoch+1, auc))
            
    return labels, predictions

def train_and_evaluate (train_instances, test_instances, emds, input_dim, hidden_dim, output_dim, iters, verbose=False):
    global labels_map
    global iLabels_map
    model = TwoLayerNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    
    for epoch in range (iters):
        training_loss = 0
        for instance, label in train_instances:
            # Step 1. Torch always accumulates the gradient by 
            # default, so make it zero if we want to backprop 
            # for every instance.
            model.zero_grad()
            
            # Step2. Create the input and output
            src = use_embeddings(instance, emds)
            tgt = make_target (label, labels_map)
            
            # Step 3. Run the forward pass
            log_probs = model.forward (src, F.relu)
            
            # Step 4. Calculate the loss; run the backward pass
            # and make the gradient update.
            loss = loss_function(log_probs, tgt)
            loss.backward ()
            optimizer.step ()
        
            training_loss += loss.item()
            
        if verbose:
            print ("Loss after epoch {0}:{1}".format (epoch+1, training_loss/len(train_instances)))
            
        test_predictions = list ()
        test_labels = list ()
        with torch.no_grad():
            for instance, label in test_instances:
                src = use_embeddings(instance, emds)
                log_probs = model.forward (src, F.relu)
                #prediction = F.softmax (log_probs, dim=1)
                prediction = torch.argmax (log_probs, dim=1)[0].item()
                test_predictions.append (prediction)
                test_labels.append (labels_map[label])
        acc = accuracy_score (test_labels, test_predictions)
        auc = roc_auc_score (test_labels, test_predictions)
        if verbose:
            print ("Accuracy after epoch {0}:{1}".format (epoch+1, acc))
            print ("AUC after epoch {0}:{1}".format (epoch+1, auc))
            
    return model, test_labels, test_predictions

In [20]:
model1, _, _ = train_and_evaluate (train_data+dev_data, test_data, M1, embeds_dim, hidden_dim, label_size, nEpochs, verbose=True)

Loss after epoch 1:0.4165320779879888
Accuracy after epoch 1:0.922879177377892
AUC after epoch 1:0.9324698598892147
Loss after epoch 2:0.23977441946665445
Accuracy after epoch 2:0.922879177377892
AUC after epoch 2:0.9352231997393288
Loss after epoch 3:0.217659543355306
Accuracy after epoch 3:0.9280205655526992
AUC after epoch 3:0.9415607689801239
Loss after epoch 4:0.20182730952898661
Accuracy after epoch 4:0.9280205655526992
AUC after epoch 4:0.9415607689801239
Loss after epoch 5:0.19010080774625143
Accuracy after epoch 5:0.9280205655526992
AUC after epoch 5:0.9415607689801239
Loss after epoch 6:0.17712292671203614
Accuracy after epoch 6:0.9280205655526992
AUC after epoch 6:0.9388074291300098
Loss after epoch 7:0.17312798897425333
Accuracy after epoch 7:0.9254498714652957
AUC after epoch 7:0.9370153144346693
Loss after epoch 8:0.1615299435456594
Accuracy after epoch 8:0.9254498714652957
AUC after epoch 8:0.9370153144346693
Loss after epoch 9:0.14836889266967773
Accuracy after epoch 9:

In [21]:
model2, _, _ = train_and_evaluate (train_data+dev_data, test_data, M2, embeds_dim, hidden_dim, label_size, nEpochs, verbose=True)

Loss after epoch 1:0.427378475467364
Accuracy after epoch 1:0.9151670951156813
AUC after epoch 1:0.9078201368523949
Loss after epoch 2:0.23741979161898294
Accuracy after epoch 2:0.9177377892030848
AUC after epoch 2:0.9178722710980776
Loss after epoch 3:0.21525579690933228
Accuracy after epoch 3:0.9125964010282777
AUC after epoch 3:0.9142880417073966
Loss after epoch 4:0.19917047142982483
Accuracy after epoch 4:0.910025706940874
AUC after epoch 4:0.9152492668621701
Loss after epoch 5:0.19035330096880596
Accuracy after epoch 5:0.9023136246786633
AUC after epoch 5:0.9098729227761485
Loss after epoch 6:0.18008546352386476
Accuracy after epoch 6:0.8946015424164524
AUC after epoch 6:0.9072499185402412
Loss after epoch 7:0.1696396239598592
Accuracy after epoch 7:0.8946015424164524
AUC after epoch 7:0.904496578690127
Loss after epoch 8:0.15700855334599814
Accuracy after epoch 8:0.8920308483290489
AUC after epoch 8:0.9082111436950147
Loss after epoch 9:0.14716411550839742
Accuracy after epoch 9

In [22]:
# Baseline (most frequent class)
print ("Baseline")
baseline_predictions = [0 for instance, label in test_data]
baseline_truth = [labels_map[label] for instance, label in test_data]
print (accuracy_score (baseline_truth, baseline_predictions))
print (roc_auc_score (baseline_truth, baseline_predictions))

Baseline
0.7172236503856041
0.5


In [23]:
def apply2vocab (model, emds):
    global iLabels_map
    predictions = dict()
    with torch.no_grad():
        for instance in emds.m.wv.vocab:
            src = use_embeddings(instance, emds)
            log_probs = model.forward (src, F.relu)
            prediction = torch.argmax (log_probs, dim=1)[0].item()
            predictions[instance] = iLabels_map[prediction]
            
    return predictions

In [24]:
V1 = apply2vocab (model1, M1)
V2 = apply2vocab (model2, M2)

In [25]:
neural_candidates = set (list (V1.keys())) & set (list (V2.keys()))
neural_names = {candidate for candidate in neural_candidates if V1[candidate] == "Name" and V2[candidate] == "Name"}

print (len(V1))
print (len(V2))
print (len (neural_names))

158629
190451
34559


In [26]:
print (len ({c for c in neural_candidates if V1[c] == "Name" or V2[c] == "Name"}))

51522


### 2. PoS tagging

In [38]:
thresh = 0.90
with open ("/hg191/corpora/legaldata/sc-docs/0.nounprob") as fin:
    P1 = {line.strip().split("\t")[0] for line in fin if float(line.strip().split("\t")[1]) >= thresh}
with open ("/hg191/corpora/legaldata/sc-docs/7.nounprob") as fin:
    P2 = {line.strip().split("\t")[0] for line in fin if float(line.strip().split("\t")[1]) >= thresh}

In [39]:
pos_names = P1 & P2

Write output of both the methods to file

In [41]:
with open ("/hg191/corpora/legaldata/sc-docs/names.neural", "w") as fout:
    for name in neural_names:
        fout.write ("{0}\n".format (name))

with open ("/hg191/corpora/legaldata/sc-docs/names.tagging", "w") as fout:
    for name in pos_names:
        fout.write ("{0}\n".format (name))