Measure the mobility of characters in books in [litbank](https://github.com/dbamman/litbank/blob/master/README.md).

In [15]:
import pandas as pd
import os
from collections import defaultdict
from tqdm import tqdm

Let's get all the gutenberg IDs from litbank

In [4]:
books_file = "../data/gutenberg_metadata/publication_dates.csv"
litbank_books = pd.read_csv (books_file, sep="|")
litbank_books = litbank_books.drop(['Unnamed: 0', 'Unnamed: 5'], axis=1)

Now read the geonames from the file and store it as fast lookup.

In [30]:
geonames_file = "../data/geonames/lat_longs.tsv"
geo_names = pd.read_csv (geonames_file, sep="\t")
geo_names_lookup = defaultdict (list)
for i, row in tqdm (geo_names.iterrows ()):
    geo_names_lookup[str(row["name"]).lower()].append ((row["lat"], row["lon"]))
print (len (geo_names_lookup))

12355198it [12:05, 17022.50it/s]

7575941





Now we will read all the collocations from all the books and keep only the ones that are GPE tagged for place names.

In [31]:
collocations = list ()
for i, row in litbank_books.iterrows ():
    gut_id = row["Gutenberg ID"]
    filename = os.path.join (collocations_dir, f"{gut_id}.collocations")
    if os.path.exists (filename):
        collocation_book = pd.read_csv (filename, sep="\t")
        collocations.append (collocation_book)
all_collocations = pd.concat (collocations)
# Keep only those locations that are tagged as GPE
all_collocations = all_collocations[all_collocations["locations_cat"] == "GPE"]
# Keep only those locations that are present in geo names lookup
all_collocations = all_collocations[all_collocations["locations_text"].str.lower().isin (geo_names_lookup)]
print (len (all_collocations))

85466


Generate a test set of examples now

In [50]:
context_window = 100
content_dirs = [
    "/mnt/data0/kentkchang/charemotions/corpus/booknlp.1.0.7/gutenberg_fiction_tagged_1_4",
    "/mnt/data0/kentkchang/charemotions/corpus/booknlp.1.0.7/gutenberg_fiction_tagged_5_9"
]
output = list ()
for i,row in tqdm (all_collocations.iterrows ()):
    book_id = row["book_id"]
    start = min(int (row["persons_start_token"]), int (row["locations_start_token"]))
    end = max(int (row["persons_end_token"]), int(row["locations_end_token"]))
    from_here = start - context_window
    till_there = end + context_window
    for content_dir in content_dirs:
        path = os.path.join (content_dir, f"{book_id}.tokens")
        if os.path.exists(path) and os.path.getsize(path) > 0:
            break
    
    with open (path) as fin:
        context = list ()
        for j, line in enumerate (fin):
            if j == 0:
                continue
            parts = line.strip().split ("\t")
            token_id = int (parts[3])
            if token_id < from_here:
                continue
            elif token_id >= from_here and token_id <= till_there:
                context.append (parts[4])
            else:
                break

        context = " ".join (context)
        toks = context.split ()
        for k in range (context_window + end + 1 - start, len (toks)):
            if toks[k] == ".": # end of sentence
                break
        toks = toks[0:k+1]
    row[f"context_{context_window}"] = context
    output.append (row)
output_df = pd.DataFrame (output)

62704it [2:22:03,  7.36it/s] 


In [58]:
output_df.to_csv ("../data/experiments/mobility_analysis/testset.tsv", sep="\t", header=True, index=False)

## Classifiers

Now let's train our two classifiers: (a) To predict whether the category belongs to the good or bad categories and (b) To predict the actual spatial category.

We'll then apply these classifiers on the test set that we created.

In [52]:
import torch
torch.manual_seed (96)
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import json

from transformers import BertTokenizer, BertModel
from collections import Counter
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from sklearn.model_selection import train_test_split

In [53]:
def search_sublist (sentence_as_tokens, entity_as_tokens):
    """
    For the entity span (decomposed into tokens), find all the start and end
    positions within the sentence (decomposed into tokens)
    """
    sublist_positions = list ()
    for i, token in enumerate (sentence_as_tokens):
        #print (token, entity_as_tokens[0])
        if token == entity_as_tokens[0]: # see if the first character matches
            if all ([tok == sentence_as_tokens[i+j] if (i+j) < len(sentence_as_tokens) else False for j,tok in enumerate (entity_as_tokens)]):
                sublist_positions.append ((i, i+len(entity_as_tokens)))

    return sublist_positions

In [54]:
class BERTRelationPrediction (nn.Module):
    def __init__ (self, model_name="bert-base-cased", bert_dims=768, n_labels=8):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False, do_basic_tokenize=False)
        self.bert = BertModel.from_pretrained(model_name)
        self.n_labels = n_labels
        self.fc = nn.Linear (2*bert_dims, self.n_labels)

    def forward (self, text, per_entity_span, loc_entity_span, device="cpu"):
        # get entity span representations, concatenate and pass it through a 
        # feedforward network.
        token_wordpieces = self.tokenizer.convert_ids_to_tokens (self.tokenizer (text)['input_ids'][1:-1])
        per_entity_wordpieces = self.tokenizer.convert_ids_to_tokens(self.tokenizer (per_entity_span)['input_ids'][1:-1])
        loc_entity_wordpieces = self.tokenizer.convert_ids_to_tokens(self.tokenizer (loc_entity_span)['input_ids'][1:-1])

        # We'll have to change this eventually
        per_entity_positions = search_sublist (token_wordpieces, per_entity_wordpieces)
        loc_entity_positions = search_sublist (token_wordpieces, loc_entity_wordpieces)

        encoded_input = self.tokenizer (text, return_tensors="pt")
        encoded_input.to(device)
        _, pooled_inputs, sequence_outputs =  self.bert (**encoded_input, output_hidden_states=True, return_dict=False)
        last_layer_output = sequence_outputs[-1][0]
        per_entity_repr = last_layer_output[per_entity_positions[0][0]: per_entity_positions[0][1],:].mean (dim=0)
        loc_entity_repr = last_layer_output[loc_entity_positions[0][0]: loc_entity_positions[0][1],:].mean (dim=0)

        input_repr = torch.cat ((per_entity_repr, loc_entity_repr), 0)
        output = self.fc (input_repr)
        return output

    def evaluate (self):
        pass

In [56]:
annotations_file = "/mnt/data0/ssoni/projects/mobility-books/data/annotations/final_annotations/final_annotations.v1.tsv"
pretrained_model_name = "bert-base-cased"
num_epochs = 10
context_field = "context_100"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
all_labels = [ "NO RELATIONSHIP ASSERTED",
               "TOWARD(got there)",
               "FROM",
               "NEAR",
               "IN",
               "NEGATIVE ASSERTION",
               "THROUGH",
               "TOWARD (uncertain got there)",
               "BAD LOC",
               "BAD PER",
               "UNCERTAIN ASSERTION"]

bad_labels = [ "BAD LOC",
               "BAD PER",
               "UNCERTAIN ASSERTION"]

label_names = {0:"GOOD", 1:"BAD"}
accepted_labels = ["GOOD", "BAD"]

df = pd.read_csv (annotations_file, sep="\t")
df = df[df["Spatial Relation"] != ""]
df = df[df["Spatial Relation"].isin (all_labels)]
df["Spatial SuperRelation"] = df["Spatial Relation"].isin (bad_labels)
#train_df, test_df = train_test_split(df, test_size=0.2, random_state=96)

bertRE = BERTRelationPrediction (model_name=pretrained_model_name, bert_dims=768, n_labels=2)
bertRE.to(device)
optimizer = torch.optim.Adam(bertRE.parameters(), lr=1e-5)
cross_entropy=nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    print (f"Epoch: {epoch}")
    bertRE.train()
    # Train
    for i in tqdm (range (len (df))):
        # get the extracted quantities
        text = df[context_field].iloc[i]
        per_entity_span = df["persons_text"].iloc[i]
        loc_entity_span = df["locations_text"].iloc[i]
        label = label_names[int (df["Spatial SuperRelation"].iloc[i])]
        #label = train_df["Spatial Relation"].iloc[i] 
        y_pred = bertRE.forward (text, per_entity_span, loc_entity_span, device=device)
        y_truth = accepted_labels.index (label)
        #print (y_pred, torch.tensor (y_truth))
        loss = cross_entropy (y_pred.unsqueeze (0), torch.tensor ([y_truth]).to(device))
        optimizer.zero_grad ()
        loss.backward ()
        optimizer.step ()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 0


  7%|███████████                                                                                                                                                          | 85/1261 [01:19<18:01,  1.09it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
  7%|███████████                                                                                                                                                          | 85/1261 [01:19<18:22,  1.07it/s]


RuntimeError: The size of tensor a (517) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Evaluation
   groundtruth, predictions = list (), list ()
    bertRE.eval()
    with torch.no_grad():
        for i in tqdm (range (len (test_df))):
            # get the extracted quantities
            text = test_df[args.context].iloc[i]
            per_entity_span = test_df["persons_text"].iloc[i]
            loc_entity_span = test_df["locations_text"].iloc[i]
            #label = test_df["Spatial Relation"].iloc[i]
            label = label_names[int (test_df["Spatial SuperRelation"].iloc[i])]
            y_truth = accepted_labels.index (label)
            y_pred = bertRE.forward (text, per_entity_span, loc_entity_span, device=device)
            groundtruth.append (y_truth)
            predictions.append (torch.argmax (torch.nn.functional.softmax (y_pred)).item())

    print (classification_report (groundtruth, predictions))
    print (classification_report (groundtruth, [4]*len(predictions))) #baseline

test_df["ground_truth"] = groundtruth
test_df["predictions"] = predictions

test_df.to_csv (args.output_filename, sep="\t", index=False, header=True)