# Location classifier

Classifies each entry as either a location or a non-location, using a logistic regression trained on BERT `[CLS]` embeddings of the entry text.

Adapted from the paper "Mapping the Past: Geographically Linking an Early 20th Century Swedish Encyclopedia with Wikidata", code available under `CC BY-NC-SA` at https://github.com/axelahlin/uggleupplagan

In [None]:
# Notebook shell command: install the `datasets` package imported in the next cell.
!pip install datasets

In [None]:
import torch
import json
import joblib
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
# from sklearn.metrics import classification_report
# from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# import matplotlib.pyplot as plt
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint name shared by the encoder and its tokenizer.
model_ckpt = "KB/bert-base-swedish-cased"
model = AutoModel.from_pretrained(model_ckpt).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Seed passed to the dataset splits in create_location_classifier().
RANDOM_STATE = 42

# Labelled CSV read by create_location_classifier(); lives on the mounted Google Drive.
classifier_data = '/content/drive/MyDrive/Colab Notebooks/EDAN70/clean_data_classifier.csv'

In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def create_location_classifier(*, debug=True):
    """Train the location / non-location classifier on BERT ``[CLS]`` features.

    Reads the labelled CSV at ``classifier_data`` (its ``text`` and ``is_loc``
    columns are used), splits it into train / validation / test, encodes every
    text with the module-level ``tokenizer`` and ``model``, and fits a
    ``LogisticRegression`` on the training features.

    Args:
        debug: When true (the default), also print the encoded column names,
            the feature-matrix shapes and the validation accuracy.

    Returns:
        The ``LogisticRegression`` fitted on the training split.
    """
    df = pd.read_csv(classifier_data)

    # First hold out 20 % as the test split, then take 10 % of the remainder
    # as the validation split.
    outer = Dataset.from_pandas(df).train_test_split(
        test_size=0.2, seed=RANDOM_STATE
    )
    inner = outer["train"].train_test_split(test_size=0.1, seed=RANDOM_STATE)
    data = DatasetDict(
        {
            "train": inner["train"],
            "validation": inner["test"],
            "test": outer["test"],
        }
    )

    def tokenize(batch):
        """Tokenize the ``text`` column of a batch."""
        return tokenizer(batch["text"], padding=True, truncation=True)

    def extract_hidden_states(batch):
        """Return the last-layer hidden state at position 0 for each row."""
        # Move only the tensors the model accepts onto the target device.
        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in tokenizer.model_input_names
        }
        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state
        # Position 0 is the [CLS] token.
        return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

    # batch_size=None hands each split to the tokenizer as a single batch, so
    # padding=True pads to the longest text of that split.
    data_encoded = data.map(tokenize, batched=True, batch_size=None)
    if debug:
        print(data_encoded["train"].column_names)

    data_encoded.set_format("torch", columns=["input_ids", "attention_mask"])
    data_hidden = data_encoded.map(extract_hidden_states, batched=True)
    if debug:
        print(data_hidden["train"].column_names)

    def features_and_labels(split):
        """Return the (features, labels) arrays of one named split."""
        features = np.array(data_hidden[split]["hidden_state"])
        labels = np.array(data_hidden[split]["is_loc"])
        return features, labels

    # Each variable is named after the split it actually holds.
    X_train, y_train = features_and_labels("train")
    X_valid, y_valid = features_and_labels("validation")
    X_test, y_test = features_and_labels("test")

    if debug:
        print(X_train.shape, X_test.shape)

    # Train the classifier and report its accuracy on the validation split.
    lr_clf = LogisticRegression(max_iter=3000)
    lr_clf.fit(X_train, y_train)
    if debug:
        print(f"logistic regression classifier score: {lr_clf.score(X_valid, y_valid)}")

    # NOTE(review): cross_val_score clones the estimator and refits it on folds
    # of the data it is given, so this figure describes logistic regression
    # cross-validated within the held-out test split, not the fitted `lr_clf`
    # returned below. Confirm this is the intended metric.
    scores = cross_val_score(lr_clf, X_test, y_test, cv=20)
    print("Accuracy after cv:", scores.mean())

    # Baseline: a dummy classifier fitted on the training split and scored on
    # the validation split.
    dummy_clf = DummyClassifier(random_state=RANDOM_STATE)
    dummy_clf.fit(X_train, y_train)
    print("Baseline: ", dummy_clf.score(X_valid, y_valid))

    return lr_clf


def plot_confusion_matrix(y_preds, y_true, labels, title):
    """Plot a confusion matrix normalised over the true labels.

    Args:
        y_preds: Predicted labels.
        y_true: Ground-truth labels.
        labels: Display names for the classes, passed to the plot as tick labels.
        title: Title drawn above the plot.
    """
    # Imported locally: the file-level imports of these names are commented
    # out, so without this the function raises NameError when called.
    import matplotlib.pyplot as plt
    from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

    # normalize="true" divides each row by the number of samples of that true class.
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title(title)
    plt.show()


def predict_loc(s, clf):
    """Predict the label of one text with a classifier fitted on ``[CLS]`` features.

    Args:
        s: Text to classify; it is tokenized with the module-level ``tokenizer``.
        clf: Fitted classifier exposing ``predict``.

    Returns:
        The first predicted label, unwrapped to a scalar.
    """
    tokens = tokenizer(s, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # Hidden state at position 0 ([CLS]), flattened into one feature row.
    cls_vector = output.last_hidden_state[:, 0].cpu().numpy()
    features = cls_vector.reshape(1, -1)

    return clf.predict(features)[0].flat[0]


def classify_edition(edition_name: str, clf):
    """Label every entry of an edition's JSON file and write the file back.

    Loads ``<edition_name>.json`` from the EDAN70 Drive folder, stores the
    integer prediction for each entry's ``text`` under the ``class`` key, and
    overwrites the same file with the annotated entries.

    Args:
        edition_name: File name of the edition without the ``.json`` suffix.
        clf: Fitted classifier forwarded to ``predict_loc``.
    """
    filename = f'/content/drive/MyDrive/Colab Notebooks/EDAN70/{edition_name}'
    json_path = f"{filename}.json"

    with open(json_path, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    # One prediction per entry; tqdm shows progress over the edition.
    for item in tqdm(entries):
        item['class'] = int(predict_loc(item["text"], clf))

    # The input file is replaced by its annotated version.
    with open(json_path, 'w', encoding='utf-8') as target:
        json.dump(entries, target, ensure_ascii=False, indent=4)

In [None]:
# Train the classifier; the call also prints its evaluation output.
clf = create_location_classifier()

In [None]:
# Label every entry of editions e1 and e2; each <edition>.json file on Drive is
# rewritten in place with a `class` field added to its entries.
classify_edition('e1', clf)
classify_edition('e2', clf)