<a href="https://colab.research.google.com/github/suchirsalhan/evaluation-pipeline/blob/main/SLING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SLING BabyLM Evaluation

In [2]:
#needed dependencies for io in Drive
!pip install -U -q PyDrive
!pip install h5py
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
drive.mount('/content/drive')
!ls "/content/drive/MyDrive/CST/Project/"

Mounted at /content/drive
 Acquisition	      'Corpus Phonemicisers.gdoc'   MAO-BabyBERTa	    Tokeniser
 Analysis	       Evaluation		    Preprocessing	    Untitled1.ipynb
'BabyBERTA HPC.gdoc'   Examples			    SemTagger
 Baselines	      'Interactive Node HPC.gdoc'  'TO DO – Project.gdoc'


In [10]:
import os
import re
import copy
import nltk
import torch
import glob
import tqdm
import numpy as np
from nltk import Tree
from collections import defaultdict
import json

# Functions for causal language models. Return lists.
def get_token_log_prob(model, tokenizer, sentence):
    with torch.inference_mode():
        inputs = tokenizer(sentence, return_tensors='pt')

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        model.to(device)

        outs = model(**inputs)

        all_log_probs = torch.log_softmax(outs['logits'], dim=-1)
        token_log_probs = torch.gather(all_log_probs[0, :-1], 1, inputs['input_ids'][0, 1:].unsqueeze(1))
        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0, 1:].cpu())

        return tokens, token_log_probs

def get_ppl(model, tokenizer, list_of_sentences):
    all_neg_ppl = []
    all_lens = []

    for sentence in tqdm.tqdm(list_of_sentences):
        _, token_log_probs = get_token_log_prob(model, tokenizer, sentence)
        ppl = torch.exp(-1 * token_log_probs.squeeze().mean())
        all_lens.append(len(token_log_probs.squeeze()))

        all_neg_ppl.append(-1 * ppl)

    return all_neg_ppl, all_lens

def get_prob(model, tokenizer, list_of_sentences):
    all_prob = []

    for sentence in tqdm.tqdm(list_of_sentences):
        _, token_log_probs = get_token_log_prob(model, tokenizer, sentence)
        prob = token_log_probs.squeeze().sum()

        all_prob.append(prob)

    return all_prob

# Functions for masked language models. Return lists.
def get_token_pll(model, tokenizer, sentence):  # pseudo log likelihood
    token_log_probs = []
    MASK = tokenizer.mask_token_id

    with torch.inference_mode():
        inputs = tokenizer(sentence, return_tensors='pt')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        model.to(device)

        # skip first ([CLS]) and last ([SEP]) tokens for for loop
        for i in range(1, len(inputs['input_ids'][-1]) - 1):
            # store a copy of token_id at mask_index position
            true_id = inputs['input_ids'][-1][i].item()
            # replace inputs['input_ids'][0, i] with [MASK] (id: 103)
            inputs['input_ids'][-1][i] = MASK

            outs = model(**inputs)
            masked_token_logits = outs['logits'][-1][i]
            log_prob = torch.log_softmax(masked_token_logits, dim=-1)
            token_log_probs.append(log_prob[true_id].item())
            # replace [MASK] with true_id
            inputs['input_ids'][-1][i] = true_id

        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0, 1:-1].cpu())
        return tokens, token_log_probs

def get_pppl(model, tokenizer, list_of_sentences, func_type='bert'):  # Pseudo PerPLexity
    all_neg_pppl = []
    func = get_token_pll
    all_N = []
    for sentence in tqdm.tqdm(list_of_sentences):
        _, token_log_probs = func(model, tokenizer, sentence)
        sent_pll = sum(token_log_probs)
        N = len(token_log_probs)
        pppl = torch.tensor(np.exp(-sent_pll / N))
        all_N.append(N)

        all_neg_pppl.append(-1 * pppl)
    return all_neg_pppl, all_N

def AveragePerplexity(perplexity_list):
    neg_log_prob_sum = 0
    count = 0

    for i in perplexity_list:
        neg_log_prob = torch.log(-i)
        neg_log_prob_sum += neg_log_prob
        count += 1

    ave_neg_log_prob = neg_log_prob_sum / count
    ave_ppl = torch.exp(ave_neg_log_prob)
    return -ave_ppl

def AvePplGoodBad(good_ppl, bad_ppl):
    ave_ppl_good = AveragePerplexity(good_ppl)
    ave_ppl_bad = AveragePerplexity(bad_ppl)
    return ave_ppl_good, ave_ppl_bad

def find_failed_cases(good_sent_ppl, bad_sent_ppl):
    failed_case_idx = []
    for x, y in tqdm.tqdm(zip(good_sent_ppl, bad_sent_ppl)):
        if x < y:
            failed_case_idx.append(good_sent_ppl.index(x))
    return failed_case_idx

def run_masked_models(model, tokenizer, good_sent_list, bad_sent_list, func_type='bert', metric="perplexity"):
    if metric == "perplexity":
        good_sent_pscore, good_lens = get_pppl(model, tokenizer, good_sent_list, func_type)
        bad_sent_pscore, bad_lens = get_pppl(model, tokenizer, bad_sent_list, func_type)
    failed_case_idx = find_failed_cases(good_sent_pscore, bad_sent_pscore)
    accuracy = 1 - len(failed_case_idx) / len(good_sent_pscore)
    return accuracy, good_sent_pscore, bad_sent_pscore


## Chinese BabyBERTa

Loading Model Checkpoints from HuggingFace

In [None]:
!git clone "https://huggingface.co/cambridge-climb/chinese-climb-roberta_pre_layer_norm-model" "/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/"

Cloning into '/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa'...
remote: Enumerating objects: 162, done.[K
remote: Counting objects: 100% (159/159), done.[K
remote: Compressing objects: 100% (158/158), done.[K
remote: Total 162 (delta 47), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (162/162), 195.67 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [None]:
!cd "/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/"

In [None]:
!git init

Reinitialized existing Git repository in /content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/.git/


In [None]:
!git branch --all

* [32mmain[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/chinese-climb[m
  [31mremotes/origin/chinese-growing[m
  [31mremotes/origin/chinese-inward[m
  [31mremotes/origin/chinese-mmm[m
  [31mremotes/origin/main[m


In [None]:
!git fetch --all

Fetching origin


In [None]:
!git checkout "remotes/origin/chinese-climb"

Filtering content: 100% (12/12), 262.05 MiB | 85.08 MiB/s, done.
Note: switching to 'remotes/origin/chinese-climb'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 387f21c update model card README.md
fatal: cannot exec '.git/hooks/post-checkout': Permission denied


Loading Model Checkpoint and Tokeniser

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("cambridge-climb/ZH-CamBabyTokenizer")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/87.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/362k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [7]:
import torch
import json
from transformers import RobertaForMaskedLM, RobertaConfig

# Load the state dictionary with the specified map_location, if cpu
#model_state_dict = torch.load('/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/pytorch_model.bin', map_location=map_location)

model_state_dict = torch.load('/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/pytorch_model.bin')
# Load the configuration file
with open('/content/drive/MyDrive/CST/Project/Evaluation/Chinese/BabyBERTa/config.json', 'r') as f:
    config_dict = json.load(f)

# Initialize the model with the loaded configuration
config = RobertaConfig.from_dict(config_dict)
model = RobertaForMaskedLM(config)

print("Model loaded successfully!")


Model loaded successfully!


Evaluating Model on SLING

In [8]:
sling_files = glob.glob("/content/drive/MyDrive/CST/Project/Evaluation/SLING/*.jsonl", recursive=True)
mp_dict_list = []
for sling_file in sling_files:
    good_sent, bad_sent = [], []
    with open(sling_file, "r") as file:
        mp_dict_list.extend([json.loads(x) for x in file.read().strip().split("\n")])
    for mp_dict in mp_dict_list:
        good_sent.append(mp_dict["sentence_good"])
        bad_sent.append(mp_dict["sentence_bad"])

In [None]:
print(mp_dict_list[1])

{'sentence_good': '她见到了那个制止了暴利的女警察。', 'sentence_bad': '她见到了那个她制止了暴利的女警察。', 'phenomenon': 'RelativeClause', 'paradigm': 'rc_resumptive_pronoun', 'pair_ID': 1, 'field': 'syntax'}


In [None]:
for sling_file in sling_files:
    accuracy, good_pppl, bad_pppl = run_masked_models(model, tokenizer, good_sent, bad_sent, metric="perplexity")
    ave_ppl_good,ave_ppl_bad = AvePplGoodBad(good_pppl,bad_pppl)
    print(f"\t{sling_file}\t{accuracy*100:.5f}\t{ave_ppl_good:.5f}\t{ave_ppl_bad:.5f}\n")

  1%|          | 288/40000 [00:37<1:17:52,  8.50it/s]