In [None]:
from transformers import AutoModelForCausalLM , AutoTokenizer, pipeline
import sys
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import re
from huggingface_hub import InferenceClient
from math import log2
from google.colab import userdata

In [None]:
!pip install evaluate

In [None]:
import evaluate

In [None]:
class LMHeadModel:
    """Wrapper around a causal language model and its tokenizer.

    Both are loaded with the Hugging Face ``from_pretrained`` loaders and are
    used to obtain next-token probabilities for a text prefix.
    """

    def __init__(self, model_name):
        """Load the model and the tokenizer identified by ``model_name``."""
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def get_tokenizer(self):
        """Return the tokenizer loaded in ``__init__``."""
        return self.tokenizer

    def get_predictions(self, sentence):
        """Encode ``sentence`` and return the first element of the model output.

        The model is called without gradient tracking. The result is indexed
        as ``[0, -1, :]`` by ``get_next_word_probabilities``, so it is
        presumably a (batch, sequence, vocabulary) score tensor -- confirm
        against the model in use.
        """
        inputs = self.tokenizer.encode(sentence, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(inputs)
        return outputs[0]

    def get_next_word_probabilities(self, sentence, top_k_words):
        """Return the most likely next tokens after ``sentence``.

        Args:
            sentence: Text prefix to continue.
            top_k_words: Maximum number of candidates to return. Values larger
                than the number of available scores are clamped instead of
                making ``torch.topk`` raise.

        Returns:
            A ``zip`` iterator of ``(token, probability)`` pairs in the order
            produced by ``torch.topk``. Tokens are decoded one id at a time
            and stripped of surrounding whitespace, so distinct ids can yield
            the same string. Probabilities come from a softmax over all
            scores at the last position, not only over the selected ones.
        """
        last_position_scores = self.get_predictions(sentence)[0, -1, :]

        # torch.topk rejects k larger than the dimension size.
        k = min(top_k_words, last_position_scores.shape[-1])
        top_ids = torch.topk(last_position_scores, k).indices.tolist()

        tokens = [self.tokenizer.decode([token_id]).strip() for token_id in top_ids]
        probabilities = torch.nn.functional.softmax(last_position_scores, dim=-1)
        top_probabilities = probabilities[top_ids].tolist()

        return zip(tokens, top_probabilities)

In [None]:
# Determiner words whose next-word probability is compared against the
# actual next word at every position of a sentence.
DETERMINERS = [
    "the",
    "a",
]

# Part-of-speech tags that mark the current word as a determiner.
DETERMINER_TAGS = [
    "determiner",
    "article",
]

In [None]:
def _parse_pos_tags(pos_tagged, line):
    """Map word positions of ``line`` to the lower-cased tags found in ``pos_tagged``.

    ``pos_tagged`` is expected to contain whitespace-separated ``word/tag``
    items. Tags are assigned by position (the tagger's own spelling of the
    word is ignored), a leading underscore is dropped, and items beyond the
    length of ``line`` are discarded.
    """
    tags = {}
    position = 0
    for item in pos_tagged.replace("\n", " ").split(" "):
        if position >= len(line):
            break  # tagger produced more items than there are words
        match = re.match(r"^([^/]+)/(.+)", item)
        if match:
            tags[position] = re.sub(r"^_", "", match.group(2)).lower()  # _noun
            position += 1
    return tags


def _edit_lowers_perplexity(perplexity, original_text, edited_text):
    """Return True when ``edited_text`` has a lower GPT-2 perplexity than ``original_text``."""
    results = perplexity.compute(model_id='gpt2',
                                 add_start_token=False,
                                 predictions=[original_text, edited_text])
    ppl_original = round(results["perplexities"][0], 2)
    ppl_edited = round(results["perplexities"][1], 2)
    return ppl_original > ppl_edited


def generate_probs(lines, lmm, top_k, output_file):
    """Annotate sentences with suspected missing or superfluous determiners.

    For every sentence the words are POS-tagged through the Hugging Face
    inference API. At each position the next-word probabilities from ``lmm``
    are inspected:

    * if a determiner is more probable than the actual next word and that
      word is tagged noun/adjective, the determiner is inserted and kept as
      a ``[MISSING DET:...]`` marker when the insertion lowers perplexity;
    * otherwise, if the current word is tagged as a determiner and the next
      word is not a noun, it is removed and marked ``[EXTRA DET:...]`` when
      the removal lowers perplexity.

    Args:
        lines: Sequence of sentences, each a list of word strings.
        lmm: Object exposing ``get_next_word_probabilities(sentence, top_k)``
            that yields ``(token, probability)`` pairs.
        top_k: Number of next-word candidates requested from ``lmm``.
        output_file: Path of the text file the results are written to.

    Returns:
        List with one annotated output string per sentence.
    """
    print("Generating probabilities...")
    Sentences = []
    Outputs = []

    # Only this many of the returned candidates are considered per position.
    max_candidates = 100

    # Secret key via Colab (outside Colab the token could be read from os.environ["HF_TOKEN"]).
    HF_TOKEN = userdata.get('HF_TOKEN')
    client = InferenceClient(
        api_key=HF_TOKEN,
    )

    # Loading the metric is loop-invariant; do it once instead of once per word.
    perplexity = evaluate.load("perplexity", module_type="metric")

    for line in tqdm(lines):
        if len(line) < 2:
            # Nothing to compare against: record the sentence unchanged and
            # skip the tagging request.
            text = " " + line[0] if line else ""
            Sentences.append(text)
            Outputs.append(text)
            continue

        full_sent = ' '.join(line)
        messages = [
            {
                "role": "user",
                "content": "Assign parts of speech to the following text. You will output a tag for every separate word. The tags are limited to adjective, noun, verb, preposition, article, adverb, conjunction, propernoun. Output format is: word/tag, per separate word. No asterisk or numbers. Replace spaces in tags with underscores. Example: I ate bananas. Output: I/pronoun ate/verb bananas/noun. Sentence: "+full_sent
            }
        ]
        completion = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3-8B-Instruct",
            messages=messages,
            max_tokens=500,
            temperature=0.0,
        )
        pos_tags = _parse_pos_tags(completion.choices[0].message.content or "", line)

        sentence = ""
        output = ""
        for pos in range(len(line) - 1):
            word = line[pos]
            next_word = line[pos + 1]
            sentence += " " + word
            output += " " + word

            # Several token ids can decode to the same stripped string; keep
            # the highest probability rather than whichever came last.
            probsD = {}
            candidates = list(lmm.get_next_word_probabilities(sentence, top_k))
            for token, prob in candidates[:max_candidates]:
                probsD[token] = max(prob, probsD.get(token, 0.0))
            for det in DETERMINERS:
                probsD.setdefault(det, 0.0)
            probsD.setdefault(next_word, 0.0)

            # A tag is None when the tagger returned fewer items than words.
            word_tag = pos_tags.get(pos)
            next_tag = pos_tags.get(pos + 1)

            missing = False
            if next_tag in ('noun', 'adjective'):  # trigger: pattern
                for det in DETERMINERS:
                    if probsD[det] > probsD[next_word]:
                        text1 = ' '.join(line[pos:])
                        text2 = word + " " + det + " " + ' '.join(line[pos + 1:])  # insert det
                        if _edit_lowers_perplexity(perplexity, text1, text2):
                            output += " [MISSING DET:%s] " % (det)
                            print("MISSING DET ", next_word, det)
                            missing = True

            # Only look for a superfluous determiner when none was reported missing.
            if (not missing
                    and word_tag in DETERMINER_TAGS
                    and next_tag is not None
                    and next_tag != 'noun'):  # trigger: pattern
                # One word of left context when available; at position 0 a
                # negative index would wrap around to the end of the sentence.
                start = max(pos - 1, 0)
                text1 = ' '.join(line[start:])
                remaining = list(line[start:pos]) + list(line[pos + 1:])  # remove det
                # The perplexity metric is run with add_start_token=False,
                # which needs more than one token per text; skip otherwise.
                if len(remaining) >= 2:
                    text2 = ' '.join(remaining)
                    if _edit_lowers_perplexity(perplexity, text1, text2):
                        output += " [EXTRA DET:%s] " % (word)
                        print("EXTRA DET ", next_word, word)

        # The inner loop stops one word early; append the final word.
        sentence += " " + line[-1]
        output += " " + line[-1]
        Sentences.append(sentence)
        Outputs.append(output)

    with open(output_file, "w") as outp:
        for (sentence, output) in zip(Sentences, Outputs):
            outp.write("Sentence:%s\n" % (sentence))
            outp.write("Output:%s\n" % (output))
            outp.write("\n\n")

    print("See %s for output." % (output_file))
    return Outputs


In [None]:
def main(input_file, top_k, output_file):
    """Run determiner checking on every sentence of ``input_file``.

    Args:
        input_file: Text file with one sentence per line.
        top_k: Number of next-word candidates requested per position.
        output_file: Path of the file the annotated sentences are written to.

    Each line is split on whitespace; blank lines are skipped so that no
    empty sentences or empty-string tokens reach ``generate_probs``.
    """
    with open(input_file, "r") as f:
        lines = [tokens for tokens in (row.split() for row in f) if tokens]

    llm = LMHeadModel("meta-llama/Meta-Llama-3-8B-Instruct")
    #llm = LMHeadModel("NousResearch/Llama-2-7b-hf") # No Huggingface key necessary for this one

    outputs = generate_probs(lines, llm, top_k, output_file)
    for output in outputs:
        print(output)


#if __name__=="__main__":
#    if len(sys.argv)!=4:
#        print("Usage: python get-nextword-probs.py <sentence file: one sentence per line> <desired top k words probabilities (number)> <output file name>")
#        print("Example: python3.9 get-nextword-probs.py sentences.txt 10 nextword-probs.txt")
#        exit(0)
#    main(sys.argv[1], int(sys.argv[2]), sys.argv[3]) # file with one sentence per line, top k word probabilities (number), output file name
    # Like: python3.9 get-nextword-probs-determiners.py det.txt 10000 det.out


In [None]:
# The input file holds one sentence per line. Place it in Drive, e.g. in a
# temp folder: mount the drive first, then use "Upload to session storage".
main(
    input_file="det.txt",
    top_k=10000,
    output_file="det.out",
)