# Week 5: Sentence Completion Challenge

The cell below will load the language_model class (developed last week) and train it using the files in the training directory.

In [None]:
%load_ext autoreload
%autoreload 2  #this means that language_model will be reloaded when you run this cell - this is important if you change the language_model class!
import os
from lab5resources.language_model import * ## import language model from previous lab
parentdir="/Users/juliewe/Dropbox/teaching/AdvancedNLP/2024/week4/lab4/lab4resources/sentence-completion" #you may need to update this 

trainingdir=os.path.join(parentdir,"Holmes_Training_Data")
training,testing=get_training_testing(trainingdir)
MAX_FILES=20   #use a small number here whilst developing your solutions
mylm=language_model(trainingdir=trainingdir,files=training[:MAX_FILES],adjust_unknowns=True)

In [None]:
%config IPCompleter.greedy=True

Let's have a look at the most frequent words in the training data.

In [None]:
# Rank the (word, value) pairs of the unigram table, largest value first.
vocab=list(mylm.unigram.items())
vocab.sort(key=lambda entry:entry[1],reverse=True)


In [None]:
vocab[:10]

How big is the vocabulary?  What kind of words are low frequency?  What kind of words are mid-frequency?

In [None]:
len(vocab)

In [None]:
vocab[-10:]

In [None]:
topvocab=vocab[:9500]

In [None]:
topvocab[-10:]

Make sure you can:
* look up bigram probabilities
* generate a sentence according to the model
* calculate the perplexity of a test sentence

Now let's load in and have a look at the sentence completion challenge questions.

In [None]:
import pandas as pd, csv
# Locations of the challenge questions and of the answer key.
questions=os.path.join(parentdir,"testing_data.csv")
answers=os.path.join(parentdir,"test_answer.csv")

# newline="" is what the csv module asks for when it is given a file object:
# without it, newlines embedded inside quoted fields are not interpreted correctly.
with open(questions,newline="") as instream:
    csvreader=csv.reader(instream)
    lines=list(csvreader)
# The first row supplies the column names; every value is kept as a string.
qs_df=pd.DataFrame(lines[1:],columns=lines[0])
qs_df.head()

We need to be able to tokenize the questions so that the gaps can be located.

In [None]:
from nltk import word_tokenize as tokenize

# Tokenise every question in the DataFrame.
tokens=[tokenize(q) for q in qs_df['question']]
# Display a small sample only: printing the tokens of every question floods the
# cell output without making the tokenisation any easier to inspect.
tokens[:5]

Getting the context of the blank: look at the words that precede it (`window` gives the number of preceding words to return).

In [None]:
def get_left_context(sent_tokens,window,target="_____"):
    """Return the tokens immediately to the left of the first `target` token.

    Parameters:
        sent_tokens: sequence of tokens for one sentence.
        window: maximum number of preceding tokens to return.
        target: the token that marks the blank.

    Returns:
        A slice of `sent_tokens` holding at most `window` tokens that directly
        precede the first occurrence of `target`. Fewer tokens are returned
        when the blank is closer to the start of the sentence than `window`;
        an empty list is returned when `target` does not occur.
    """
    # Position of the first blank, or -1 when the sentence has none.
    found=next((i for i,token in enumerate(sent_tokens) if token==target),-1)
    if found<0:
        return []
    # Clamp the start at 0: a negative start index would wrap round to the end
    # of the sentence and silently drop the context that is available.
    return sent_tokens[max(0,found-window):found]
    

# Store the tokenised form of every question next to its raw text, then record
# the two tokens that come directly before the blank in each question.
qs_df['tokens']=qs_df['question'].map(tokenize)
qs_df['left_context']=qs_df['tokens'].map(lambda toks: get_left_context(toks,window=2))
qs_df.head()

##  Building and evaluating an SCC system
1. always predict the same answer (e.g., "a")


In [None]:
# from lab3resources.scc import *
### you can import this the above line but I have included the code here to make it easier to inspect it

class question:
    """One sentence-completion question, held as the raw list of its CSV fields.

    Looking a field up by column name relies on the class attribute
    ``colnames`` (column name -> position). It is not defined in this class and
    has to be assigned on ``question`` before ``get_field`` is called.
    """

    def __init__(self,aline):
        # aline: the fields of one row of the question file
        self.fields=aline

    def get_field(self,field):
        """Return the value stored in the column named `field`."""
        position=question.colnames[field]
        return self.fields[position]

    def add_answer(self,fields):
        """Remember the correct answer, taken from the second entry of `fields`."""
        self.answer=fields[1]

    def chooseA(self):
        """Baseline strategy: always answer "a"."""
        return "a"

    def predict(self,method="chooseA"):
        """Return the prediction made by `method`; None for an unrecognised method."""
        #eventually there will be lots of methods to choose from
        if method!="chooseA":
            return None
        return self.chooseA()

    def predict_and_score(self,method="chooseA"):
        """Return 1 when the prediction made by `method` equals the stored answer, else 0."""
        guess=self.predict(method=method)
        return 1 if guess==self.answer else 0

class scc_reader:
    """Reads the question and answer CSV files and scores prediction methods.

    Questions and answers are paired by row order (not by id), so both files
    are expected to list the questions in the same order.
    """

    def __init__(self,qs=questions,ans=answers):
        # qs / ans: paths of the question file and of the answer file
        self.qs=qs
        self.ans=ans
        self.read_files()

    @staticmethod
    def _read_csv(path):
        """Return every row of the CSV file at `path` as a list of string fields."""
        # newline="" is what the csv module asks for when given a file object,
        # so that newlines embedded in quoted fields are interpreted correctly.
        with open(path,newline="") as instream:
            return list(csv.reader(instream))

    def read_files(self):
        """Load the questions, index their column names and attach the answers."""
        #read in the question file
        qlines=self._read_csv(self.qs)

        #store the column names as a reverse index so they can be used to reference parts of the question
        question.colnames={item:i for i,item in enumerate(qlines[0])}

        #create a question instance for each line of the file (other than heading line)
        self.questions=[question(qline) for qline in qlines[1:]]

        #read in the answer file
        alines=self._read_csv(self.ans)

        #add answers to questions so predictions can be checked
        #(zip stops at the shorter of the two sequences)
        for q,aline in zip(self.questions,alines[1:]):
            q.add_answer(aline)

    def get_field(self,field):
        """Return the value of column `field` for every question."""
        return [q.get_field(field) for q in self.questions]

    def predict(self,method="chooseA"):
        """Return the prediction of `method` for every question."""
        return [q.predict(method=method) for q in self.questions]

    def predict_and_score(self,method="chooseA"):
        """Return the fraction of questions that `method` answers correctly.

        Returns 0.0 when no questions were loaded, rather than dividing by zero.
        """
        scores=[q.predict_and_score(method=method) for q in self.questions]
        if not scores:
            return 0.0
        return sum(scores)/len(scores)
    
            

In [None]:
SCC = scc_reader()

In [None]:
SCC.get_field("b)")

In [None]:
SCC.predict()

In [None]:
SCC.predict_and_score()

### Adding a random choice

In [None]:
import numpy as np
class question(question):
    """Extends the existing question class with a uniformly random strategy."""

    #a class would not normally inherit from itself like this,
    #but in a jupyter notebook it is a neat way to extend a pre-existing class
    #(the alternative is to redefine the class, copying all of its existing code)

    def chooserandom(self):
        """Pick one of the five option letters at random."""
        return np.random.choice(["a","b","c","d","e"])

    def predict(self,method="chooseA"):
        """Return the prediction made by `method`; None for an unrecognised method."""
        if method=="chooseA":
            return self.chooseA()
        if method=="random":
            return self.chooserandom()
        return None
    

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="random")

### Using the language model
using unigram probabilities

In [None]:
class question(question):
    """Extends the existing question class with a unigram-based strategy."""

    #a class would not normally inherit from itself like this,
    #but in a jupyter notebook it is a neat way to extend a pre-existing class
    #(the alternative is to redefine the class, copying all of its existing code)

    def chooseunigram(self,lm):
        """Pick the option whose word has the largest entry in `lm.unigram`.

        Words missing from `lm.unigram` count as 0; ties are broken at random.
        """
        letters=["a","b","c","d","e"]
        scored=[(letter,lm.unigram.get(self.get_field(letter+")"),0)) for letter in letters]
        top=max(score for _,score in scored)
        tied=[letter for letter,score in scored if score==top]
        return np.random.choice(tied)

    def predict(self,method="chooseA",lm=mylm):
        """Return the prediction made by `method`; None for an unrecognised method."""
        if method=="chooseA":
            return self.chooseA()
        if method=="random":
            return self.chooserandom()
        if method=="unigram":
            return self.chooseunigram(lm=lm)
        return None

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="unigram")

### Adding Context
looking up context and bigram probabilities


In [None]:
class question(question):
    """Extends the existing question class with language-model strategies that
    condition on the token to the left of the blank."""

    #you wouldn't normally have a class inherit from itself like this
    #but it is quite a neat way in jupyter notebooks to extend pre-existing classes
    #you could alternatively redefine the class (copying all of the pre-existing class)

    def get_tokens(self):
        """Return the tokenised question text wrapped in "__START" and "__END" tokens."""
        return ["__START"]+tokenize(self.fields[question.colnames["question"]])+["__END"]

    def get_left_context(self,window=1,target="_____"):
        """Return up to `window` tokens directly preceding the first `target` token.

        Fewer tokens are returned when the blank is closer to the start of the
        token list than `window`; an empty list is returned when `target` does
        not occur.
        """
        sent_tokens=self.get_tokens()
        # Position of the first blank, or -1 when there is none.
        found=next((i for i,token in enumerate(sent_tokens) if token==target),-1)
        if found<0:
            return []
        # Clamp the start at 0: a negative start index would wrap round to the
        # end of the sentence and silently drop the available context.
        return sent_tokens[max(0,found-window):found]

    def choose(self,lm,method="bigram",choices=[]):
        """Pick the option that `lm` scores highest given the left context.

        Parameters:
            lm: object providing get_prob(word, context, methodparams=...).
            method: passed to `lm.get_prob` as methodparams["method"].
            choices: option letters to consider; an empty list means all five.

        Returns:
            One of the best-scoring option letters, chosen at random among ties.
        """
        # The default list is only compared and rebound, never mutated.
        if choices==[]:
            choices=["a","b","c","d","e"]
        context=self.get_left_context(window=1)
        probs=[lm.get_prob(self.get_field(ch+")"),context,methodparams={"method":method}) for ch in choices]
        maxprob=max(probs)
        bestchoices=[ch for ch,prob in zip(choices,probs) if prob == maxprob]
        #if len(bestchoices)>1:
        #    print("Randomly choosing from {}".format(len(bestchoices)))
        return np.random.choice(bestchoices)

    def predict(self,method="chooseA",model=mylm):
        """Return the prediction made by `method`.

        "chooseA" and "random" use the baseline strategies; any other value is
        handed to `choose` together with `model`. The default for `model` is
        the object bound to `mylm` when this class definition is executed.
        """
        if method=="chooseA":
            return self.chooseA()
        elif method=="random":
            return self.chooserandom()
        else:
            # Use the model that was passed in rather than the global mylm, so
            # that a caller-supplied model is not silently ignored.
            return self.choose(model,method=method)

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="bigram")

In [None]:
qs_df["bigram_pred"]=SCC.predict(method="bigram")
qs_df

In [None]:
qs_df["unigram_pred"]=SCC.predict(method="unigram")
qs_df

In [None]:
mylm.unigram["theological"]

In [None]:
mylm.get_prob("theological")

In [None]:
mylm.bigram[","]["theological"]

In [None]:
mylm.get_prob("theological",context=[","],methodparams={"method":"bigram"})

In [None]:
mylm.unigram["residing"]

In [None]:
mylm.get_prob("residing")

In [None]:
mylm.get_prob("residing",context=["are"],methodparams={"method":"bigram"})

## Right context

In [None]:
class question(question):
    """Extends the existing question class so that the token to the right of
    the blank can be used as well."""

    #a class would not normally inherit from itself like this,
    #but in a jupyter notebook it is a neat way to extend a pre-existing class
    #(the alternative is to redefine the class, copying all of its existing code)

    def get_right_context(self,window=1,target="_____"):
        """Return the `window` tokens that follow the first `target` token.

        An empty list is returned when `target` does not occur.
        """
        sent_tokens=self.get_tokens()
        positions=[i for i,token in enumerate(sent_tokens) if token==target]
        if not positions:
            return []
        start=positions[0]+1
        return sent_tokens[start:start+window]

    def choose(self,lm,method="bigram_left",choices=[]):
        """Pick the option that `lm` scores highest, breaking ties at random.

        For method "bigram_right" each option is scored by the probability of
        the token following the blank given the option's word; for every other
        method by the probability of the option's word given the token before
        the blank. The part of `method` before the first "_" is passed to
        `lm.get_prob` as methodparams["method"].
        """
        if choices==[]:
            choices=["a","b","c","d","e"]
        order=method.split("_")[0]
        probs=[]
        if method=="bigram_right":
            context=self.get_right_context(window=1)
            for ch in choices:
                probs.append(lm.get_prob(context[0],[self.get_field(ch+")")],methodparams={"method":order}))
        else:
            context=self.get_left_context(window=1)
            for ch in choices:
                probs.append(lm.get_prob(self.get_field(ch+")"),context,methodparams={"method":order}))
        top=max(probs)
        tied=[ch for ch,prob in zip(choices,probs) if prob==top]
        return np.random.choice(tied)
    
    

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="bigram_right")

In [None]:
class question(question):
    """Extends the existing question class so that "bigram" combines the
    contexts on both sides of the blank."""

    #a class would not normally inherit from itself like this,
    #but in a jupyter notebook it is a neat way to extend a pre-existing class
    #(the alternative is to redefine the class, copying all of its existing code)

    def choose(self,lm,method="bigram",choices=[]):
        """Pick the option that `lm` scores highest, breaking ties at random.

        Scoring per method:
          * "bigram": probability of the token after the blank given the
            option's word, multiplied by the probability of the option's word
            given the token before the blank;
          * "bigram_right": the first of those two factors only;
          * anything else (this covers bigram_left and unigram): the
            probability of the option's word given the token before the blank.
        The part of `method` before the first "_" is passed to `lm.get_prob`
        as methodparams["method"].
        """
        if choices==[]:
            choices=["a","b","c","d","e"]
        order=method.split("_")[0]
        probs=[]
        if method=="bigram":
            rc=self.get_right_context(window=1)
            lc=self.get_left_context(window=1)
            for ch in choices:
                right_prob=lm.get_prob(rc[0],[self.get_field(ch+")")],methodparams={"method":order})
                left_prob=lm.get_prob(self.get_field(ch+")"),lc,methodparams={"method":order})
                probs.append(right_prob*left_prob)
        elif method=="bigram_right":
            context=self.get_right_context(window=1)
            for ch in choices:
                probs.append(lm.get_prob(context[0],[self.get_field(ch+")")],methodparams={"method":order}))
        else:
            context=self.get_left_context(window=1)
            for ch in choices:
                probs.append(lm.get_prob(self.get_field(ch+")"),context,methodparams={"method":order}))
        top=max(probs)
        tied=[ch for ch,prob in zip(choices,probs) if prob==top]
        return np.random.choice(tied)
    
    

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="bigram")

### Backing off to unigram probs

In [None]:
class question(question):
    """Extends the existing question class with a strategy that backs off from
    one language-model method to another."""

    #you wouldn't normally have a class inherit from itself like this
    #but it is quite a neat way in jupyter notebooks to extend pre-existing classes
    #you could alternatively redefine the class (copying all of the pre-existing class)

    def choose_backoff(self,lm,methods=['bigram','unigram'],choices=["a","b","c","d","e"]):
        """Score the options with methods[0], then let methods[1] decide among the best.

        Parameters:
            lm: object providing get_prob(word, context, methodparams=...).
            methods: two method names; the first scores every option using the
                token to the left of the blank, the second is handed to
                `choose` together with the top-scoring options.
            choices: option letters to consider (the default list is only
                read, never mutated).

        Returns:
            The option letter returned by `choose` for the top-scoring options.
        """
        context=self.get_left_context(window=1)
        probs=[lm.get_prob(self.get_field(ch+")"),context,methodparams={"method":methods[0]}) for ch in choices]
        maxprob=max(probs)
        bestchoices=[ch for ch,prob in zip(choices,probs) if prob == maxprob]
        if len(bestchoices)>1:
            print("Backing off on {}".format(len(bestchoices)))
        return self.choose(lm,choices=bestchoices,method=methods[1])

    def predict(self,method="chooseA",model=mylm):
        """Return the prediction made by `method`.

        "chooseA" and "random" use the baseline strategies, "bigram_backoff"
        uses `choose_backoff` with bigram then unigram, and any other value is
        handed to `choose`. The default for `model` is the object bound to
        `mylm` when this class definition is executed.
        """
        if method=="chooseA":
            return self.chooseA()
        elif method=="random":
            return self.chooserandom()
        # Both language-model branches use the model that was passed in rather
        # than the global mylm, so a caller-supplied model is not silently ignored.
        elif method=="bigram_backoff":
            return self.choose_backoff(model,methods=["bigram","unigram"])
        else:
            return self.choose(model,method=method)

In [None]:
SCC=scc_reader()
SCC.predict_and_score(method="bigram_backoff")

Backing off might not change the decision (the correct answer may not be in the bestchoices given back by the bigram model)

Investigate: 
* the effect of the amount of training data on each of the strategies
* plot the results on a graph - you should see a cross-over (unigram better than bigram for small amounts of training data, but bigram better than unigram for large amounts of training data)

Extend:
* trigram model
* incorporation of distributional similarity / word2vec vectors
* RNNLM ...?