# Advanced Natural Language Engineering - Assignment 1

This assignment asks us to compete in the Microsoft Research Sentence Completion Challenge (MRSCC; Zweig and Burges, 2011), which requires a system to predict the most likely word (from a set of 5 candidates) to complete a sentence.

There are 4 different methods that will be compared in this challenge:

1.   Unigram model
2.   Bigram model
3. Maximum Entropy model (Logistic Regression)
4. RoBERTa model

### Loading challenge data

For this challenge we are provided with:

1.   A training corpus of 19th century novels data (522 files)
2.   1040 sentences with one missing word and 5 options to choose from

This dataset was constructed from Project Gutenberg data. Seed sentences were selected from five of Sir
Arthur Conan Doyle’s Sherlock Holmes novels, and then imposter words were suggested with the
aid of a language model trained on over 500 19th century novels. The strategy for competing in this challenge will be to create training and validation data from the complete corpus. This will then help us make predictions in the unseen MRSCC challenge data.

In [None]:
%%capture
!pip install nltk
import nltk
nltk.download('punkt')
import math
import operator
import os
import random

from nltk import word_tokenize as tokenize

In [None]:
mrscc_dir = '/content/drive/MyDrive/university/2021/ANLE/lab2resources/sentence-completion'

def get_train_val(training_dir=mrscc_dir,split=0.2):
    """Shuffle the files in ``training_dir`` and cut the list in two.

    Args:
        training_dir: Directory whose entries are listed and split.
        split: Fraction of the files placed in the FIRST returned list.

    Returns:
        A tuple ``(first, rest)`` where ``first`` holds the first
        ``int(n * split)`` shuffled file names and ``rest`` the remainder.

    Side effects:
        Prints the number of files found and re-seeds the global ``random``
        generator with 7.
    """
    # os.listdir returns entries in arbitrary order, so sort before the seeded
    # shuffle; otherwise the "same random split every time" promise only holds
    # on a single machine/filesystem.
    filenames = sorted(os.listdir(training_dir))
    n = len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(7)  # fixed seed: same split on every run
    random.shuffle(filenames)
    index = int(n * split)
    return (filenames[:index], filenames[index:])

# Directory holding the novels used as the training corpus.
trainingdir = os.path.join(mrscc_dir, "Holmes_Training_Data")
# NOTE: get_train_val puts the `split` fraction (0.2 by default) of the files
# in the FIRST list, so `training` is the smaller share and `testing` the rest.
training, testing = get_train_val(trainingdir)

There are 522 files in the training directory: /content/drive/MyDrive/university/2021/ANLE/lab2resources/sentence-completion/Holmes_Training_Data


### 1. Unigram & Bigram model

This language model has been taken and adapted from the ANLE lab resources.

In [None]:
class language_model():
    """Unigram and bigram language model trained on a set of text files.

    Training runs from the constructor. It counts unigrams and bigrams line by
    line, folds rare tokens into ``"__UNK"``, applies absolute discounting to
    the bigram counts and finally normalises all counts into probabilities.

    Attributes set by training:
        unigram: token -> probability.
        bigram: context token -> {next token -> discounted probability}. Every
            inner dict also carries a ``"__DISCOUNT"`` entry holding the
            probability mass reserved for backing off.
        kn: token -> Kneser-Ney continuation probability, i.e. the share of
            distinct bigram types that end in that token.
    """

    def __init__(self, trainingdir=mrscc_dir, files=None):
        """Store the corpus location and train immediately.

        Args:
            trainingdir: Directory containing the training files.
            files: File names (relative to ``trainingdir``) to train on.
                ``None`` means no files.
        """
        self.training_dir = trainingdir
        # A fresh list per instance avoids sharing one mutable default.
        self.files = [] if files is None else files
        self.train()

    def train(self):
        """(Re)build all counts and probabilities from ``self.files``."""
        self.unigram = {}
        self.bigram = {}

        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()

    def _processline(self, line, i):
        """Add the unigram and bigram counts of one line of text.

        Args:
            line: Raw text of the line.
            i: Index of the file being processed (unused, kept so existing
                callers keep working).
        """
        tokens = ["__START"] + tokenize(line) + ["__END"]
        # Every line is treated as following an "__END", so the pair
        # ("__END", "__START") is counted once per line.
        previous = "__END"
        for token in tokens:
            self.unigram[token] = self.unigram.get(token, 0) + 1
            current = self.bigram.get(previous, {})
            current[token] = current.get(token, 0) + 1
            self.bigram[previous] = current
            previous = token

    def _processfiles(self):
        """Count every non-empty line of every training file.

        A file that cannot be decoded is abandoned at the failing point; the
        lines counted before the error are kept.
        """
        for i, afile in enumerate(self.files):
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line, i)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))
        print('file processing complete')

    def _convert_to_probs(self):
        """Normalise unigram, bigram and Kneser-Ney counts into probabilities."""
        # Each total is computed once; recomputing the sum for every entry
        # makes normalisation quadratic in the vocabulary size.
        unigram_total = sum(self.unigram.values())
        self.unigram = {k: v / unigram_total for (k, v) in self.unigram.items()}

        normalised = {}
        for key, adict in self.bigram.items():
            context_total = sum(adict.values())
            normalised[key] = {k: v / context_total for (k, v) in adict.items()}
        self.bigram = normalised

        kn_total = sum(self.kn.values())
        self.kn = {k: v / kn_total for (k, v) in self.kn.items()}

    def get_prob(self, token, context="", methodparams=None):
        """Return the probability of ``token`` under the selected model.

        Args:
            token: Token to score.
            context: Sequence of preceding tokens; only the last one is used.
            methodparams: Options dict. ``"method"`` selects ``"unigram"``
                (default) or anything else for the bigram model;
                ``"smoothing"`` selects ``"kneser-ney"`` (default) or anything
                else to back off to the plain unigram distribution.

        Returns:
            For the unigram method, the unigram probability (falling back to
            ``"__UNK"``). For the bigram method,
            ``P_discounted(token | previous) + reserved_mass * P_backoff(token)``.
        """
        methodparams = {} if methodparams is None else methodparams
        if methodparams.get("method", "unigram") == "unigram":
            return self.unigram.get(token, self.unigram.get("__UNK", 0))

        if methodparams.get("smoothing", "kneser-ney") == "kneser-ney":
            unidist = self.kn
        else:
            unidist = self.unigram
        uni_p = unidist.get(token, unidist.get("__UNK", 0))

        # No history at all: nothing to condition on, so use the back-off
        # distribution on its own instead of failing on context[-1].
        if not context:
            return uni_p
        bigram = self.bigram.get(context[-1], self.bigram.get("__UNK", {}))
        # Unseen context and no "__UNK" row either: same full back-off.
        if not bigram:
            return uni_p
        big_p = bigram.get(token, bigram.get("__UNK", 0))
        lmbda = bigram["__DISCOUNT"]
        return big_p + lmbda * uni_p

    def nextlikely(self, k=1, current="", method="unigram"):
        """Pick a random token among the ``k`` most probable next tokens.

        Args:
            k: Size of the candidate pool taken from the top of the ranking.
            current: Context token, used only when ``method`` is not
                ``"unigram"``.
            method: ``"unigram"`` ranks by unigram probability; anything else
                ranks by the bigram row of ``current`` (or of ``"__UNK"``).

        Raises:
            IndexError: If no candidate is left after filtering.
        """
        blacklist = ["__START", "__UNK", "__DISCOUNT"]

        if method == "unigram":
            dist = self.unigram
        else:
            dist = self.bigram.get(current, self.bigram.get("__UNK", {}))
        # Rank by probability, drop bookkeeping tokens, sample from the top k.
        mostlikely = sorted(dist.items(), key=operator.itemgetter(1), reverse=True)
        filtered = [w for (w, p) in mostlikely if w not in blacklist]
        return random.choice(filtered[:k])

    def generate(self, k=1, end="__END", limit=20, method="bigram", methodparams=None):
        """Generate text token by token, starting from ``"__START"``.

        Generation stops when ``end`` is produced or ``limit`` tokens have been
        generated. The last generated token is always dropped from the result.

        Args:
            k: Passed to :meth:`nextlikely`.
            end: Token that terminates generation.
            limit: Maximum number of tokens to generate.
            method: ``"unigram"`` or ``"bigram"``; an empty string means "read
                it from ``methodparams``".
            methodparams: Options dict consulted only when ``method`` is ``""``.
        """
        if method == "":
            methodparams = {} if methodparams is None else methodparams
            method = methodparams.get("method", "bigram")
        current = "__START"
        tokens = []
        while current != end and len(tokens) < limit:
            current = self.nextlikely(k=k, current=current, method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])

    def compute_prob_line(self, line, methodparams=None):
        """Return ``(log probability, token count)`` for one line of text.

        ``"__START"`` is prepended as context only; ``"__END"`` is scored and
        counted as a token.

        Raises:
            ValueError: If any token has probability 0 (``math.log(0)``).
        """
        methodparams = {} if methodparams is None else methodparams
        tokens = ["__START"] + tokenize(line) + ["__END"]
        acc = 0
        for i, token in enumerate(tokens[1:]):
            acc += math.log(self.get_prob(token, tokens[:i + 1], methodparams))
        return acc, len(tokens[1:])

    def compute_probability(self, filenames=None, methodparams=None):
        """Return ``(total log probability, total token count)`` of a corpus.

        Args:
            filenames: Files (relative to the training directory) to score;
                empty or ``None`` means the training files.
            methodparams: Options forwarded to :meth:`get_prob`.
        """
        methodparams = {} if methodparams is None else methodparams
        if not filenames:
            filenames = self.files
        total_p = 0
        total_N = 0
        for i, afile in enumerate(filenames):
            print("Processing file {}:{}".format(i, afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            p, N = self.compute_prob_line(line, methodparams=methodparams)
                            total_p += p
                            total_N += N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p, total_N

    def compute_perplexity(self, filenames=None, methodparams=None):
        """Return the perplexity of a corpus (lower means a better fit).

        ``methodparams`` defaults to the Kneser-Ney smoothed bigram model.

        Raises:
            ZeroDivisionError: If the corpus contains no tokens.
        """
        if methodparams is None:
            methodparams = {"method": "bigram", "smoothing": "kneser-ney"}
        p, N = self.compute_probability(filenames=filenames, methodparams=methodparams)
        return math.exp(-p / N)

    def compute_line_perplexity(self, line):
        """Return the perplexity of one line under the default (unigram) model."""
        line_prob, line_len = self.compute_prob_line(line)
        return math.exp(-line_prob / line_len)

    def _make_unknowns(self, known=2):
        """Fold tokens seen fewer than ``known`` times into ``"__UNK"``.

        Rare tokens are replaced by ``"__UNK"`` in the unigram counts, in the
        follower position of every bigram row, and in the context position,
        where the rows of all rare contexts are merged into one.
        """
        for (k, v) in list(self.unigram.items()):
            if v < known:
                del self.unigram[k]
                self.unigram["__UNK"] = self.unigram.get("__UNK", 0) + v

        for (k, adict) in list(self.bigram.items()):
            # Followers that are no longer in the vocabulary become "__UNK".
            for (kk, v) in list(adict.items()):
                if self.unigram.get(kk, 0) == 0:
                    adict["__UNK"] = adict.get("__UNK", 0) + v
                    del adict[kk]
            # Contexts that are no longer in the vocabulary are merged into
            # the "__UNK" row. Counts are SUMMED: dict.update would keep only
            # the last rare context's count for each follower.
            if self.unigram.get(k, 0) == 0:
                del self.bigram[k]
                merged = self.bigram.setdefault("__UNK", {})
                for (kk, v) in adict.items():
                    merged[kk] = merged.get(kk, 0) + v

    def _discount(self, discount=0.75):
        """Apply absolute discounting and build Kneser-Ney continuation counts.

        Each bigram count is reduced by ``discount``. The mass removed from a
        row is stored in that row under ``"__DISCOUNT"``, so the row total is
        unchanged.
        """
        self.bigram = {
            k: {kk: value - discount for (kk, value) in adict.items()}
            for (k, adict) in self.bigram.items()
        }

        # Kneser-Ney continuation counts: in how many distinct contexts each
        # token was seen. Built BEFORE "__DISCOUNT" is inserted so that the
        # bookkeeping key does not take a share of the distribution.
        self.kn = {}
        for adict in self.bigram.values():
            for kk in adict:
                self.kn[kk] = self.kn.get(kk, 0) + 1

        # Reserve the discounted mass: one `discount` per distinct follower.
        for adict in self.bigram.values():
            adict["__DISCOUNT"] = len(adict) * discount

In [None]:
# Upper bound on how many of the shuffled training files the model reads.
MAX_FILES = 20
mylm = language_model(
    trainingdir=trainingdir,
    files=training[:MAX_FILES],
)

UnicodeDecodeError processing TNGLW10.TXT: ignoring rest of file
file processing complete


In [None]:
# (token, probability) pairs of the unigram model, most probable first.
vocab = sorted(mylm.unigram.items(), key=operator.itemgetter(1), reverse=True)
# vocab[:20]
# mylm.bigram

In [None]:
import pandas as pd
import csv

# Challenge sentences and their gold answers, stored next to the training data.
questions = pd.read_csv(os.path.join(mrscc_dir, "testing_data.csv"))
answers = pd.read_csv(os.path.join(mrscc_dir, "test_answer.csv"))

# Drop the trailing ")" from the five option column headers.
questions.rename(
    columns={'a)':'a', 'b)':'b', 'c)':'c', 'd)':'d', 'e)':'e'},
    inplace=True,
)
questions.head()

Unnamed: 0,id,question,a,b,c,d,e
0,1,I have it from the same source that you are bo...,crying,instantaneously,residing,matched,walking
1,2,It was furnished partly as a sitting and partl...,daintily,privately,inadvertently,miserably,comfortably
2,3,"As I descended , my old ally , the _____ , cam...",gods,moon,panther,guard,country-dance
3,4,"We got off , _____ our fare , and the trap rat...",rubbing,doubling,paid,naming,carrying
4,5,"He held in his hand a _____ of blue paper , sc...",supply,parcel,sign,sheet,chorus


In [None]:
class question:
    """One sentence-completion question and the strategies used to answer it.

    A question holds a sentence containing a blank, five candidate words keyed
    ``"a"`` to ``"e"``, and a language model exposing ``unigram`` and ``bigram``
    dictionaries. Every ``choose*`` strategy returns the key of the candidate
    it selects, except the two bigram helpers, which return ranked scores.
    """

    def __init__(self, aline, lm):
        """Build a question from one row of the questions table.

        Args:
            aline: Row read positionally: index 1 is the sentence, indices 2
                onwards are the candidate words. Any iterable works (list,
                tuple, pandas Series).
            lm: Language model with ``unigram`` and ``bigram`` dict attributes.
        """
        # Materialise the row so it is indexed purely by position; integer
        # keys on a label-indexed pandas Series are deprecated.
        fields = list(aline)
        self.sentence = fields[1]
        self.choices = ["a", "b", "c", "d", "e"]
        self.word_choices = dict(zip(self.choices, fields[2:]))
        self.lm = lm

    def add_answer(self, fields):
        """Record the correct choice key, read from position 1 of ``fields``."""
        self.answer = list(fields)[1]

    def chooseA(self):
        """Baseline: always answer ``"a"``."""
        return "a"

    def chooseRandom(self):
        """Baseline: answer with a uniformly random choice key."""
        return random.choice(self.choices)

    def chooseUnigram(self):
        """Return the key of the candidate with the highest unigram probability.

        Candidates missing from the model score 0; ties go to the earliest key.
        """
        probabilities = {
            key: self.lm.unigram.get(word, 0)
            for key, word in self.word_choices.items()
        }
        return max(probabilities, key=probabilities.get)

    def chooseBigramLeft(self, left_word):
        """Score each candidate as the word following ``left_word``.

        Returns:
            ``{choice key: bigram value}`` sorted best first, or ``{}`` when
            the model has no bigram row for ``left_word``.
        """
        followers = self.lm.bigram.get(left_word)
        if followers is None:
            return {}
        scores = {key: followers.get(word, 0) for key, word in self.word_choices.items()}
        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    def chooseBigramRight(self, right_word):
        """Score each candidate as the word preceding ``right_word``.

        Candidates with no bigram row of their own are left out.

        Returns:
            ``{choice key: bigram value}`` sorted best first.
        """
        scores = {}
        for key, word in self.word_choices.items():
            followers = self.lm.bigram.get(word)
            if followers is not None:
                scores[key] = followers.get(right_word, 0)
        return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

    @staticmethod
    def _neighbours(context, target="_____"):
        """Return the tokens just before and after ``target`` (``None`` if absent)."""
        if target not in context:
            return None, None
        pos = context.index(target)
        before = context[pos - 1] if pos > 0 else None
        after = context[pos + 1] if pos + 1 < len(context) else None
        return before, after

    def chooseBigram(self, left, right):
        """Answer using bigram evidence from the words around the blank.

        Args:
            left: Truthy to use the word before the blank.
            right: Truthy to use the word after the blank.

        Returns:
            The key of the best candidate. Falls back to :meth:`chooseUnigram`
            when no side is requested, when a requested side has no scores,
            or (single-side modes only) when the best score is 0.
        """
        # Neither side requested (the default when called through predict):
        # there is no bigram evidence to rank, so use the unigram model.
        if not left and not right:
            return self.chooseUnigram()

        context = self.get_window_context(self.sentence, 1, 1)
        # The blank may be the first or last token, so find the neighbours
        # relative to it instead of assuming a three-token window.
        left_word, right_word = self._neighbours(context)

        left_scores = self.chooseBigramLeft(left_word) if left else {}
        right_scores = self.chooseBigramRight(right_word) if right else {}

        if left and right:
            if not left_scores or not right_scores:
                return self.chooseUnigram()
            combined = {
                key: prob * right_scores[key]
                for key, prob in left_scores.items()
                if key in right_scores
            }
            # max keeps the first maximum, so ties are broken by the
            # left-context ranking that `combined` was built in.
            return max(combined, key=combined.get)

        scores = left_scores if left else right_scores
        if not scores:
            return self.chooseUnigram()
        best = next(iter(scores))  # scores are already sorted best first
        if scores[best] == 0:
            return self.chooseUnigram()
        return best

    def predict_and_score(self, method="chooseA", left=None, right=None):
        """Return 1 if the prediction equals the recorded answer, else 0."""
        prediction = self.predict(method, left, right)
        return 1 if prediction == self.answer else 0

    def get_window_context(self, sent_tokens, window_left, window_right, target="_____"):
        """Return the tokens surrounding the first occurrence of ``target``.

        Args:
            sent_tokens: The sentence as a string; it is tokenized here.
            window_left: Number of tokens to keep before the target.
            window_right: Number of tokens to keep after the target.
            target: Token marking the blank.

        Returns:
            The window including the target itself, shortened at sentence
            boundaries, or ``[]`` when the target is not found.
        """
        tokens = tokenize(sent_tokens)
        try:
            i = tokens.index(target)
        except ValueError:
            return []
        # Clamp the start: a negative start would index from the end of the
        # sentence and return the wrong (usually empty) slice.
        start = max(0, i - window_left)
        return tokens[start:i + 1 + window_right]

    def predict(self, method="chooseA", left=None, right=None):
        """Dispatch to the strategy named by ``method``.

        Known methods: ``"chooseA"``, ``"random"``, ``"unigram"``, ``"bigram"``.
        ``left`` and ``right`` are used by ``"bigram"`` only.

        Returns:
            The chosen key, or ``None`` for an unknown method.
        """
        if method == "chooseA":
            return self.chooseA()
        if method == "random":
            return self.chooseRandom()
        if method == "unigram":
            return self.chooseUnigram()
        if method == "bigram":
            return self.chooseBigram(left, right)
        return None
          

In [None]:
class scc_reader:
    """Holds every challenge question with its answer and scores strategies."""

    def __init__(self, lm, qs=questions, ans=answers):
        """Build one ``question`` per row of ``qs`` and attach the answers.

        Args:
            lm: Language model handed to every question.
            qs: Table of questions, indexed positionally with ``.iloc``.
            ans: Table of answers, row-aligned with ``qs``.
        """
        self.qs = qs
        self.ans = ans
        self.lm = lm
        self.read_files()

    def read_files(self):
        """Create the question objects and attach the answers to them."""
        # Read from the tables given to the constructor, not the module-level
        # globals, so a reader built on other data uses that data.
        self.questions = [question(self.qs.iloc[i], self.lm) for i in range(len(self.qs))]
        # Attach the answers so that predictions can be checked.
        for i, q in enumerate(self.questions):
            q.add_answer(self.ans.iloc[i])

    def get_field(self, field):
        """Return ``field`` for every question.

        Uses the question's ``get_field`` method when it has one; otherwise
        reads the attribute named ``field`` (e.g. ``"sentence"``, ``"answer"``),
        because ``question`` as defined in this file has no such method.
        """
        return [
            q.get_field(field) if hasattr(q, "get_field") else getattr(q, field)
            for q in self.questions
        ]

    def predict(self, method="chooseA", left=None, right=None):
        """Return the predicted choice key of every question.

        ``left`` and ``right`` are forwarded to the questions, as in
        :meth:`predict_and_score`.
        """
        return [q.predict(method=method, left=left, right=right) for q in self.questions]

    def predict_and_score(self, method="chooseA", left=None, right=None):
        """Return the share of questions answered correctly.

        Returns 0.0 when there are no questions.
        """
        scores = [q.predict_and_score(method, left, right) for q in self.questions]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)

In [None]:
# Pair every challenge question with the model trained above.
SCC = scc_reader(mylm)

In [None]:
# Baseline: accuracy when one of the five options is picked at random.
SCC.predict_and_score(method='random')

0.2173076923076923

In [None]:
# (output template, arguments for predict_and_score), printed in this order.
score_runs = [
    ('random {}', ('random',)),
    ('unigram {}', ('unigram',)),
    ('bigram left {}', ("bigram", 1, 0)),
    ('bigram right {}', ("bigram", 0, 1)),
    ('bigram both {}', ("bigram", 1, 1)),
]
for template, run_args in score_runs:
    print(template.format(SCC.predict_and_score(*run_args)))

random 0.20384615384615384
unigram 0.24711538461538463
bigram left 0.2605769230769231
bigram right 0.2048076923076923
bigram both 0.23653846153846153


In [None]:
def training_data_by_accuracy(max_iters):
    """Retrain on growing numbers of files and record each method's accuracy.

    Args:
        max_iters: Iterable of file counts. For each count a fresh model is
            trained on that many of the first training files.

    Returns:
        Dict mapping a method label to a list of accuracies, one per count,
        in the order given.
    """
    # label -> positional arguments for scc_reader.predict_and_score
    strategies = {
        'unigram': ('unigram',),
        'bigram left': ('bigram', 1, 0),
        'bigram right': ('bigram', 0, 1),
        'bigram both': ('bigram', 1, 1),
    }
    accuracy_per_model = {label: [] for label in strategies}
    for n_files in max_iters:
        lm = language_model(trainingdir=trainingdir, files=training[:n_files])
        reader = scc_reader(lm)
        for label, run_args in strategies.items():
            accuracy_per_model[label].append(reader.predict_and_score(*run_args))
    return accuracy_per_model

In [None]:
# Accuracy of each method when training on 1, 5, 10 and 100 files.
test = training_data_by_accuracy(max_iters=[1, 5, 10, 100])

file processing complete
file processing complete
UnicodeDecodeError processing TNGLW10.TXT: ignoring rest of file
file processing complete
UnicodeDecodeError processing TNGLW10.TXT: ignoring rest of file
UnicodeDecodeError processing HHOHG10.TXT: ignoring rest of file
file processing complete


In [None]:
# Display the accuracy lists collected per method (one entry per training size).
test

{'bigram both': [0.20673076923076922,
  0.2326923076923077,
  0.2375,
  0.26826923076923076],
 'bigram left': [0.2201923076923077,
  0.26634615384615384,
  0.2798076923076923,
  0.27403846153846156],
 'bigram right': [0.2144230769230769,
  0.23173076923076924,
  0.21153846153846154,
  0.20865384615384616],
 'unigram': [0.21923076923076923,
  0.2605769230769231,
  0.2548076923076923,
  0.2567307692307692]}