In [104]:
from nltk import word_tokenize as tokenize
import pandas as pd, csv
import operator
import os,random,math
import numpy as np
from gensim.models import KeyedVectors,Word2Vec, FastText
from gensim.downloader import base_dir

In [105]:
filename = r"D:\Study\自然语言工程\fasttext-wiki-news-subwords-300"
mymodel = KeyedVectors.load_word2vec_format(filename)

In [140]:
path = "D:\Study\自然语言工程\高级\week2\lab2resources\lab2resources\sentence-completion\Holmes_Training_Data"

In [173]:
filenames = os.listdir(path)
n = len(filenames)
random.shuffle(filenames)
trainingfiles = filenames[:int(n*0.5)]
heldoutfiles = filenames[int(n*0.5):]

In [174]:
class lanugage_model:
    
    def __init__(self,path,filesize,method):
        self.words = []
        self.unigram = {}
        self.bigram = {}
        self.trigram = {}
        self.quadrigram = {}
        
        self.gram = {}
        
        self.path = path
        self.filesize = filesize
        self.method = method
        
        self.get_words()
        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()
        #self.get_prob()
        
    def get_words(self):

        for file in trainingfiles[:self.filesize]:
            print(f"processing {file}.text ")
            try:
                with open (os.path.join(path,file)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line)>0:             
                            tokens = tokens=["__START"]+tokenize(line)+["__END"]
                            self.words.append(tokens)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(file))
                        
    
    def _processfiles(self):
    
        for i in self.words:
            for j in i:
                self.unigram[j] = 0

        for i in self.words:
            for j in i:
                self.unigram[j] += 1

        if self.method == "bigram":
            for i in self.words:
                for j in range(len(i)-1):
                    self.bigram[i[j]] = {}
            for i in self.words:
                for j in range(len(i)-1):
                    self.bigram[i[j]][i[j+1]] = 0
            for i in self.words:
                for j in range(len(i)-1):
                    self.bigram[i[j]][i[j+1]] += 1
                    
            self.gram = self.bigram
                    
        if self.method == "trigram":
            for i in self.words:
                for j in range(len(i)-2):
                    self.trigram[i[j]] = {}
            for i in self.words:
                for j in range(len(i)-2):
                    self.trigram[i[j]][i[j+1],i[j+2]] = 0
            for i in self.words:        
                 for j in range(len(i)-2):
                    self.trigram[i[j]][i[j+1],i[j+2]] += 1
                    
            self.gram = self.trigram
                    
        if self.method == "quadrigram":
            for i in self.words:
                for j in range(len(i)-3):
                    self.quadrigram[i[j]] = {}
            for i in self.words:
                for j in range(len(i)-3):
                    self.quadrigram[i[j]][i[j+1],i[j+2],i[j+3]] = 0
            for i in self.words:        
                 for j in range(len(i)-3):
                    self.quadrigram[i[j]][i[j+1],i[j+2],i[j+3]] += 1
                    
            self.gram = self.quadrigram
            
            
    def _convert_to_probs(self):
        self.unigram = {k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()} 
        self.gram = {key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.gram.items()}
    
    def get_prob(self,token,context=""):
        
        if self.method == "unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        else:
            gram = self.gram.get(context[-1],self.gram.get("__UNK",{}))
            big_p = gram.get(token,gram.get("__UNK",0))
            
            lmbda = gram["__DISCOUNT"]
            
            uni_p = self.unigram.get(token,self.unigram.get("__UNK",0))
            #print(big_p,lmbda,uni_p)
            p = big_p + lmbda * uni_p            
            return p          
        
    def nextlikely(self,current=""):
        blacklist=["__START","__DISCOUNT"]
       
        if self.method == "unigram":
            dist = self.unigram
        else:
            dist = self.gram.get(current,{})
    
        mostlikely=list(dist.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in blacklist]
        print(current,len(filtered))
        #choose one randomly from the top k
        words,probdist = zip(*filtered)
        res = random.choices(words,probdist)[0]
        return res
    
    def generate(self,end="__END",limit=20):
        current="__START"
        tokens=[]
        while  current != end and len(tokens) < limit:
            current=self.nextlikely(current=current)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
    def compute_prob_line(self,line):
       
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc += np.log(self.get_prob(token,tokens[:i+1]))
        return acc,len(tokens[1:])
    
    def compute_probability(self):
        #computes the probability (and length) of a corpus contained in filenames
        
        total_p=0
        total_N=0
        for i,afile in enumerate(heldoutfiles[:self.filesize]):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(path,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N = self.compute_prob_line(line)
                            total_p += p
                            total_N += N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self):
        
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N = self.compute_probability()
        #print(p,N)
        pp = np.exp(-p/N)
        return pp  
    
    def _make_unknowns(self,known=2):
        unknown = 0
        if self.method == "unigram"or self.method == "bigram":
            for (k,v) in list(self.unigram.items()):
                if v < known:
                    del self.unigram[k]
                    self.unigram["__UNK"] = self.unigram.get("__UNK",0) + v
            for (k,adict) in list(self.gram.items()):
                for (kk,v) in list(adict.items()):
                    isknown = self.unigram.get(kk,0)
                    if isknown == 0:
                        adict["__UNK"] = adict.get("__UNK",0) + v
                        del adict[kk]
                isknown = self.unigram.get(k,0)
                if isknown == 0:
                    del self.gram[k]
                    current = self.gram.get("__UNK",{})
                    current.update(adict)
                    self.gram["__UNK"] = current

                else:
                    self.gram[k] = adict
                    
        if self.method == "trigram"or self.method == "quadrigram":
            for (k,v) in list(self.unigram.items()):
                if v < known:
                    del self.unigram[k]
                    self.unigram["__UNK"] = self.unigram.get("__UNK",0) + v
            for (k,adict) in list(self.gram.items()):
                for (kk,v) in list(adict.items()):
                    for vv in kk:
                        isknown = self.unigram.get(vv,0)
                        dels = False
                        if isknown == 0:
                            adict["__UNK"] = adict.get("__UNK",0) + v
                            dels = True
                    if dels == True:
                        del adict[kk]
                isknown = self.unigram.get(k,0)
                if isknown == 0:
                    del self.gram[k]
                    current = self.gram.get("__UNK",{})
                    current.update(adict)
                    self.gram["__UNK"] = current
                    
                else:
                    self.gram[k] = adict
        
    def _discount(self,discount=0.75):
        #discount each bigram count by a small fixed amount
        self.gram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.gram.items()}
        
        #for each word, store the total amount of the discount so that the total is the same 
        #i.e., so we are reserving this as probability mass
        for k in self.gram.keys():
            lamb = len(self.gram[k])
            self.gram[k]["__DISCOUNT"] = lamb * discount

In [175]:
mylm = lanugage_model(path,10,"trigram")
#mylm.compute_perplexity()

processing TYIFC10.TXT.text 
processing TETHR10.TXT.text 
processing 1BOON10.TXT.text 
processing MARKT10.TXT.text 
processing CDRPR10.TXT.text 
processing MORLL10.TXT.text 
processing EMMA10.TXT.text 
processing SARAC10.TXT.text 
processing TBSCC10.TXT.text 
processing TDITW10.TXT.text 


In [176]:
parentdir =  "D:\Study\自然语言工程\高级\week2\lab2resources\lab2resources\sentence-completion"

questions=os.path.join(parentdir,"testing_data.csv")
answers=os.path.join(parentdir,"test_answer.csv")

In [177]:
with open(questions) as instream:
    csvreader=csv.reader(instream)
    lines=list(csvreader)
qs_df=pd.DataFrame(lines[1:],columns=lines[0])
qs_df.head()

Unnamed: 0,id,question,a),b),c),d),e)
0,1,I have it from the same source that you are bo...,crying,instantaneously,residing,matched,walking
1,2,It was furnished partly as a sitting and partl...,daintily,privately,inadvertently,miserably,comfortably
2,3,"As I descended , my old ally , the _____ , cam...",gods,moon,panther,guard,country-dance
3,4,"We got off , _____ our fare , and the trap rat...",rubbing,doubling,paid,naming,carrying
4,5,"He held in his hand a _____ of blue paper , sc...",supply,parcel,sign,sheet,chorus


In [178]:
def get_left_context(sent_tokens,window,target="_____"):
    found=-1
    for i,token in enumerate(sent_tokens):
        if token==target:
            found=i
            break 
            
    if found>-1:
        return sent_tokens[i-window:i]
    else:
        return []

In [179]:
qs_df['tokens']=qs_df['question'].map(tokenize)
qs_df['left_context']=qs_df['tokens'].map(lambda x: get_left_context(x,2))

In [180]:
qs_df.to_csv("D:\Study\自然语言工程\高级\week2\lab2resources\lab2resources\sentence-completion\data.csv")

In [181]:
questions=os.path.join(parentdir,"data.csv")

In [182]:
class question:
    
    def __init__(self,aline):
        self.fields=aline
    
    def get_field(self,field):
        return self.fields[question.colnames[field]]
    
    def get_context(self,field):
        left = eval(self.fields[question.colnames["left_context"]])
        option = self.fields[question.colnames[field]]
        left.append(option)
        return left
        
    def add_answer(self,fields):
        self.answer=fields[1]
   
    def chooseA(self):
        return("a")
    
    def chooserandom(self):
        choices=["a","b","c","d","e"]
        return np.random.choice(choices)
    
    def chooseunigram(self,lm):
        choices=["a","b","c","d","e"]      
        probs=[lm.unigram.get(self.get_field(ch+")"),0) for ch in choices]
        maxprob=max(probs)
        bestchoices=[ch for ch,prob in zip(choices,probs) if prob == maxprob]
        #if len(bestchoices)>1:
        #    print("Randomly choosing from {}".format(len(bestchoices)))
        return np.random.choice(bestchoices)
    
    def choosebigram(self,lm):
        choices = ["a","b","c","d","e"]
        probs_total = []
        for ch in choices:
            context = self.get_context(ch+")")
            awnser_probs = lm.gram.get(context[1],{})#获取答案前一个词相关概率的所有词
            vocab_bigram = sorted(awnser_probs.items(),key=lambda x:x[1],reverse =True)#根据概率从大到小排列
            for i in vocab_bigram:
                try:
                    prob = mymodel.similarity(i[0],context[2])
                    probs_total.append(prob)
                    break
                    
                except KeyError:
                    continue
        try:
            maxprob = max(probs_total)
            bestchoices=[ch for ch,prob in zip(choices,probs_total) if prob == maxprob]
            return np.random.choice(bestchoices)
        except ValueError: 
            return np.random.choice(choices)

    def choosetrigram(self,lm):
        choices = ["a","b","c","d","e"]
        probs_total = []
        for ch in choices:
            context = self.get_context(ch+")")
            awnser_probs = lm.gram.get(context[0],{})#获取答案前一个词相关概率的所有词
            vocab_bigram = sorted(awnser_probs.items(),key=lambda x:x[1],reverse =True)#根据概率从大到小排列
            #遍历和前一个词相关的所有二元词组，如果
            for i in vocab_bigram:
                try:
                    if i[0][0] == context[1]:
                        prob = mymodel.similarity(i[0][1],context[2])
                        probs_total.append(prob)
                        break
                    
                except KeyError:
                    continue
        try:
            maxprob = max(probs_total)
            bestchoices=[ch for ch,prob in zip(choices,probs_total) if prob == maxprob]
            return np.random.choice(bestchoices)
        except ValueError: 
                return np.random.choice(choices)

    def choosequadrigram(self,lm):
        choices = ["a","b","c","d","e"]
        probs_total = []
        for ch in choices:
            context = self.get_context(ch+")")
            prob = lm.gram.get(context[0],{}).get((context[1],context[2],context[3]),0)
            probs_total.append(prob)
        maxprob = max(probs_total)
        bestchoices=[ch for ch,prob in zip(choices,probs_total) if prob == maxprob]
        return np.random.choice(bestchoices)
    
    def predict(self,method="chooseA",lm=mylm):
        #eventually there will be lots of methods to choose from
        if method=="chooseA":
            return self.chooseA()
        elif method=="random":
            return self.chooserandom()
        elif method=="unigram":
            return self.chooseunigram(lm=lm)
        elif method=="bigram":
            return self.choosebigram(lm=lm)
        elif method=="trigram":
            return self.choosetrigram(lm=lm)
        elif method=="quadrigram":
            return self.choosequadrigram(lm=lm)
        
    def predict_and_score(self,method="chooseA"):
        
        #compare prediction according to method with the correct answer
        #return 1 or 0 accordingly
        prediction=self.predict(method=method)
        if prediction ==self.answer:
            return 1
        else:
            return 0

In [183]:
class scc_reader:
    
    def __init__(self,qs=questions,ans=answers):
        self.qs=qs
        self.ans=ans
        self.read_files()
        
    def read_files(self):
        
        #read in the question file
        with open(self.qs) as instream:
            csvreader=csv.reader(instream)
            qlines=list(csvreader)
        
        #store the column names as a reverse index so they can be used to reference parts of the question
        question.colnames={item:i for i,item in enumerate(qlines[0])}
        
        #create a question instance for each line of the file (other than heading line)
        self.questions=[question(qline) for qline in qlines[1:]]
        
        #read in the answer file
        with open(self.ans) as instream:
            csvreader = csv.reader(instream)
            alines=list(csvreader)
            
        #add answers to questions so predictions can be checked    
        for q,aline in zip(self.questions,alines[1:]):
            q.add_answer(aline)
        
    def get_field(self,field):
        return [q.get_field(field) for q in self.questions] 
    
    def predict(self,method="chooseA"):
        return [q.predict(method=method) for q in self.questions]
    
    def predict_and_score(self,method="chooseA"):
        scores=[q.predict_and_score(method=method) for q in self.questions]
        return sum(scores)/len(scores)

In [184]:
SCC = scc_reader()

In [185]:
SCC.predict_and_score(method="unigram")

0.2519230769230769

In [186]:
#score_bigram = SCC.predict_and_score(method="bigram")
#print(score_bigram)

In [187]:
score_trigram = SCC.predict_and_score(method="trigram")
print(score_trigram)

0.24519230769230768


In [188]:
with open("Wordvec.txt","a+") as f:
    f.write(f"WikiNews-vectors trigram score: {score_trigram}  ")