In [2]:
import os, csv, random
from nltk import word_tokenize as tokenize

parentdir = "data"

questions=os.path.join(parentdir,"testing_data.csv")
answers=os.path.join(parentdir,"test_answer.csv")

class question:
    
    def __init__(self,aline):
        self.fields=aline
        self.anskeys = ["a)","b)","c)","d)","e)"]
    
    def get_field(self,field):
        return self.fields[question.colnames[field]]
    
    def add_answer(self,fields):
        self.fields+=fields[1]


    def set_context(self,direction,window=1,target="_____"):
        found=-1
        direction = direction.lower()[0]
        sent_tokens = tokenize(self.get_field("question"))
        for i,token in enumerate(sent_tokens):
            if token==target:
                found=i
                break
        if found>-1:
            question.colnames["context "+direction] = question.colnames.get("context "+direction,len(question.colnames))
            if direction == "l":
                self.fields+=sent_tokens[i-window:i]
            if direction == "r":
                self.fields+=sent_tokens[i+1:i+window+1]
    
    def make_sentence(self,answer):
        q = self.get_field("question")
        return q.replace("_____",answer)
        
        
class scc_reader:
    
    def __init__(self,qs,ans):
        self.qs=qs
        self.ans=ans
        self.read_files()
        
    def read_files(self):
        
        #read in the question file
        with open(self.qs) as instream:
            csvreader=csv.reader(instream)
            qlines=list(csvreader)
        
        #store the column names as a reverse index so they can be used to reference parts of the question
        question.colnames={item:i for i,item in enumerate(qlines[0])}
        question.colnames["answer"] = len(question.colnames)
        
        #create a question instance for each line of the file (other than heading line)
        self.questions=[question(qline) for qline in qlines[1:]]
        
        #read in the answer file
        with open(self.ans) as instream:
            csvreader=csv.reader(instream)
            alines=list(csvreader)
            
        #add answers to questions so predictions can be checked    
        for q,aline in zip(self.questions,alines[1:]):
            q.add_answer(aline)
        
    def get_field(self,field):
        return [q.get_field(field) for q in self.questions] 


In [3]:
scc = scc_reader(questions,answers)

In [51]:
scc.questions[0].get_field("question")

'I have it from the same source that you are both an orphan and a bachelor and are _____ alone in London.'

In [57]:
scc.questions[0].set_context("l")

In [5]:
q = scc.questions[0].get_field("question")
q.replace("_____","hi")

'I have it from the same source that you are both an orphan and a bachelor and are hi alone in London.'

In [50]:
scc.questions[0].fields

['1',
 'I have it from the same source that you are both an orphan and a bachelor and are _____ alone in London.',
 'crying',
 'instantaneously',
 'residing',
 'matched',
 'walking',
 'c',
 'are',
 'alone']

In [53]:
scc.questions[0].colnames

{'id': 0,
 'question': 1,
 'a)': 2,
 'b)': 3,
 'c)': 4,
 'd)': 5,
 'e)': 6,
 'answer': 8,
 'context l': 9,
 'context r': 10}

In [4]:
from datetime import datetime, date, time, timedelta
import re, copy, pdb


class ResultsLogParser:

    def __init__(self, log_path=r"results.log"):
        self.ops = 0
        self.log_path = log_path
        self.filtlist = self.get_all()
        self.history = []
        self.since_when_dict = {"today": datetime.combine(date.today(), time()),
                                "yesterday": datetime.combine(date.today() - timedelta(1), time()),
                                "start": datetime(2020, 3, 9, 0, 0),
                                "monday": self._get_weekday("monday"),
                                "tuesday": self._get_weekday("tuesday"),
                                "wednesday": self._get_weekday("wednesday"),
                                "thursday": self._get_weekday("thursday"),
                                "friday": self._get_weekday("friday"),
                                "saturday": self._get_weekday("saturday"),
                                "sunday": self._get_weekday("sunday")}

    def get_all(self):
        with open(self.log_path, "r") as filtlist:
            all_results = [line for line in filtlist]
        self.filtlist = all_results

    def _manage_filtlist(self, filtlist=None):
        close = None
        if filtlist is None:
            filtlist = self.filtlist
            if filtlist is not None:
                self.history.append(copy.deepcopy(filtlist))
            else:
                filtlist = open(self.log_path, "r")
                close = True
        return filtlist, close

    def _split_line(self, line):
        messages = line.split(r' | ')
        return messages

    def _dirtystring_to_list(self, dirtystring):
        m = re.search(r'\[([^]]*)\]', dirtystring)
        cleanstring = m.group(0)
        cleanstring = cleanstring.replace("'", '')
        cleanlist = cleanstring.strip('][').split(', ')
        results = []
        for item in cleanlist:
            if item.isnumeric():
                item = int(item)
            elif item.replace('.', '', 1).isdigit():
                item = float(item)
            else:
                item = str(item)
            results.append(item)
        return results

    def filter_by_model(self, model, filtlist=None):

        results = []
        filtlist, close = self._manage_filtlist(filtlist=filtlist)

        for line in filtlist:
            messages = self._split_line(line)
            if model in messages[1]:
                results.append(line)
        if close:
            filtlist.close()

        self.filtlist = results
        self.ops += 1

    def undo_steps(self, steps):
        if type(steps) == int:
            if steps > 0:
                if steps <= self.ops:
                    self.history = self.history[:self.ops - steps]
                    self.ops -= steps
                else:
                    return print("Too many steps back")
            else:
                if abs(steps) < self.ops:
                    self.history = self.history[:abs(steps)]
                    self.ops = abs(steps)
                else:
                    return print("Too many steps forward from start")
        elif type(steps) == str:
            if steps in ["cl", "clear", "all"]:
                self.get_all()
                self.history = []
                self.ops = 0
                return

        self.filtlist = copy.deepcopy(self.history[-1])

    def get_as_list(self, keyword, filtlist=None):
        results = []
        idx = self._find_index(keyword)
        filtlist, close = self._manage_filtlist(filtlist=filtlist)
        for line in filtlist:
            try:
                stringlist = self._split_line(line)[idx]
            except IndexError:
                continue
            reslist = self._dirtystring_to_list(stringlist)
            results += reslist
        if close:
            filtlist.close()
        try:
            return sorted(results)
        except TypeError:
            return results

    def _get_weekday(self, d):
        weekdays = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
        twd = weekdays.index(d)  # get number of target weekday
        cwd = datetime.today().weekday()  # get number of current weekday
        td = abs(abs(cwd - twd) - 7)  # calculate time delta
        return datetime.combine(date.today() - timedelta(td), time())

    def _find_index(self,
                    search_terms):  # find index of message component, added to give more flexibility in log construction
        idx = None
        if not isinstance(search_terms, list):
            search_terms = [search_terms]
        found = False
        with open(self.log_path, "r") as logfile:
            while not found:
                line = next(logfile)
                messages = self._split_line(line)
                if len(messages) > 2:
                    found = True
            for i, message in enumerate(messages):
                if any(substring in message for substring in search_terms):
                    idx = i
                    break
        return idx

    def filter_by_time(self, *args, hours_ago=None, since_when=None, filtlist=None):
        if len(args) == 1:
            compare_dt = datetime(
                *args[0])  # insert specific datetime in the format expected by datetime ie (2020, 3, 9, 21, 0)
        elif hours_ago is not None:
            compare_dt = (datetime.now() - timedelta(hours=hours_ago))
        elif since_when is not None:
            if since_when not in self.since_when_dict.keys():
                since_when = "start"
            compare_dt = self.since_when_dict[since_when]

        results = []
        filtlist, close = self._manage_filtlist(filtlist=filtlist)

        for line in filtlist:
            dt = self._split_line(line)[0]
            dt = datetime.strptime(dt, '%Y-%b-%d %H:%M:%S')
            if dt >= compare_dt:
                results.append(line)

        if close:
            filtlist.close()

        self.filtlist = results
        self.ops += 1


In [5]:
rlp = ResultsLogParser()


In [51]:
rlp.filter_by_model("word2vec")

In [52]:
rlp.get_as_list("accuracy")

[0.3557692307692308, 0.3557692307692308]

In [53]:
rlp.undo_steps("all")

In [55]:
rlp.filter_by_time(since_when="Monday")

In [57]:
rlp.filter_by_time((2020,3,9,22,41))

In [6]:
rlp.get_as_list("failwords")

['and',
 'a',
 'of',
 'country-dance',
 'to',
 'rumours',
 'worn-out',
 'smoothfaced',
 'wormeaten',
 'discoloured',
 'panelling',
 'tissue-paper',
 'bellpull',
 'fouryearold',
 'blood-stained',
 'blottingpaper',
 'realised',
 'policestation',
 '"comin"',
 'practised',
 'sitting-room',
 'dressinggown',
 'marvelling',
 'ecarte',
 'reasoner',
 'analyse',
 'wellgrown',
 'aeroplane',
 'pocket-book',
 'sepulchre',
 'theatres',
 'note-book',
 'honour',
 'motor-car',
 'offence',
 'drawing-room',
 'honourable',
 'labour',
 'good-natured',
 'well-spoken',
 'befel',
 'first-class',
 'good-hearted',
 'dressing-table',
 'Jamess',
 'sittingroom',
 'programme',
 'centre',
 'wicker-work',
 'horror-stricken',
 'consultingroom',
 'battle-cry',
 'walking-stick',
 'morningroom',
 'out-of-the-way',
 '"familys"',
 'quarterpast',
 'thirty-five',
 'small-pox',
 '"kings"',
 'smokingroom',
 'consulting-room',
 'travelled',
 'countryhouses',
 'good-humoured',
 'whipcord',
 'illhealth',
 'splendour',
 'first-fru

In [9]:
import os, math, argparse, random
from nltk import word_tokenize as tokenize
from scc import *
import numpy as np

class LanguageModel:

    def __init__(self, trainingdir=r"data/Holmes_Training_Data", files=[]):
        self.training_dir = trainingdir
        self.files = files
        self.train()

    def __str__(self):
        return f"ngram trained on {len(self.files)} files"

    def train(self):
        self.unigram = {}
        self.bigram = {}

        self._processfiles()
        self._make_unknowns()
        self._discount()
        self._convert_to_probs()

    def _processline(self, line):
        tokens = ["__START"] + tokenize(line) + ["__END"]
        previous = "__END"
        for token in tokens:
            self.unigram[token] = self.unigram.get(token, 0) + 1
            current = self.bigram.get(previous, {})
            current[token] = current.get(token, 0) + 1
            self.bigram[previous] = current
            previous = token

    def _processfiles(self, verbose=False):
        for afile in self.files:
            if verbose:
                print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir, afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))

    def _convert_to_probs(self):

        self.unigram = {k: v / sum(self.unigram.values()) for (k, v) in self.unigram.items()}
        self.bigram = {key: {k: v / sum(adict.values()) for (k, v) in adict.items()} for (key, adict) in
                       self.bigram.items()}
        self.kn = {k: v / sum(self.kn.values()) for (k, v) in self.kn.items()}

    def get_prob(self, token, context="", methodparams={}):
        if methodparams.get("method", "unigram") == "unigram":
            return self.unigram.get(token, self.unigram.get("__UNK", 0))
        else:
            if methodparams.get("smoothing", "kneser-ney") == "kneser-ney":
                unidist = self.kn
            else:
                unidist = self.unigram
            bigram = self.bigram.get(context[-1], self.bigram.get("__UNK", {}))
            big_p = bigram.get(token, bigram.get("__UNK", 0))
            lmbda = bigram["__DISCOUNT"]
            uni_p = unidist.get(token, unidist.get("__UNK", 0))
            # print(big_p,lmbda,uni_p)
            p = big_p + lmbda * uni_p
            return p

    def compute_prob_line(self, line, methodparams={}):
        # this will add _start to the beginning of a line of text
        # compute the probability of the line according to the desired model
        # and returns probability together with number of tokens

        tokens = ["__START"] + tokenize(line) + ["__END"]
        acc = 0
        for i, token in enumerate(tokens[1:]):
            acc += math.log(self.get_prob(token, tokens[:i + 1], methodparams))
        return acc, len(tokens[1:])

    def _make_unknowns(self, known=2):
        unknown = 0
        for (k, v) in list(self.unigram.items()):
            if v < known:
                del self.unigram[k]
                self.unigram["__UNK"] = self.unigram.get("__UNK", 0) + v
        for (k, adict) in list(self.bigram.items()):
            for (kk, v) in list(adict.items()):
                isknown = self.unigram.get(kk, 0)
                if isknown == 0:
                    adict["__UNK"] = adict.get("__UNK", 0) + v
                    del adict[kk]
            isknown = self.unigram.get(k, 0)
            if isknown == 0:
                del self.bigram[k]
                current = self.bigram.get("__UNK", {})
                current.update(adict)
                self.bigram["__UNK"] = current

            else:
                self.bigram[k] = adict

    def _discount(self, discount=0.75):
        # discount each bigram count by a small fixed amount
        self.bigram = {k: {kk: value - discount for (kk, value) in adict.items()} for (k, adict) in self.bigram.items()}

        # for each word, store the total amount of the discount so that the total is the same
        # i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb = len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"] = lamb * discount

        # work out kneser-ney unigram probabilities
        # count the number of contexts each word has been seen in
        self.kn = {}
        for (k, adict) in self.bigram.items():
            for kk in adict.keys():
                self.kn[kk] = self.kn.get(kk, 0) + 1

def get_training_testing(training_dir=r"data/Holmes_Training_Data", split=0.5):
    filenames = os.listdir(training_dir)
    n = len(filenames)
    print("There are {} files in the training directory: {}".format(n, training_dir))
    random.seed(53)  # if you want the same random split every time
    random.shuffle(filenames)
    index = int(n * split)
    trainingfiles = filenames[:index]
    heldoutfiles = filenames[index:]
    return trainingfiles, heldoutfiles

MAX_FILES = 10

training, _ = get_training_testing()
mylm = LanguageModel(files=training[:MAX_FILES])


There are 522 files in the training directory: data/Holmes_Training_Data


In [11]:
keys = ["a)", "b)", "c)", "d)", "e)"]
scc = scc_reader()
acc = 0
correct, incorrect = [], []
for question in scc.questions:
    scores = []
    for key in keys:
        answord = question.get_field(key)
        s = mylm.get_prob(answord,methodparams={"smoothing":"kneser-ney"})
        scores.append(s)
    maxs = max(scores)
    idx = np.random.choice(
        [i for i, j in enumerate(scores) if j == maxs])  # find index/indices of answers with max score
    answer = keys[idx][0]  # answer is first letter of key w/o accompanying bracket
    qid = question.get_field("id")
    if answer == question.get_field("answer"):
        acc += 1
        correct.append(qid)
    else:
        incorrect.append(qid)

print(len(correct)/len(scc.questions))

0.18076923076923077


In [15]:
import json
json.loads('{"method":"unigram"}')

{'method': 'unigram'}

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_master


In [18]:
import numpy as np
import string
from scc import *
from utils import *
import torch


roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
scc = scc_reader()

acc = 0
correct, incorrect = [], []
topk = 49500

for question in scc.questions[428:]:
    q = question.get_field("question").replace("_____", "TEMPMASK")
    translator = str.maketrans('', '', string.punctuation)
    q = q.translate(translator)
    q = q.replace("TEMPMASK", "<mask>")
    rob_masks = roberta.fill_mask(q, topk=topk)
    rob_ranks = [mask[2] for mask in rob_masks]
    candidates = [question.get_field(ak) for ak in scc.keys]
    ans_ranks = []
    for i, candidate in enumerate(candidates):
        if candidate in rob_ranks:
            ans_ranks.append(rob_ranks.index(candidate))
        else:
            ans_ranks.append(topk+1)
    mins = min(ans_ranks)
    idx = np.random.choice([i for i, j in enumerate(ans_ranks) if j == mins])
    answer = scc.keys[idx][0]
    qid = question.get_field('id')
    outcome = answer == question.get_field("answer")
    if outcome:
        acc += 1
        correct.append(qid)
    else:
        incorrect.append(qid)
    print(
                f"{qid}: {answer} {outcome} | {question.make_sentence(question.get_field(scc.keys[idx]), highlight=True)}")

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_master


429: a False | We are *treated* in an enemy's country.
430: b False | An inspection of his chair showed me that he had been in the *style* of standing on it , which of course would be necessary in order that he should reach the ventilator.
431: d True | The other was a very small , dark fellow , with his hat pushed back and several *packages* under his arm.
432: b True | On the other hand , we brought to the surface an *object* of a most unexpected kind.
433: d False | And what deep and earnest purpose can he have which *played* for such a trial.
434: c False | Evidently , therefore , Alec Cunningham had lied when he said that the two men were *obdurate* when the shot was fired.
435: d False | Add to that the length of neck and head , and you get a *bite* not much less than two feet long probably more if there is any tail.
436: c False | I fainted when it was done , and I think that I must have been *watching* for a long time.
437: b True | Two hours passed slowly away , and then , sud

In [10]:
len(set([1,1,1,1,1]))==1

True