In [None]:
#  -*- coding: utf-8 -*-
import math
import argparse
import nltk
import codecs
from collections import defaultdict
import json
import requests

"""
This file is part of the computer assignments for the course DD1418/DD2418 Language engineering at KTH.
Created 2017 by Johan Boye and Patrik Jonell.
"""

class BigramTester(object):
    def __init__(self):
        """
        This class reads a language model file and a test file, and computes
        the entropy of the latter. 
        """
        # The mapping from words to identifiers.
        self.index = {}

        # The mapping from identifiers to words.
        self.word = {}

        # An array holding the unigram counts.
        self.unigram_count = {}

        # The bigram log-probabilities.
        self.bigram_prob = defaultdict(dict)

        # Number of unique words (word forms) in the training corpus.
        self.unique_words = 0

        # The total number of words in the training corpus.
        self.total_words = 0

        # The average log-probability (= the estimation of the entropy) of the test corpus.
        # Important that it is named self.logProb for the --check flag to work
        self.logProb = 0

        # The identifier of the previous word processed in the test corpus. Is -1 if the last word was unknown.
        self.last_index = -1

        # The fraction of the probability mass given to unknown words.
        self.lambda3 = 0.000001

        # The fraction of the probability mass given to unigram probabilities.
        self.lambda2 = 0.01 - self.lambda3

        # The fraction of the probability mass given to bigram probabilities.
        self.lambda1 = 0.99

        # The number of words processed in the test corpus.
        self.test_words_processed = 0


    def read_model(self, filename):
        """
        Reads the contents of the language model file into the appropriate data structures.

        :param filename: The name of the language model file.
        :return: <code>true</code> if the entire file could be processed, false otherwise.
        """

        try:
            with codecs.open(filename, 'r', 'utf-8') as f:
                self.unique_words, self.total_words = map(int, f.readline().strip().split(' '))
                # YOUR CODE HERE
                return True
        except IOError:
            print("Couldn't find bigram probabilities file {}".format(filename))
            return False


    def compute_entropy_cumulatively(self, word):
        # YOUR CODE HERE
        pass

    def process_test_file(self, test_filename):
        """
        <p>Reads and processes the test file one word at a time. </p>

        :param test_filename: The name of the test corpus file.
        :return: <code>true</code> if the entire file could be processed, false otherwise.
        """
        try:
            with codecs.open(test_filename, 'r', 'utf-8') as f:
                self.tokens = nltk.word_tokenize(f.read().lower()) # Important that it is named self.tokens for the --check flag to work
                for token in self.tokens:
                    self.compute_entropy_cumulatively(token)
            return True
        except IOError:
            print('Error reading testfile')
            return False


def main():
    """
    Parse command line arguments
    """
    parser = argparse.ArgumentParser(description='BigramTester')
    parser.add_argument('--file', '-f', type=str,  required=True, help='file with language model')
    parser.add_argument('--test_corpus', '-t', type=str, required=True, help='test corpus')
    parser.add_argument('--check', action='store_true', help='check if your alignment is correct')

    arguments = parser.parse_args()

    bigram_tester = BigramTester()
    bigram_tester.read_model(arguments.file)
    bigram_tester.process_test_file(arguments.test_corpus)
    if arguments.check:
        results  = bigram_tester.logProb

        payload = json.dumps({
            'model': open(arguments.file, 'r').read(),
            'tokens': bigram_tester.tokens,
            'result': results
        })
        response = requests.post(
            'https://language-engineering.herokuapp.com/lab2_tester',
            data=payload,
            headers={'content-type': 'application/json'}
        )
        response_data = response.json()
        if response_data['correct']:
            print('Read {0:d} words. Estimated entropy: {1:.2f}'.format(bigram_tester.test_words_processed, bigram_tester.logProb))
            print('Success! Your results are correct')
        else:
            print('Your results:')
            print('Estimated entropy: {0:.2f}'.format(bigram_tester.logProb))
            print("The server's results:\n Entropy: {0:.2f}".format(response_data['result']))

    else:
        print('Read {0:d} words. Estimated entropy: {1:.2f}'.format(bigram_tester.test_words_processed, bigram_tester.logProb))

if __name__ == "__main__":
    main()


In [272]:
import math

In [273]:
i = math.exp(-3.986130977581868)
#i = -3.986130977581868

In [274]:
like = math.exp(-4.454347296253507)
#like = -4.454347296253507

In [317]:
a3 = 0.000001 

In [276]:
a2 = 0.01 - a3
print(a2)

0.009999000000000001


In [277]:
a1 =0.99

In [320]:
an = round((a2*(86/24944))+a3,15)
#print('{0:.15f}'.format(an))

In [321]:
an1 = round((a2*(4/24944))+a3,15)
#print('{0:.15f}'.format(an1))

In [322]:
an2 = round((a2*(348/24944))+a3,15)
#print('{0:.15f}'.format(an2))

In [323]:
an3 = round(i,15)
#print('{0:.15f}'.format(an3))

In [324]:
an4 = round(like,15)
#print('{0:.15f}'.format(an4))
an4

0.011627906976744

In [325]:
an5 = round((a2*(36/24944))+a3,15)
#print('{0:.15f}'.format(an5))

In [326]:
an6 = round((a2*(0/24944))+a3,15)
#print('{0:.15f}'.format(an6))

In [327]:
an7= (a2*(700/24944))+a3
#print('{0:.15f}'.format(an7))
round(an7,15)

0.000281600545221

In [328]:
an8 = a3
print(a3)

1e-06


In [329]:
an9= round((a2*(36/24944))+a3,15)
#print('{0:.15f}'.format(an9))

In [330]:
an10 = round((a2*(700/24944))+a3,15)
#print('{0:.15f}'.format(an10))

In [331]:
an11= round((a2*(1/24944))+a3,15)
#print('{0:.15f}'.format(an11))

In [332]:
an12= round((a2*(30/24944))+a3,15)
#print('{0:.15f}'.format(an12))

In [333]:
an13= round((86/24944),15)
#print('{0:.15f}'.format(an13))

In [334]:
sum = math.log(an)+math.log(an1)+math.log(an2)+math.log(0.000001)+math.log(i)+math.log(an5)+math.log(an6)+math.log(an7)+math.log(an8)+math.log(0.000001)+math.log(an10)+math.log(an11)+math.log(like)+math.log(an9)+math.log(0.000001)+math.log(an12)+math.log(like)+math.log(an9)+math.log(an13)

In [335]:
ans = (-sum)/19

In [336]:
ans

10.206978916199787

In [76]:
+math.log(like)+math.log(an9)

-13.334075728414074

In [1]:
import math
import argparse
import nltk
import codecs
from collections import defaultdict
import json
import requests

In [2]:
index = {}
word = {}
unigram_count = {}
bigram_prob = defaultdict(dict)
unique_words = 0
total_words = 0
logProb = 0
last_index = -1
lambda3 = 0.000001
lambda2 = 0.01 - lambda3

lambda1 = 0.99 
test_words_processed = 0

In [19]:
def read_model(filename):

    try:
        with codecs.open(filename, 'r', 'utf-8') as f:
            unique_words, total_words = map(int, f.readline().strip().split(' '))
            for i in range(0, unique_words):
                content = [f.readline().strip().split(' ')]
                index[content[0][1]] = content[0][0]
                word[content[0][0]] = content[0][1]
                unigram_count[content[0][0]] = content[0][2]
            for i in f.readlines(): 
                pro = [i.strip().split(' ')]
                if pro[0][0] != str(-1):
                    bigram_prob[pro[0][0]][pro[0][1]] = pro[0][2]
                print(bigram_prob)
            return True
    except IOError:
        print("Couldn't find bigram probabilities file {}".format(filename))
        return False

In [20]:
read_model("kafka_model_correct.txt")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.0000

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.0000

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.0000

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.0000

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.0000

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



defaultdict(<class 'dict'>, {'0': {'1': '0.000000000000000'}, '1': {'2': '0.000000000000000'}, '2': {'3': '0.000000000000000'}, '3': {'4': '0.000000000000000'}, '4': {'5': '-4.025351690735150', '46': '-4.025351690735150', '26': '-3.332204510175204', '527': '-4.025351690735150', '556': '-3.332204510175204', '567': '-3.332204510175204', '143': '-4.025351690735150', '49': '-1.252762968495369', '861': '-4.025351690735150', '390': '-4.025351690735150', '167': '-4.025351690735150', '20': '-2.926739402067040', '12': '-2.233592221507095', '617': '-3.332204510175204', '1804': '-4.025351690735150', '1808': '-4.025351690735150', '1255': '-3.332204510175204', '1190': '-3.332204510175204', '1045': '-4.025351690735150', '434': '-4.025351690735150', '2040': '-4.025351690735150', '242': '-4.025351690735150', '1138': '-4.025351690735150', '1491': '-4.025351690735150', '1441': '-4.025351690735150', '8': '-4.025351690735150', '1248': '-4.025351690735150', '368': '-4.025351690735150'}, '5': {'6': '0.00000

True

In [None]:
if self.index[self.ac[-1]] in kes.keys():
                k = self.index[self.ac[-1]]
                if self.index[word] in kes[k].values():

In [22]:
d = {0 : [1,2], 2 : [3,4]}

In [26]:
if 3 in d[2]:    
return True

SyntaxError: 'return' outside function (<ipython-input-26-a49b6fc9c908>, line 2)