In [111]:
from PIL import Image, ImageDraw, ImageFont
import sys
import math
import re
from collections import defaultdict

CHARACTER_WIDTH=14
CHARACTER_HEIGHT=25

In [2]:
def load_letters(fname):
    im = Image.open(fname)
    px = im.load()
    (x_size, y_size) = im.size
    print(im.size)
    print(int(x_size / CHARACTER_WIDTH) * CHARACTER_WIDTH)
    result = []
    for x_beg in range(0, int(x_size / CHARACTER_WIDTH) * CHARACTER_WIDTH, CHARACTER_WIDTH):
        result += [ [ "".join([ '*' if px[x, y] < 1 else ' ' for x in range(x_beg, x_beg+CHARACTER_WIDTH) ]) for y in range(0, CHARACTER_HEIGHT) ], ]
    return result

def load_training_letters(fname):
    TRAIN_LETTERS="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789(),.-!?\"' "
    letter_images = load_letters(fname)
    return { TRAIN_LETTERS[i]: letter_images[i] for i in range(0, len(TRAIN_LETTERS) ) }

In [77]:
def emis_prob(pic1, pic2, m):
    counter = sum([1 if pic1[r][c] == pic2[r][c] else 0 for c in range(len(pic1[0])) for r in range(len(pic1))])
    log_prob = 350 * math.log(m/100) + counter * math.log((100-m)/m)
    return log_prob

In [109]:
def simple_model(fuzzy_img, train_letters, m):
    seq = []
    TRAIN_LETTERS="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789(),.-!?\"' "
    for i in range(len(fuzzy_img)):
        max_let = ''
        max_prob = -100000000
        for let in TRAIN_LETTERS:
            temp_prob = emis_prob(fuzzy_img[i], train_letters[let], m)
            if temp_prob > max_prob:
                max_prob = temp_prob
                max_let = let
        seq.append(max_let)
    return "".join(seq)

In [123]:
train_letters = load_training_letters('courier-train.png')
test_letters = load_letters('test-19-0.png')

(1008, 25)
1008
(729, 25)
728


In [126]:
simple_model(test_letters, train_letters, 40)

' 1N D   , B  .  .   1OM Y  ,  nc     N,   ..  c n c.'

In [126]:
def strip_labels(string):
    output = re.sub(' ADJ | ADV | ADP | CONJ | DET | NOUN | NUM | PRON | PRT | VERB | X |\n', ' ', string)
    output = re.sub(' \'\' . ', '\" ', output)
    output = re.sub(r' \`\` . ', ' \"', output)
    output = re.sub(r'\`\` . ', '\"', output)
    output = re.sub(' , . ', ', ', output)
    output = re.sub(r' \? . ', '? ', output)
    output = re.sub(r' \! . ', '! ', output)
    output = re.sub(' . . ', '. ', output)
    output = re.sub('    ', '', output)
    return output

In [127]:
def read_data(fname):
    exemplars = []
    file = open(fname, 'r')
    for line in file:
        exemplars += [ strip_labels(line) ]
    return exemplars

In [129]:
test = read_data('bc.train')
test

aired, Hemphill charged.',
 'Wide range in bids',
 'Hemphill said the Hughes concern contracted to do the repairs at a cost of $500 for each joint.',
 'The bid from A. Belanger and Sons of Cambridge, Mass., which listed the same officers as Hughes, was $600 per joint.',
 'But, Hemphill added, bids from other contractors ranged from $2400 to $3100 per joint.',
 "Berger's decision to sue for the full amount of the performance bond was questioned by Wagner in the morning press conference.",
 'Wagner said the city paid only $37,500 to the Hughes company.',
 '"We won\'t know the full amount until we get a full report", Wagner said.',
 '"We can claim on the maximum amount of the bond", Berger said.',
 'Wagner replied, "Can\'t you just see the headline. \' . City Hooked for $172,000 \' ."?.',
 "' . know enough to sue.",
 'Berger insisted that "we know enough to sue for the full amount".',
 'Douglas M. Pratt, president of the PTC, who attended the meeting, said the transit company is reviewing

In [119]:
initial_dict = defaultdict(lambda: 0)
trans_dict = {}

def train(data):
    #print(data)
    TRAIN_LETTERS="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789(),.-!?\"' "

    # learn the initial probabilities

    for row in data:
        beg_pos = row[0]
        initial_dict[beg_pos] += 1
    for l in TRAIN_LETTERS:
        if l in initial_dict.keys():
            pass
        else:
            initial_dict[l] = 1
        
    total = sum(initial_dict.values())
    for key in initial_dict.keys():
        initial_dict[key] = initial_dict[key] / total
    
    # learn the transition probailities
    
    for row in data:
        for i in range(len(row)-1):
            if row[i] in trans_dict.keys():
                if row[i+1] in trans_dict[row[i]].keys():
                        trans_dict[row[i]][row[i+1]] += 1
                else:
                    trans_dict[row[i]][row[i+1]] = 1
            else:
                trans_dict[row[i]] = {row[i+1]: 1}

    for value in trans_dict.values():
        for letter in TRAIN_LETTERS:
            if letter in value.keys():
                pass
            else:
                value[letter] = 1

    for value in trans_dict.values():
        total = sum(value.values())
        for key in value.keys():
            value[key] = value[key] / total

In [120]:
train(test)

In [125]:
trans_dict['G']

{'r': 0.19392917369308602,
 'e': 0.25362563237774033,
 'O': 0.004384485666104553,
 'o': 0.26711635750421586,
 'a': 0.0984822934232715,
 'u': 0.04620573355817875,
 '.': 0.017537942664418212,
 'h': 0.00505902192242833,
 'i': 0.05834738617200674,
 'l': 0.026981450252951095,
 'A': 0.0006745362563237774,
 'F': 0.0003372681281618887,
 'M': 0.0003372681281618887,
 ' ': 0.003372681281618887,
 'S': 0.0003372681281618887,
 'E': 0.0006745362563237774,
 'y': 0.0013490725126475548,
 ',': 0.001011804384485666,
 "'": 0.0003372681281618887,
 'N': 0.002023608768971332,
 'I': 0.0006745362563237774,
 'B': 0.0003372681281618887,
 'C': 0.0003372681281618887,
 'D': 0.0003372681281618887,
 'G': 0.0003372681281618887,
 'H': 0.0003372681281618887,
 'J': 0.0003372681281618887,
 'K': 0.0003372681281618887,
 'L': 0.0003372681281618887,
 'P': 0.0003372681281618887,
 'Q': 0.0003372681281618887,
 'R': 0.0003372681281618887,
 'T': 0.0003372681281618887,
 'U': 0.0003372681281618887,
 'V': 0.0003372681281618887,
 'W': 