In [12]:
import numpy as np
import requests
import pyprind
import pickle
import matplotlib.pyplot as plt
import seaborn as sb
import itertools
import pandas as pd

In [2]:
# Load the word vocabulary from words.txt. Every entry is a real word, so all
# of them should be classified as positive samples.
# The builtin `str` is used as dtype: `np.str` was only a deprecated alias for
# it and no longer exists in recent NumPy releases.
positives = np.genfromtxt('words.txt', dtype=str, delimiter='\n\r')

In [3]:
# Peek at the first ten vocabulary entries.
positives[:10]

array(['A', "A'asia", "A's", 'AA', "AA's", 'AAA', 'AAAA', 'AAAAAA', 'AAAL',
       'AAAS'], 
      dtype='<U60')

In [4]:
def collectTrainData(samples=100):
    """Download labelled test cases from the word-classifier challenge service.

    One test case is requested per randomly drawn seed and the JSON objects
    returned are merged into a single dict, so a key that appears in several
    responses is kept only once (the last value wins).

    Parameters
    ----------
    samples : int
        Number of test cases (HTTP requests) to fetch.

    Returns
    -------
    dict
        Merged contents of all JSON responses.

    Raises
    ------
    requests.RequestException
        If a request times out, fails, or comes back with an HTTP error status.
    """
    seeds = np.random.uniform(0, 1919415150, samples)
    bar = pyprind.ProgBar(samples, bar_char='█')
    dump = {}
    for seed in seeds:
        # Seeds are drawn as floats; %d truncates each to the integer used in the URL.
        response = requests.get(
            'https://hola.org/challenges/word_classifier/testcase/%d' % seed,
            timeout=30,  # without a timeout a stalled connection blocks the cell forever
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        response.raise_for_status()
        dump.update(response.json())
        bar.update()
    return dump

In [5]:
def saveTrainData(dump):
    """Pickle `dump` into the file 'train.txt', replacing any previous content."""
    with open('train.txt', mode='wb') as sink:
        pickle.dump(dump, sink)

def loadTrainData():
    """Return the object unpickled from the file 'train.txt'."""
    # pickle executes code embedded in the file; only load files you wrote yourself.
    with open('train.txt', mode='rb') as source:
        payload = source.read()
    return pickle.loads(payload)

In [5]:
# Fetch a fresh batch of labelled samples (one HTTP request per sample, default
# 100) and persist it to train.txt so later runs can reload it from disk.
train = collectTrainData()
saveTrainData(train)

0%                          100%
[██████████████████████████████] | ETA: 00:00:00
Total time elapsed: 00:01:09


In [6]:
# Reload the previously saved samples from train.txt.
train = loadTrainData()

In [7]:
# Show the first five (word, label) pairs of the collected samples.
list(train.items())[:5]

[('stampeders', True),
 ('hobapoutaunes', False),
 ('palaeology', True),
 ('retched', True),
 ('electrophilicities', True)]

In [83]:
def validate(classifier, train):
    """Return the fraction of samples for which `classifier` agrees with the label.

    `train` maps each sample to its expected label; `classifier` is called once
    per sample and its result is compared to that label with `==`.
    """
    hits = []
    for word, expected in train.items():
        hits.append(classifier(word) == expected)
    return np.sum(hits) / len(train)

In [10]:
# Flatten the vocabulary into one list holding every character of every word.
chars = [letter for word in positives for letter in word]

In [19]:
# Count how often each character occurs in `chars`.
chars_hist = {}
for c in chars:
    # Start from 0 and add 1 so the first occurrence is counted as well;
    # initialising a new key to 0 would leave every count one too low.
    chars_hist[c] = chars_hist.get(c, 0) + 1

In [49]:
# Relative frequency of each character: its count divided by the sum of all
# counts. The total is computed once instead of once per character.
_chars_total = sum(chars_hist.values())
charProbs = {c: count / _chars_total for c, count in chars_hist.items()}

In [58]:
def getCharPairs(word):
    """Return the list of (previous, current) character pairs of `word`.

    The first character is paired with None as its predecessor; an empty word
    yields an empty list.
    """
    letters = list(word)
    predecessors = [None] + letters[:-1]
    return [(prev, cur) for prev, cur in zip(predecessors, letters)]

# Count how often each (previous, current) character pair occurs in the
# vocabulary; pairs with None as predecessor mark the first character of a word.
charPairsHist = {}

for w in positives:
    for p in getCharPairs(w):
        # Start from 0 and add 1 so the first occurrence is counted as well;
        # initialising a new key to 0 would leave every count one too low.
        charPairsHist[p] = charPairsHist.get(p, 0) + 1

In [61]:
# For each pair (c1, c2): the pair count divided by the count of c1. Pairs whose
# predecessor is None (first character of a word) are divided by the number of
# words instead.
charCondProbs = {
    (c1, c2): count / (chars_hist[c1] if c1 is not None else len(positives))
    for (c1, c2), count in charPairsHist.items()
}

In [82]:
def getWordProb(word):
    """Score `word` as a product of per-character probabilities.

    For every (previous, current) character pair, with None preceding the first
    character, the running product is multiplied by
    charCondProbs[(previous, current)] * charProbs[current]. Returns 0 as soon
    as a character or pair is missing from those tables, and 1 for an empty
    word.
    """
    # NOTE(review): each factor combines the pair probability with the plain
    # character probability; confirm this is intended rather than a pure
    # conditional-probability chain.
    letters = list(word)
    likelihood = 1
    for prev, cur in zip([None] + letters, letters):
        unseen = cur not in charProbs or (prev, cur) not in charCondProbs
        if unseen:
            return 0
        likelihood *= charCondProbs[(prev, cur)] * charProbs[cur]
    return likelihood

In [85]:
# Fraction of collected samples labelled correctly when a word is accepted
# whenever its getWordProb score exceeds 0.1.
validate(lambda w: getWordProb(w) > 0.1, train)

0.50973504616619836