In [None]:
import collections

import nltk
import numpy
import pandas
import scipy
import scipy.stats

In [None]:
# Load the English / Tok Pisin message pairs from the translation workbook.
# Both columns are requested with dtype=object; empty strings count as missing.
translation_columns = ['English', 'Tok Pisin']
texts = pandas.read_excel(
    "LibreOffice_Translations_sfx2messages_Tok Pisin.xlsx",
    na_values='',
    dtype={column: object for column in translation_columns},
)
# Keep only the two text columns, and only rows where both are present.
texts = texts[translation_columns].dropna()

In [None]:
# Normalise both languages to upper case so later token matching ignores case.
# '~' and '_' are stripped from the English text (presumably keyboard
# accelerator markers in the UI strings -- TODO confirm). regex=False makes
# the replacement explicitly literal regardless of the pandas default.
texts['English'] = (
    texts['English']
    .map(str)
    .str.replace('~', '', regex=False)
    .str.replace('_', '', regex=False)
    .str.upper()
)
# Drop rows whose entire English message is exactly 'SAVE' (presumably held
# out so that guessing "SAVE" below is a genuine check -- TODO confirm).
texts = texts[texts['English'] != 'SAVE'].copy()
# Coerce to str first, exactly as for English: the column is loaded with
# dtype=object, so a non-string cell would otherwise turn into NaN under
# `.str.upper()` and break tokenisation further down.
texts['Tok Pisin'] = texts['Tok Pisin'].map(str).str.upper()

In [None]:
# Display the cleaned message table for a visual check.
texts

In [None]:
# Row labels of every remaining message; the size of this set is used as the
# denominator when estimating how often a Tok Pisin phrase occurs.
all_locations = set(texts.index)

In [None]:
# Inverted index for English: token -> row labels of the messages containing
# it (a label is appended once per occurrence of the token).
english_vocabulary = collections.defaultdict(list)
for loc, message in texts.English.items():
    tokens = nltk.word_tokenize(message)
    for word in tokens:
        english_vocabulary[word].append(loc)

In [None]:
def everygram_generator(sequence, min_tokens=1, max_tokens=None):
    """Yield every contiguous n-gram of ``sequence`` as a tuple.

    N-grams are produced grouped by start position, shortest first, e.g.
    ``["a", "b"]`` gives ``("a",), ("a", "b"), ("b",)``.

    Args:
        sequence: Any iterable of tokens; it is materialised into a list.
        min_tokens: Smallest n-gram length to produce.
        max_tokens: Largest n-gram length to produce. ``None`` means the
            length of the whole sequence.

    Yields:
        tuple: One n-gram of between ``min_tokens`` and ``max_tokens`` tokens.
    """
    tokens = list(sequence)
    n_tokens = len(tokens)
    if max_tokens is None:
        max_tokens = n_tokens
    # No n-gram can be longer than the sequence itself.
    longest = min(max_tokens, n_tokens)
    for start in range(n_tokens):
        for size in range(min_tokens, longest + 1):
            end = start + size
            # `end == n_tokens` is a valid slice bound: it is the n-gram that
            # finishes on the last token. The previous strict `<` comparison
            # silently dropped all of those.
            if end > n_tokens:
                break  # longer n-grams from this start overrun as well
            yield tuple(tokens[start:end])

def possible_translation_phrases(english_word):
    """Yield candidate Tok Pisin phrases for an English token.

    For every message whose English text contains ``english_word``, the
    Tok Pisin side of that message is tokenised and each of its n-grams of up
    to four tokens is yielded. Phrases may repeat; callers that need unique
    phrases should wrap the result in ``set``.

    Args:
        english_word: Token to look up in ``english_vocabulary`` exactly as
            given (no case folding is applied here).

    Yields:
        tuple: An n-gram of Tok Pisin tokens.
    """
    # .get() instead of [] so that looking up an unknown word does not add an
    # empty entry to the defaultdict-backed index.
    locations = english_vocabulary.get(english_word, ())
    for location in locations:
        # Single .loc call with (row, column) rather than chained indexing.
        translation = texts.loc[location, 'Tok Pisin']
        tokens_of_translation = nltk.word_tokenize(translation)
        yield from everygram_generator(tokens_of_translation, max_tokens=4)

In [None]:
# Inverted index for Tok Pisin: each n-gram (up to four tokens) produced by
# everygram_generator -> row labels of the messages it was produced from.
tokpisin_vocabulary = collections.defaultdict(list)
for loc, message in texts['Tok Pisin'].items():
    tokens = nltk.word_tokenize(message)
    for word in everygram_generator(tokens, max_tokens=4):
        tokpisin_vocabulary[word].append(loc)

In [None]:
# Sanity check: list the n-grams generated for a three-token sequence.
list(everygram_generator(["The", "quick", "brown"]))

In [None]:
# Unique candidate Tok Pisin phrases drawn from messages containing "SAVE".
set(possible_translation_phrases("SAVE"))

In [None]:
def guess_translation(english_word):
    """Guess which Tok Pisin phrase most plausibly translates an English word.

    Every candidate phrase from ``possible_translation_phrases`` is scored
    with a one-sided binomial test: given how often the phrase occurs across
    all messages, how surprising is it to see it in this many of the messages
    that contain ``english_word``? The phrase with the smallest corrected
    p-value wins.

    Args:
        english_word: English token; it is upper-cased before lookup.

    Returns:
        tuple: ``(p_value, phrase)`` where ``phrase`` is a tuple of Tok Pisin
        tokens. ``(1.0, None)`` is returned when no candidate scores below 1.0
        (including when the word is unknown).
    """
    english_word = english_word.upper()
    # .get() instead of [] so an unknown word does not add an empty entry to
    # the defaultdict-backed indexes.
    english_locations = set(english_vocabulary.get(english_word, ()))
    total_messages = len(all_locations)
    lowest_probability = 1.0
    lowest_probability_phrase = None
    # Sorted so that ties are broken the same way on every run; plain set
    # iteration order for strings changes between interpreter sessions.
    candidates = sorted(set(possible_translation_phrases(english_word)))
    for possible_phrase in candidates:
        tokpisin_locations = set(tokpisin_vocabulary.get(possible_phrase, ()))
        together = english_locations & tokpisin_locations
        # scipy.stats.binom_test is deprecated and has been removed from
        # recent SciPy releases; binomtest(...).pvalue is its replacement.
        probability = scipy.stats.binomtest(
            len(together),
            len(english_locations),
            len(tokpisin_locations) / total_messages,
            alternative='greater',
        ).pvalue
        # Multiple-comparison adjustment of the form 1 - (1 - p)**m, with m
        # the number of tokens in the phrase. This is the Sidak form rather
        # than Bonferroni; the formula itself is kept unchanged.
        # NOTE(review): confirm that phrase length is the intended m.
        corrected = 1 - ((1 - probability) ** len(possible_phrase))
        if corrected < lowest_probability:
            lowest_probability = corrected
            lowest_probability_phrase = possible_phrase
    return (lowest_probability, lowest_probability_phrase)

In [None]:
# Spot check: best-scoring Tok Pisin phrase (and its p-value) for "SAVE".
guess_translation("SAVE")

In [None]:
%%time
extra_vocab = []
for word in english_vocabulary:
    if word in set(texts.English):
        # no need to figure it out, we've got a translation
        continue
    confidence, translation = guess_translation(word)
    if translation is None:
        continue
    extra_vocab.append({'English': word, 'Tok Pisin': " ".join(translation), 'confidence': confidence})
extra_vocab = pandas.DataFrame.from_records(extra_vocab).sort_values('confidence')

In [None]:
# Reproducible random peek at the guesses. The sample size is capped at the
# number of rows because sample() raises when asked for more rows than exist.
extra_vocab.sample(min(10, len(extra_vocab)), random_state=0)

In [None]:
# Write the guessed vocabulary to a spreadsheet, leaving out the index column.
extra_vocab.to_excel("derived-tokpisin-vocab.xlsx", index=False)