In [None]:
import pandas
import numpy
import collections
import scipy
import nltk
import sklearn.feature_extraction
import sklearn.model_selection
import sklearn.dummy
import sklearn.tree
import sklearn.ensemble
import sklearn.linear_model
import sklearn.svm

In [None]:
# Load the translation spreadsheet, keep only the two language columns (read as
# plain Python objects) and discard any row where either side is missing.
texts = (
    pandas.read_excel(
        "LibreOffice_Translations_sfx2messages_Tok Pisin.xlsx",
        na_values='',
        dtype={'English': object, 'Tok Pisin': object},
    )
    [['English', 'Tok Pisin']]
    .dropna()
)

In [None]:
# Normalise both sides to upper-case strings so matching is case-insensitive.
# '~' and '_' are stripped from the English side only (presumably UI accelerator
# markers -- TODO confirm); regex=False makes it explicit that they are literals.
texts['English'] = (
    texts['English']
    .map(str)
    .str.replace('~', '', regex=False)
    .str.replace('_', '', regex=False)
    .str.upper()
)
# The columns are loaded with dtype=object, so a cell may hold a non-string value.
# Without map(str), .str.upper() would turn such a cell into NaN and the later
# nltk.word_tokenize call would fail on it; the English side is guarded the same way.
texts['Tok Pisin'] = texts['Tok Pisin'].map(str).str.upper()

In [None]:
# Record how many nltk tokens each side has, and plot the distribution of the
# English source lengths (the histogram is drawn before the second column is added).
texts['english_source_length'] = texts['English'].map(
    lambda sentence: len(nltk.word_tokenize(sentence))
)
texts['english_source_length'].plot.hist(bins=38)
texts['tokpisin_length'] = texts['Tok Pisin'].map(
    lambda sentence: len(nltk.word_tokenize(sentence))
)

In [None]:
# Work with the rows whose English source is exactly two tokens long.
two_word_source_texts = texts.loc[texts['english_source_length'] == 2]
# Display only: the de-duplicated frame is shown as the cell output and is not
# assigned back, so two_word_source_texts itself may still contain duplicates.
two_word_source_texts.drop_duplicates()

In [None]:
def default_classifier():
    """Return a fresh, unfitted random forest classifier with a fixed random seed."""
    forest_class = sklearn.ensemble.RandomForestClassifier
    return forest_class(random_state=12345)

In [None]:
def make_model(df, already_translated_count, verbose=False, model_creator=default_classifier):
    """Fit a classifier that predicts the Tok Pisin token at one output position.

    Each row of ``df`` (columns ``English`` and ``Tok Pisin``) whose translation has
    at least ``already_translated_count`` tokens becomes one training example. Its
    input is a bag of words made of every English token prefixed with ``english_``
    plus the first ``already_translated_count`` Tok Pisin tokens prefixed with
    ``tokpisin_``. Its label is the next Tok Pisin token, or ``"[END]"`` when the
    translation has no further token.

    Parameters:
        df: frame providing the ``English`` and ``Tok Pisin`` string columns.
        already_translated_count: number of Tok Pisin tokens already produced,
            i.e. the output position this model predicts.
        verbose: when true, print every training example as it is built.
        model_creator: zero-argument callable returning an unfitted classifier.

    Returns:
        ``(vectorizer, fitted_model)``, or ``(None, None)`` when no row is long
        enough to supply a training example for this position.
    """
    cvec = sklearn.feature_extraction.text.CountVectorizer()
    contexts = []
    labels = []
    for english, tokpisin in zip(df.English, df['Tok Pisin']):
        target_tokens = nltk.word_tokenize(tokpisin)
        # Translations shorter than the requested position say nothing about it.
        if already_translated_count > len(target_tokens):
            continue
        features = [f"english_{word}" for word in nltk.word_tokenize(english)]
        # Only the tokens that come before the predicted position are visible.
        # If you have to have a consistent number of words, this is where a
        # null-word marker would be emitted for the missing ones.
        features += [
            f"tokpisin_{word}"
            for _, word in zip(range(already_translated_count), target_tokens)
        ]
        context = " ".join(features)
        label = (
            "[END]"
            if already_translated_count >= len(target_tokens)
            else target_tokens[already_translated_count]
        )
        if verbose:
            print(context.replace(' ', ' + '),"=", label, f"|{english.lower()} -> {tokpisin.lower()}")
        contexts.append(context)
        labels.append(label)
    if not contexts:
        return None, None
    # This is where we make some sort of embedding of the sentence.
    # If you have enough data, use word embeddings in a large R^n space and use deep learning.
    # For low resource languages, use something denser.
    # NOTE: the vectorizer runs with its default settings, so it applies its own
    # lower-casing and tokenisation on top of the prefixed tokens built above.
    X = cvec.fit_transform(contexts)
    # Earlier author note: sklearn.linear_model.LogisticRegression() was tried as
    # the model here and kept outputting OL.
    only_one_label = len(set(labels)) == 1
    # With a single distinct label, fall back to a constant most-frequent predictor.
    model = (
        sklearn.dummy.DummyClassifier(strategy='most_frequent')
        if only_one_label
        else model_creator()
    )
    model.fit(X, labels)
    return (cvec, model)

def make_models(df, model_creator=default_classifier):
    """Train one next-token model per output position.

    Calls ``make_model`` for position 0, 1, 2, ... and stops at the first position
    for which it returns ``(None, None)``.

    Parameters:
        df: training frame, passed through to ``make_model``.
        model_creator: zero-argument classifier factory, passed through as well.

    Returns:
        dict mapping each position to the ``(vectorizer, model)`` pair trained for it.
    """
    models_by_position = {}
    position = 0
    while True:
        fitted = make_model(df, position, model_creator=model_creator)
        if fitted == (None, None):
            # First position without training data: nothing further to train.
            return models_by_position
        models_by_position[position] = fitted
        position += 1

In [None]:
# Print the training examples built for output position 12 and show the result.
make_model(two_word_source_texts, already_translated_count=12, verbose=True)

In [None]:
%%time
translator = make_models(two_word_source_texts)

In [None]:
class TranslatorExhaustion(Exception):
    """Raised when the translator has no model for the requested output position."""

def suggest_next_token(translator, english_sentence, tokens_output_so_far=None):
    """Predict the next Tok Pisin token for a partially translated sentence.

    Parameters:
        translator: dict mapping an output position to its ``(vectorizer, model)`` pair.
        english_sentence: the English source text.
        tokens_output_so_far: Tok Pisin tokens already produced; ``None`` means none.

    Returns:
        The first prediction of the model for the current position.

    Raises:
        TranslatorExhaustion: if the translator has no model for the current position.
    """
    emitted = [] if tokens_output_so_far is None else tokens_output_so_far
    position = len(emitted)
    if position not in translator:
        # We know something went wrong, though
        raise TranslatorExhaustion
    vectorizer, classifier = translator[position]
    # Same prefixed bag-of-words layout as the training examples.
    features = [f"english_{word}" for word in nltk.word_tokenize(english_sentence)]
    features.extend(f"tokpisin_{word}" for word in emitted)
    encoded = vectorizer.transform([" ".join(features)])
    return classifier.predict(encoded)[0]

def suggest_translation(translator, english_sentence):
    """Translate a sentence by repeatedly asking for the next token.

    Tokens are requested until the model predicts ``"[END]"`` or the translator
    has no model for the next position.

    Parameters:
        translator: dict of per-position ``(vectorizer, model)`` pairs.
        english_sentence: the English source text.

    Returns:
        The predicted tokens joined by single spaces. A final token that is found
        in ``"?.!"`` is attached without a preceding space. Returns ``""`` when
        ``"[END]"`` is predicted first, and appends ``" [INCOMPLETE]"`` when the
        translator is exhausted before ``"[END]"`` is predicted.
    """
    produced = []
    while True:
        try:
            next_token = suggest_next_token(translator, english_sentence, produced)
        except TranslatorExhaustion:
            return " ".join(produced) + " [INCOMPLETE]"
        if next_token != "[END]":
            produced.append(next_token)
            continue
        if not produced:
            return ""
        *leading, final = produced
        # Substring test against "?.!": closing punctuation hugs the previous word.
        if final in "?.!":
            return " ".join(leading) + final
        return " ".join(produced)
        

In [None]:
# First token suggested when nothing has been translated yet.
suggest_next_token(translator, english_sentence="LAST PAGE", tokens_output_so_far=[])

In [None]:
# Next token suggested after one token has been produced.
suggest_next_token(translator, english_sentence="LAST PAGE", tokens_output_so_far=['PEIJ'])

In [None]:
# Next token suggested after two tokens have been produced.
suggest_next_token(translator, english_sentence="LAST PAGE", tokens_output_so_far=['PEIJ', 'ANTAP'])

In [None]:
# Full suggested translation, built token by token.
suggest_translation(translator, english_sentence="LAST PAGE")

In [None]:
# Same again for a second source phrase.
suggest_translation(translator, english_sentence="NEXT PAGE")

In [None]:
def evaluate_translator(df, model_creator=default_classifier):
    """Run a leave-one-out evaluation of the translator and print each outcome.

    For every row of ``df`` a translator is trained on all the other rows and used
    to translate the held-out English sentence. The suggestion is compared with
    the held-out Tok Pisin text by exact string equality and a line is printed
    either way; correct suggestions are prefixed with ``*****``.

    Parameters:
        df: frame with ``English`` and ``Tok Pisin`` string columns.
        model_creator: zero-argument callable returning an unfitted classifier.
            It is forwarded to ``make_models``; previously this argument was
            accepted but ignored, so ``default_classifier`` was always used. The
            default is therefore ``default_classifier``, which keeps a call
            without this argument behaving exactly as before.

    Returns:
        None. Results are reported through ``print`` only.
    """
    loo = sklearn.model_selection.LeaveOneOut()
    for train_index, test_index in loo.split(df):
        train = df.iloc[train_index]
        # Each leave-one-out test split holds exactly one row.
        testing_sentence = df.iloc[test_index].iloc[0]
        # Honour the caller's classifier factory when training on this fold.
        translator = make_models(train, model_creator=model_creator)
        translated = suggest_translation(translator, testing_sentence.English)
        if translated == testing_sentence['Tok Pisin']:
            print("*****",testing_sentence.English,"translated correctly as", translated)
        else:
            print(testing_sentence.English,"=", translated, ". Correct answer is",testing_sentence['Tok Pisin'])

In [None]:
# Leave-one-out evaluation over every cleaned pair; the translator is retrained
# once per held-out row.
evaluate_translator(df=texts)