In [1]:
import numpy as np
import pandas as pd
import gc

from nltk.util import ngrams
from nltk import word_tokenize
import re

# Path settings. ROOT is not referenced in the visible cells; DATA_PATH and
# OUTPUT_PATH are prefixed to the CSV file names read and written below.
ROOT = ""
DATA_PATH = "data/"
OUTPUT_PATH = "output/"

In [2]:
# Load the training CSV; later cells read its "correct_text" and
# "corrupted_text" columns.
train_df = pd.read_csv(DATA_PATH + "train.csv")

In [3]:
def check(sentence):
    """Return True when ``sentence`` contains none of the blacklisted symbols.

    The blacklist is a fixed set of characters; a sentence containing any one
    of them is rejected (False), every other sentence is accepted (True).
    """
    incorrect_symbols = (
        '€', '≈', '¬', 'ќ', 'Ќ', '√', 'Ћ', '∆', 'ƒ', 'Є',
        '¤', 'ι', 'Ђ', 'Ў', 'ї', 'ћ', '▒', 'ﺎ', '\xad', 'æ',
    )
    has_bad_symbol = any(symbol in sentence for symbol in incorrect_symbols)
    return not has_bad_symbol

In [4]:
# Keep only the rows whose reference text passes check(). The boolean mask is
# used directly, so no temporary "check" column is written to the frame and no
# `== True` comparison is needed.
train_df = train_df[train_df["correct_text"].apply(check)]

In [5]:
# Plain Python list of the reference sentences; tokenized in a later cell to
# build the n-gram tables.
train_text = train_df["correct_text"].tolist()

In [6]:
# Tokens that are dropped from a sentence before n-grams are built.
punct = {'!', '.', ',', '?', '/', '-', ';', '"', "'", ':', '..', '...', '--', '---', '""', "''", '…'}

def tokenize_sentence(sentence):
    """Tokenize ``sentence`` and wrap it in sentence-boundary markers.

    The text is split with ``word_tokenize``, every token that is a member of
    ``punct`` is discarded, and the remaining tokens are returned as a list
    framed by "<s>" at the start and "</s>" at the end.
    """
    words = [token for token in word_tokenize(sentence) if token not in punct]
    return ["<s>", *words, "</s>"]

In [7]:
# Tokenize every reference sentence, then preview the first three results.
tokenized_text = [tokenize_sentence(text) for text in train_text]
tokenized_text[:3]

[['<s>', 'Считает', 'что', 'сможет', 'жить', 'вечно', '</s>'],
 ['<s>', '20', 'миллионов', 'и', 'ни', 'пенни', 'меньше', '</s>'],
 ['<s>', 'Но', 'и', 'мы', 'умрём', '</s>']]

In [8]:
from collections import Counter

# Forward bigram table: word -> [(following word, count), ...], most frequent first.
d = {}
for sentence in tokenized_text:
    for first, second in ngrams(sentence, n=2):
        followers = d.get(first)
        if followers is None:
            followers = d[first] = Counter()
        followers[second] += 1
# Freeze each Counter into its full list of (word, count) pairs ordered by count.
d = {key: followers.most_common() for key, followers in d.items()}

In [9]:
# Reverse bigram table: word -> [(preceding word, count), ...], most frequent first.
dr = {}
for sentence in tokenized_text:
    for first, second in ngrams(sentence, n=2):
        predecessors = dr.get(second)
        if predecessors is None:
            predecessors = dr[second] = Counter()
        predecessors[first] += 1
# Freeze each Counter into its full list of (word, count) pairs ordered by count.
dr = {key: predecessors.most_common() for key, predecessors in dr.items()}

In [10]:
# Forward trigram table: (word1, word2) -> [(word3, count), ...], most frequent first.
d3 = {}
for sentence in tokenized_text:
    for first, second, third in ngrams(sentence, n=3):
        left_context = (first, second)
        followers = d3.get(left_context)
        if followers is None:
            followers = d3[left_context] = Counter()
        followers[third] += 1
# Freeze each Counter into its full list of (word, count) pairs ordered by count.
d3 = {key: followers.most_common() for key, followers in d3.items()}

In [11]:
# Reverse trigram table: (word2, word3) -> [(word1, count), ...], most frequent first.
dr3 = {}
for sentence in tokenized_text:
    for first, second, third in ngrams(sentence, n=3):
        right_context = (second, third)
        predecessors = dr3.get(right_context)
        if predecessors is None:
            predecessors = dr3[right_context] = Counter()
        predecessors[first] += 1
# Freeze each Counter into its full list of (word, count) pairs ordered by count.
dr3 = {key: predecessors.most_common() for key, predecessors in dr3.items()}

In [12]:
# "Window" trigram table: (word1, word3) -> [(word2, count), ...], most frequent first.
dw = {}
for sentence in tokenized_text:
    for first, second, third in ngrams(sentence, n=3):
        neighbours = (first, third)
        middles = dw.get(neighbours)
        if middles is None:
            middles = dw[neighbours] = Counter()
        middles[second] += 1
# Freeze each Counter into its full list of (word, count) pairs ordered by count.
dw = {key: middles.most_common() for key, middles in dw.items()}

In [13]:
# Corrupted counterparts of the training sentences. NOTE(review): train_X is
# not referenced again in the visible cells.
train_X = train_df["corrupted_text"].tolist()

In [14]:
# Load the private test set; its "corrupted_text" column is the input that is
# passed to predict() further down.
test_df = pd.read_csv(DATA_PATH + "private_test.csv")
test_X = test_df["corrupted_text"].tolist()

In [15]:
import re

def untokenize(words):
    """Join a token list back into a single string with natural spacing.

    Tokens are joined with single spaces, after which a fixed sequence of
    rewrites is applied: the `` and '' quote tokens become a double quote,
    ". . ." becomes "...", spaces inside parentheses are removed, the space
    before a run of . , : ; ? ! % is removed, apostrophes and "n't" are
    re-attached to the preceding word, "can not" becomes "cannot", and a lone
    backtick becomes an apostrophe. The result is stripped of outer whitespace.
    """
    # Literal replacements applied before the punctuation regexes, in order.
    leading_fixes = (
        ("`` ", '"'),
        (" ''", '"'),
        ('. . .', '...'),
        (" ( ", " ("),
        (" ) ", ") "),
    )
    # Literal replacements applied after the punctuation regexes, in order.
    trailing_fixes = (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
        (" ` ", " '"),
    )

    text = ' '.join(words)
    for old, new in leading_fixes:
        text = text.replace(old, new)
    # Punctuation followed by a space or quote character ...
    text = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", text)
    # ... and punctuation at the very end of the string.
    text = re.sub(r' ([.,:;?!%]+)$', r"\1", text)
    for old, new in trailing_fixes:
        text = text.replace(old, new)
    return text.strip()

In [21]:
from Levenshtein import distance
from tqdm.auto import tqdm

# Module-level state shared with predict().
# NOTE(review): `count` is never read or updated in the visible cells.
count = 0
# Accumulates one corrected sentence per row in its single "corrupted_text" column.
submit_df = pd.DataFrame(columns=["corrupted_text"])

def _closest_candidate(token, candidate_lists):
    """Pick the candidate word with the smallest edit distance to ``token``.

    ``candidate_lists`` is an ordered iterable of ``[(word, count), ...]``
    lists. Candidates are scanned in order and only a strictly smaller
    distance replaces the current best, so on ties the earliest candidate
    wins. The boundary markers "<s>" and "</s>" are never returned as a
    replacement; when no candidate exists, ``token`` itself is returned.
    """
    best, best_dist = token, float("inf")
    for candidates in candidate_lists:
        for value, _ in candidates:
            if value == "<s>" or value == "</s>":
                continue
            dist = distance(token, value)
            if dist < best_dist:
                best, best_dist = value, dist
    return best


def _correct_tokens(tokenized):
    """Replace unknown words in a "<s>" ... "</s>" token list; return the words only.

    A token is "known" when it is a key of the forward bigram table ``d``.
    Each unknown token is replaced by the closest candidate drawn, in this
    order, from: dw (both neighbours), d3 (two previous words), d (previous
    word), dr3 (two following words), dr (following word). Left context uses
    the already-corrected output; right context uses the raw input tokens.
    """
    y_pred = ["<s>"]
    last = len(tokenized) - 1  # index of the closing "</s>"
    for i in range(1, last):
        token = tokenized[i]
        if token in d:
            y_pred.append(token)
            continue
        prev_word, next_word = y_pred[-1], tokenized[i + 1]
        candidate_lists = [dw.get((prev_word, next_word), ())]
        if i > 1:
            candidate_lists.append(d3.get((y_pred[-2], prev_word), ()))
        candidate_lists.append(d.get(prev_word, ()))
        if i < last - 1:
            candidate_lists.append(dr3.get((next_word, tokenized[i + 2]), ()))
        candidate_lists.append(dr.get(next_word, ()))
        y_pred.append(_closest_candidate(token, candidate_lists))
    return y_pred[1:]


def _restore_sentence(text, words):
    """Rebuild a sentence from ``text``'s tokens, substituting corrected ``words``.

    Tokens that consist only of ``punct`` members (after re-tokenizing the
    token) are copied through unchanged. Every other token consumes the next
    entry of ``words``; the token's first character is prepended when it is in
    ``punct``, and the word is title-cased when the original token was
    title-cased or it is the first word, unless the word contains a hyphen.
    """
    answer = []
    j = 0
    for token in word_tokenize(text):
        punct_only = all(piece in punct for piece in word_tokenize(token))
        if punct_only or token == ' ':
            answer.append(token)
            continue
        # Guard: if the two tokenization passes ever disagree and the corrected
        # words run out, keep the original token instead of aborting the run.
        word = words[j] if j < len(words) else token
        if token[0] in punct:
            word = token[0] + word
        if (token.istitle() or j == 0) and '-' not in word:
            word = word.title()
        answer.append(word)
        j += 1
    return untokenize(answer)


def predict(eval_X):
    """Correct every sentence in ``eval_X`` and append the results to ``submit_df``.

    Parameters
    ----------
    eval_X : iterable of str
        Corrupted sentences.

    Returns
    -------
    None
        Corrected sentences are appended, one per row, to the global
        ``submit_df`` (which is rebound to the extended frame).

    Relies on the module-level n-gram tables (d, dr, d3, dr3, dw), ``punct``,
    ``tokenize_sentence`` and ``untokenize``.
    """
    global submit_df
    corrected_rows = []
    try:
        for X in tqdm(eval_X):
            words = _correct_tokens(tokenize_sentence(X))
            corrected_rows.append([_restore_sentence(X, words)])
    finally:
        # Append once instead of growing the frame row by row (which copies the
        # frame on every insert). Doing it in `finally` keeps the rows computed
        # so far if the loop is interrupted or fails part-way through.
        new_rows = pd.DataFrame(corrected_rows, columns=submit_df.columns)
        if submit_df.empty:
            submit_df = new_rows
        else:
            submit_df = pd.concat([submit_df, new_rows], ignore_index=True)

In [22]:
# Run the corrector over every test sentence; results accumulate in the global submit_df.
predict(test_X)

  0%|          | 0/56526 [00:00<?, ?it/s]

In [23]:
# Rename the single column to the value stored in its first row. The next cell
# drops row 0 before writing, so the CSV header line then carries that first
# prediction. NOTE(review): presumably the expected file has no real header row — confirm.
submit_df = submit_df.rename(columns={"corrupted_text": submit_df["corrupted_text"][0]})

In [24]:
# Write every row except row 0 (its value already serves as the column header).
# The index is not written (index=False), so no reset_index round trip is
# needed; that round trip also dropped the data column whenever the header
# text happened to be "index".
submit_df.drop(index=0).to_csv(OUTPUT_PATH + "private.csv", index=False)