In [None]:
import os
import re
import glob

import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns

from tqdm import tqdm
from itertools import cycle

# Switch to the ggplot look, then expose its colour palette both as a plain
# list and as an endlessly repeating iterator.
plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

In [None]:
# Root directory of the competition data; every path below is built from it.
PATH = "../input/feedback-prize-2021"

# Annotation table and the sample submission file.
train_df = pd.read_csv(f'{PATH}/train.csv')
ss_df = pd.read_csv(f'{PATH}/sample_submission.csv')

# Paths of the individual .txt files in the train and test folders.
train_txt = glob.glob(f"{PATH}/train/*.txt")
test_txt = glob.glob(f"{PATH}/test/*.txt")

# Quick sanity check: table shapes and number of text files found.
train_df.shape, ss_df.shape, len(train_txt), len(test_txt)

In [None]:
# Map each text id (file name without its extension) to the full file content.
train_txt_dict = {}
for txt_path in train_txt:
    with open(txt_path, 'r') as file:
        data = file.read()

    # Use os.path rather than splitting on '/' and '.': glob joins the final
    # path component with the platform separator (a backslash on Windows), and
    # only the last extension should be dropped from the file name.
    txt_id = os.path.splitext(os.path.basename(txt_path))[0]
    train_txt_dict[txt_id] = data

len(train_txt_dict)

In [None]:
# Preview the first rows of the annotation table loaded from train.csv.
train_df.head()

In [None]:
def get_discourse_text(txt_id, start, end):
    """Return the characters of text ``txt_id`` between two offsets.

    Args:
        txt_id: Key into the module-level ``train_txt_dict``.
        start: Inclusive start offset; converted with ``int()``.
        end: Exclusive end offset; converted with ``int()``.

    Returns:
        The substring ``text[int(start):int(end)]``.

    Raises:
        KeyError: If ``txt_id`` is not present in ``train_txt_dict``.
    """
    essay = train_txt_dict[txt_id]
    span = slice(int(start), int(end))
    return essay[span]

# Re-extract every span from the loaded text using the annotated offsets so it
# can be compared with the discourse_text column.
train_df["discourse_start_end_text"] = train_df.apply(
    lambda row: get_discourse_text(row["id"], row["discourse_start"], row["discourse_end"]),
    axis=1,
)
train_df.head()

In [None]:
# Rows whose stored discourse_text differs from the offset-based slice.
text_mismatch = train_df["discourse_text"] != train_df["discourse_start_end_text"]
tmp_df = train_df[text_mismatch]
tmp_df.shape

In [None]:
# Strip surrounding whitespace from both text columns before comparing again.
# The vectorised .str accessor replaces a per-element apply and leaves missing
# values as NaN instead of raising on them.
train_df['discourse_start_end_text'] = train_df['discourse_start_end_text'].str.strip()
train_df['discourse_text'] = train_df['discourse_text'].str.strip()

# Rows that still differ once surrounding whitespace is ignored.
temp_df = train_df.query("discourse_text !=  discourse_start_end_text")
temp_df.shape

In [None]:
def get_compare_last_char(discourse_text, discourse_start_end_text):
    """Describe how the offset-based slice relates to the stored text.

    Args:
        discourse_text: Text stored in the annotation table.
        discourse_start_end_text: Text sliced out using the offsets.

    Returns:
        ``1`` when both texts are equal; the extra trailing characters of
        ``discourse_start_end_text`` when it starts with ``discourse_text``;
        ``np.nan`` for any other kind of mismatch.
    """
    if not (discourse_text != discourse_start_end_text):
        return 1

    cut = len(discourse_text)
    head, tail = discourse_start_end_text[:cut], discourse_start_end_text[cut:]
    if discourse_text == head:
        return tail
    return np.nan

# Record, per row, what trails the stored text inside the offset-based slice.
train_df['last_char'] = train_df.apply(
    lambda row: get_compare_last_char(row["discourse_text"], row["discourse_start_end_text"]),
    axis=1,
)

In [None]:
def remove_last_chars(discourse_text, discourse_start_end_text):
    """Trim the offset-based slice down to the stored text when it only
    carries extra trailing characters.

    Args:
        discourse_text: Text stored in the annotation table.
        discourse_start_end_text: Text sliced out using the offsets.

    Returns:
        The first ``len(discourse_text)`` characters of
        ``discourse_start_end_text`` when those characters equal
        ``discourse_text``; otherwise ``discourse_start_end_text`` unchanged.
    """
    if not (discourse_text != discourse_start_end_text):
        return discourse_start_end_text

    head = discourse_start_end_text[:len(discourse_text)]
    return head if discourse_text == head else discourse_start_end_text

# Offset-based slice with any surplus trailing characters removed.
train_df['discourse_start_end_text_last_char_rm'] = train_df.apply(
    lambda row: remove_last_chars(row["discourse_text"], row["discourse_start_end_text"]),
    axis=1,
)

In [None]:
# Rows that still disagree after the trailing characters were trimmed.
trimmed_mismatch = train_df["discourse_text"] != train_df["discourse_start_end_text_last_char_rm"]
tmp_df = train_df[trimmed_mismatch]
tmp_df.shape

## New Prediction String

In [None]:
# One-off manual patch of the row labelled 8916: swap 'florida' for the
# 'LOCATION_NAME' placeholder in its discourse_text.
# NOTE(review): presumably the essay file contains the placeholder rather than
# the place name, so the stored text cannot be located otherwise - confirm.
# The row is addressed by label on both sides so the read and the write cannot
# diverge if the index ever stops being the default RangeIndex.
train_df.loc[8916, 'discourse_text'] = train_df.loc[8916, 'discourse_text'].replace('florida', 'LOCATION_NAME')

In [None]:
def find_txt(id, discourse_text, discourse_start, discourse_end):
    """Locate ``discourse_text`` inside text ``id`` and return its offsets.

    The stripped ``discourse_text`` is searched literally in the raw text held
    in ``train_txt_dict``. When it occurs more than once, the occurrence whose
    start is closest to ``discourse_start`` wins; the earliest one is kept on
    a tie.

    The raw text is searched on purpose: the returned offsets are compared
    with ``discourse_start`` and used to slice the unstripped text, so they
    must share its coordinate system even when the file begins with
    whitespace.

    Args:
        id: Key into the module-level ``train_txt_dict``.
        discourse_text: Text to look for; surrounding whitespace is ignored.
        discourse_start: Annotated start offset used to choose between
            several occurrences.
        discourse_end: Unused; kept so existing callers keep working.

    Returns:
        ``(start, end)`` character offsets of the chosen occurrence, or
        ``(-1, -1)`` when the text does not occur or ``discourse_start``
        cannot be compared numerically.

    Raises:
        KeyError: If ``id`` is not present in ``train_txt_dict``.
    """
    text = train_txt_dict[id]
    # Escape the needle so punctuation in it is matched literally.
    pattern = re.escape(discourse_text.strip())

    best_start, best_end = -1, -1
    best_diff = float("inf")
    try:
        for match in re.finditer(pattern, text):
            diff = abs(match.start() - discourse_start)
            if diff < best_diff:
                best_start, best_end = match.start(), match.end()
                best_diff = diff
    except TypeError:
        # A non-numeric discourse_start cannot be ranked against the matches;
        # report "not found" instead of hiding every possible error.
        return -1, -1

    return best_start, best_end

# Search each row's text for its discourse_text and keep the (start, end) pair.
train_df["find_start_end"] = train_df.apply(
    lambda row: find_txt(
        row["id"],
        row["discourse_text"],
        row["discourse_start"],
        row["discourse_end"],
    ),
    axis=1,
)

In [None]:
# Split the (start, end) pairs into two separate columns.
train_df['new_start'] = train_df["find_start_end"].map(lambda span: span[0])
train_df['new_end'] = train_df["find_start_end"].map(lambda span: span[1])

In [None]:
# Absolute distance between the annotated offsets and the re-located ones.
train_df["start_diff"] = (train_df['discourse_start'] - train_df['new_start']).abs()
train_df["end_diff"] = (train_df['discourse_end'] - train_df['new_end']).abs()

In [None]:
# Empirical CDF of the start-offset shift, limited to rows that moved by more
# than one character.
fig, ax = plt.subplots(figsize=(25, 9))
sns.ecdfplot(data=train_df.query("start_diff > 1"), x="start_diff", ax=ax)
plt.show()

In [None]:
# new prediction string

def calc_word_indices(id, discourse_start, discourse_end):
    """Convert a character span into whitespace-token indices of text ``id``.

    The first index is the number of whitespace-separated tokens that lie
    before ``discourse_start``; the span contributes one index per token in
    ``full_text[discourse_start:discourse_end]``. Indices are capped at the
    number of tokens in the whole text, which matters when the span starts in
    the middle of a word and the naive count would run one past the end.

    Args:
        id: Key into the module-level ``train_txt_dict``.
        discourse_start: Start character offset; converted with ``int()``.
        discourse_end: End character offset (exclusive); converted with
            ``int()``.

    Returns:
        A list of consecutive token indices. The list is empty when the span
        contains no tokens or when either offset is negative (the ``-1``
        "not found" sentinel).

    Raises:
        KeyError: If ``id`` is not present in ``train_txt_dict``.
    """
    discourse_start = int(discourse_start)
    discourse_end = int(discourse_end)

    full_text = train_txt_dict[id]

    # Negative offsets are sentinels, not positions; slicing with them would
    # silently count from the end of the text.
    if discourse_start < 0 or discourse_end < 0:
        return []

    start_index = len(full_text[:discourse_start].split())
    token_len = len(full_text[discourse_start:discourse_end].split())
    total_words = len(full_text.split())

    # Clamping replaces "drop the last index if it overflows" and also copes
    # with an empty span, where there is no last index to inspect.
    stop_index = min(start_index + token_len, total_words)
    return list(range(start_index, stop_index))

In [None]:
# Token indices derived from the re-located character offsets.
train_df['new_predictionstring'] = train_df.apply(
    lambda row: calc_word_indices(row["id"], row["new_start"], row["new_end"]),
    axis=1,
)

In [None]:
# Preview the table again, now including the derived columns.
train_df.head()

In [None]:
# Write the full table, derived columns included, to the working directory;
# index=False keeps the row index out of the file.
train_df.to_csv("new_predictionstring.csv", index=False)