In [1]:
"""
Change the working directory to the repository root so that
files and folders can be referenced with short relative paths.

Also add the repository's src folder to the import path, so that
any code that has been moved to .py files for reuse can be imported.
"""

import os
import sys

# Location of the repository. The default is the path on the original compute
# instance; set the ESW_REPOSITORY_PATH environment variable to run the
# notebook from another checkout without editing this cell.
REPOSITORY_PATH = os.environ.get(
    'ESW_REPOSITORY_PATH',
    '/mnt/batch/tasks/shared/LS_root/mounts/clusters/mlgpu2/code/Users/santiago.a.diez/evaluating-student-writing-kaggle-challenge',
)
os.chdir(REPOSITORY_PATH)

# Register src by absolute path so imports keep working even if the working
# directory changes later, and do not stack duplicate entries when this cell
# is re-run.
SRC_PATH = os.path.abspath('src')
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

In [2]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

from eswkg.config import Config

In [7]:
def read_essay(essay_id, train_folder = Config.get_all_file_paths()["train_folder"]):
    """Return the full text of a single essay.

    The essay is read from ``<train_folder>/<essay_id>.txt``.

    Parameters
    ----------
    essay_id : str
        File name of the essay without the ``.txt`` extension.
    train_folder : str
        Folder that contains the essay files. The default is the
        "train_folder" entry of ``Config.get_all_file_paths()``, evaluated
        once when this function is defined.

    Returns
    -------
    str
        Contents of the essay file.
    """
    essay_path = train_folder + f"/{essay_id}.txt"
    with open(essay_path) as essay_file:
        return essay_file.read()

def read_essays(train_txt):
    """Load a collection of essay files into a DataFrame.

    Parameters
    ----------
    train_txt : iterable of str
        Paths of the essay ``.txt`` files to load.

    Returns
    -------
    pandas.DataFrame
        One row per file, in input order, with columns "id" (the file name
        without its final extension) and "text" (the file contents). The
        frame has both columns even when ``train_txt`` is empty.
    """
    essay_records = []
    for train_txt_file in train_txt:
        essay_id = os.path.basename(train_txt_file).rsplit(".",1)[0]

        # Open the path that was actually passed in. Rebuilding the path from
        # the id and a default folder would silently read from the train
        # folder even when the caller supplied files from somewhere else
        # (e.g. the test folder).
        with open(train_txt_file) as essay_file:
            essay_records.append((essay_id, essay_file.read()))
    return pd.DataFrame(essay_records, columns=["id", "text"])

def get_essay_entities(essay_text, essay_metadata):
    """Build one BIO-style label per whitespace-separated token of an essay.

    Parameters
    ----------
    essay_text : str
        Essay text; tokens are obtained with ``essay_text.split()``.
    essay_metadata : DataFrame or mapping
        Provides parallel "discourse_type" and "predictionstring" sequences.
        Each predictionstring is a space-separated list of token indices
        belonging to one discourse element.

    Returns
    -------
    list of str
        A label for every token: ``"B-<type>"`` for the first index of a
        predictionstring, ``"I-<type>"`` for its remaining indices and
        ``"O"`` for tokens no predictionstring mentions. When annotations
        overlap, the later one wins.

    Raises
    ------
    IndexError
        If a predictionstring refers to a token index beyond the essay length.
    """
    essay_entities = ["O"] * len(essay_text.split())
    annotations = zip(essay_metadata["discourse_type"], essay_metadata["predictionstring"])
    for discourse_type, predictionstring in annotations:
        # A missing predictionstring is not a string (for example NaN coming
        # from an empty CSV field); there is nothing to label for it.
        if not isinstance(predictionstring, str):
            continue

        token_indices = [int(token_index) for token_index in predictionstring.split()]
        # An empty or whitespace-only predictionstring has no first token.
        if not token_indices:
            continue

        first_index, *inside_indices = token_indices
        essay_entities[first_index] = f"B-{discourse_type}"
        for token_index in inside_indices:
            essay_entities[token_index] = f"I-{discourse_type}"

    return essay_entities

def tag_essays(essays, essays_metadata):
    """Attach per-token entity labels to every essay.

    Parameters
    ----------
    essays : pandas.DataFrame
        One row per essay with columns "id" and "text".
    essays_metadata : pandas.DataFrame
        Annotation rows with an "id" column identifying the essay, plus the
        "discourse_type" and "predictionstring" columns that
        ``get_essay_entities`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per essay, in the order of ``essays``, with columns "id",
        "text" and "entities" (the label list from ``get_essay_entities``).
        Essays without any annotation rows get labels that are all "O".
    """
    # Split the annotations by essay once, instead of scanning the whole
    # metadata table again for every essay.
    metadata_by_essay = dict(iter(essays_metadata.groupby("id", sort=False)))
    # Zero-row frame with the right columns, used for essays with no annotations.
    no_annotations = essays_metadata.iloc[0:0]

    tagged_essays_list = []
    for essay_id, essay_text in zip(essays["id"], essays["text"]):
        essay_metadata = metadata_by_essay.get(essay_id, no_annotations)
        essay_entities = get_essay_entities(essay_text, essay_metadata)

        tagged_essays_list.append(
            {
                "id": essay_id,
                "text": essay_text,
                "entities": essay_entities
            }
        )
    return pd.DataFrame(tagged_essays_list, columns=["id", "text", "entities"])


In [4]:
# Look up the project's file locations once. The cells below index this
# result with the keys "train", "sample_submission", "train_folder",
# "test_folder" and "intermediate".
file_paths = Config.get_all_file_paths()

In [11]:
# Annotation table. The id and character-offset columns are cast to integers
# on the frame that was just loaded; the cast previously targeted a name
# (`train`) that is never defined in this notebook, which raises a NameError
# on a fresh kernel.
essays_metadata = pd.read_csv(file_paths["train"]).astype(
    {'discourse_id': int, 'discourse_start': int, 'discourse_end': int}
)

sample_submission = pd.read_csv(file_paths["sample_submission"])

# glob returns every path matching a Unix-shell-style pattern, in arbitrary
# order; sort the results so the file order is the same on every run.
train_txt = sorted(glob(file_paths["train_folder"] + "/*.txt"))
test_txt = sorted(glob(file_paths["test_folder"] + "/*.txt"))

In [8]:
# Set to True to force train_text.csv to be rebuilt from the raw essay files
# even when a cached copy already exists.
create_train_text_file = False

train_text_path = file_paths["intermediate"]+"/train_text.csv"
if create_train_text_file or not os.path.isfile(train_text_path):
    # Build the cache when asked to, and also when it is missing: otherwise
    # `essays` would never be defined and the following cells would fail.
    print(f"Creating {train_text_path} from {len(train_txt)} essay files.")
    os.makedirs(os.path.dirname(train_text_path), exist_ok=True)
    read_essays(train_txt).to_csv(train_text_path, index=False)

# Always load from the CSV so `essays` is the same whether or not the cache
# was rebuilt in this run.
essays = pd.read_csv(train_text_path)

In [33]:
# Label every essay: the result has one row per essay with its id, its text
# and the list of entity labels that tag_essays builds for it.
tagged_essays = tag_essays(essays=essays, essays_metadata=essays_metadata)