In [4]:
"""
Changing working directory to repository path 
in order to make simpler references to files/folder.

Also, adding src folder in the repository to import
any code that has been moved to py files for reusability
"""

import os
import sys

REPOSITORY_PATH = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/mlgpu2/code/Users/santiago.a.diez/evaluating-student-writing-kaggle-challenge'

# Work from the repository root so every later path can be repository-relative.
os.chdir(REPOSITORY_PATH)

# 'src' is a relative entry: it is resolved against the working directory set
# on the line above, and goes first so the repository's modules win on import.
sys.path.insert(0, 'src')

In [21]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

from eswkg.config import Config

In [6]:
def read_essay(essay_id, train_folder = Config.get_all_file_paths()["train_folder"]):
    """Return the full text of a single essay file.

    Args:
        essay_id: Name of the essay file without its ``.txt`` extension.
        train_folder: Folder containing the essay files. The default is
            looked up from ``Config`` once, when this function is defined.

    Returns:
        The content of ``<train_folder>/<essay_id>.txt`` as one string.
    """
    essay_path = f"{train_folder}/{essay_id}.txt"
    with open(essay_path) as essay_file:
        return essay_file.read()


def read_essays(train_txt):
    """Load a collection of essay files into a DataFrame, one row per essay.

    Args:
        train_txt: Iterable of paths to ``<essay_id>.txt`` files.

    Returns:
        pandas.DataFrame with columns ``id`` (file name without its
        extension) and ``text`` (the file's content), in input order.
    """
    essay_ids, essay_texts = [], []
    for essay_path in train_txt:
        essay_id = os.path.basename(essay_path).rsplit(".", 1)[0]
        # Read from the folder the path actually points to instead of
        # read_essay's default train folder; otherwise paths from any other
        # folder (e.g. the test set) would load a different file or fail.
        # `or "."` covers bare file names, whose dirname is the empty string.
        essay_folder = os.path.dirname(essay_path) or "."

        essay_ids.append(essay_id)
        essay_texts.append(read_essay(essay_id, essay_folder))
    return pd.DataFrame({"id": essay_ids, "text": essay_texts})


def get_essay_entities(essay_text, essay_metadata):
    """Build one B-/I-/O tag per whitespace-separated token of an essay.

    Args:
        essay_text: The essay as a single string; it is tokenised with
            ``str.split()``.
        essay_metadata: Object indexable by ``"discourse_type"`` and
            ``"predictionstring"``, yielding parallel sequences. Each
            prediction string is a space-separated list of integer token
            indices.

    Returns:
        A list with one tag per token: ``"B-<type>"`` on the first index of
        each prediction string, ``"I-<type>"`` on its remaining indices and
        ``"O"`` on every token no prediction string refers to.
    """
    tags = ["O" for _ in essay_text.split()]

    annotated_spans = zip(
        essay_metadata["discourse_type"],
        essay_metadata["predictionstring"],
    )
    for label, span in annotated_spans:
        token_indices = [int(token_index) for token_index in span.split()]

        # The first index opens the span; every following index continues it.
        tags[token_indices[0]] = f"B-{label}"
        for token_index in token_indices[1:]:
            tags[token_index] = f"I-{label}"

    return tags


def tag_essays(essays, essays_metadata):
    """Attach a token-level entity tag list to every essay.

    Args:
        essays: DataFrame with the columns ``id`` and ``text``.
        essays_metadata: DataFrame with an ``id`` column plus the columns
            read by ``get_essay_entities`` (``discourse_type`` and
            ``predictionstring``).

    Returns:
        pandas.DataFrame with the columns ``id``, ``text`` and ``entities``,
        one row per row of ``essays`` and in the same order. The columns are
        present even when ``essays`` is empty.
    """
    # Split the metadata by essay once, instead of scanning the whole table
    # again for every essay. Row order inside each group is preserved.
    metadata_by_essay = {
        essay_id: essay_metadata
        for essay_id, essay_metadata in essays_metadata.groupby("id", sort=False)
    }
    # Essays without any metadata row get an empty frame, i.e. all "O" tags.
    no_metadata = essays_metadata.iloc[0:0]

    tagged_essays_list = []
    for essay_id, essay_text in zip(essays["id"], essays["text"]):
        essay_metadata = metadata_by_essay.get(essay_id, no_metadata)
        tagged_essays_list.append(
            {
                "id": essay_id,
                "text": essay_text,
                "entities": get_essay_entities(essay_text, essay_metadata),
            }
        )
    return pd.DataFrame(tagged_essays_list, columns=["id", "text", "entities"])


In [7]:
# Look up the project's file/folder paths once; the cells below index the
# result by key (e.g. "train", "train_folder", "intermediate").
file_paths = Config.get_all_file_paths()

In [9]:
essays_metadata = pd.read_csv(file_paths["train"])

# Cast the discourse id and start/end columns to int.
integer_columns = ['discourse_id', 'discourse_start', 'discourse_end']
essays_metadata[integer_columns] = essays_metadata[integer_columns].astype(int)

sample_submission = pd.read_csv(file_paths["sample_submission"])

# glob expands a Unix-shell-style wildcard pattern into the list of matching
# path names; here, every .txt file in the train and test folders.
train_txt = glob(f'{file_paths["train_folder"]}/*.txt')
test_txt = glob(f'{file_paths["test_folder"]}/*.txt')

In [14]:
def generate_file(generation_func, file_path, generate_file=False, *args):
    """Optionally (re)build a CSV file, then load it as a DataFrame.

    Args:
        generation_func: Callable returning an object with a ``to_csv``
            method. It is only called when ``generate_file`` is truthy.
        file_path: Path of the CSV file to write and/or read.
        generate_file: When truthy, call ``generation_func(*args)`` and write
            its result to ``file_path`` (without the index) before reading.
        *args: Positional arguments forwarded to ``generation_func``.

    Returns:
        The DataFrame read back from ``file_path``, or ``None`` when a
        ``FileNotFoundError`` occurred (the error is printed, not raised).

    Raises:
        Exception: Any other error is printed and then re-raised unchanged.
    """
    try:
        if generate_file:
            generated = generation_func(*args)
            generated.to_csv(file_path, index=False)
        loaded = pd.read_csv(file_path)
    except FileNotFoundError as err:
        # A missing file is reported but not raised; the caller receives None.
        print(f"{err}, {type(err)}")
        return None
    except Exception as err:
        print(f"Unexpected {err}, {type(err)}")
        raise
    return loaded

In [17]:
# True rebuilds the CSV from the raw essay files; False only reads the CSV.
create_essays_file = False
essays_file_path = f'{file_paths["intermediate"]}/train_text.csv'

essays = generate_file(
    read_essays,
    essays_file_path,
    create_essays_file,
    train_txt,
)

In [18]:
# True rebuilds the CSV by tagging every essay; False only reads the CSV.
create_essay_entities_file = False
essay_entities_file_path = f'{file_paths["model_input"]}/essays_NER.csv'

essays_entities = generate_file(
    tag_essays,
    essay_entities_file_path,
    create_essay_entities_file,
    essays,
    essays_metadata,
)
# The frame is read back from CSV, so each value in "entities" is parsed with
# literal_eval to turn its text form back into a Python object.
essays_entities["entities"] = essays_entities["entities"].apply(literal_eval)

In [22]:
# Display the tagged essays: one row per essay with its id, text and entity tags.
essays_entities

Unnamed: 0,id,text,entities
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[O, O, B-Position, I-Position, I-Position, I-P..."
3,001552828BD0,Would you be able to give your car up? Having ...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0016926B079C,I think that students would benefit from learn...,"[B-Position, I-Position, I-Position, I-Positio..."
...,...,...,...
15589,FFF1442D6698,"Every student looks forward to summer break, i...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
15590,FFF1ED4F8544,Many citizens argue that the Electoral college...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
15591,FFF868E06176,"Every summer break, students are given project...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
15592,FFFD0AF13501,"In the article ""A Cowboy Who Rode the Waves"" L...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
