## Pre-processing

### Load essays and tag every word in the essay

In [69]:
import pandas as pd 
import os
import string
import re
from tqdm import tqdm

In [70]:
# Load every training essay from disk.

# Directory that is scanned for `<essay_id>.txt` files by the loop below.
essay_dir = '../data/feedback-prize-2021/train'

# Accumulates one (essay_id, list_of_words) tuple per essay; converted to a DataFrame after the loop.
essays_data = []

def remove_punctuation(text):
    """Return `text` with every character that is neither a word character nor whitespace removed.

    Letters, digits and underscores (regex `\\w`) and all whitespace (`\\s`) are kept as-is.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    return stripped

for filename in tqdm(os.listdir(essay_dir), desc="Processing essays"):
    if not filename.endswith('.txt'):
        continue

    essay_id = filename[:-4]  # drop the '.txt' suffix
    file_path = os.path.join(essay_dir, filename)

    # Explicit encoding so the essays decode the same way on every platform
    # (the default for open() depends on the locale).
    with open(file_path, 'r', encoding='utf-8') as file:
        essay_text = file.read()

    # Split FIRST, then strip punctuation word by word. The annotation
    # `predictionstring` indices count whitespace-separated tokens of the raw
    # text, so the number of words must not change. Stripping punctuation from
    # the whole text before splitting silently drops standalone punctuation
    # tokens (e.g. " - ") and shifts every later word index.
    # A token made only of punctuation is kept unchanged (`or word`) so that no
    # empty strings end up in the word list.
    essay_words = [remove_punctuation(word) or word for word in essay_text.split()]
    essays_data.append((essay_id, essay_words))

essays_df = pd.DataFrame(essays_data, columns=['id', 'essay_text'])

Processing essays: 100%|██████████| 15594/15594 [00:03<00:00, 4661.47it/s]


In [71]:
# Spot-check a single essay: select its word list by id and print it.
text = essays_df.loc[essays_df["id"] == '0A0AA9C21C5D', "essay_text"].iloc[0]
print(text)

['Dear', 'state', 'senator', 'Many', 'people', 'believe', 'that', 'the', 'Electoral', 'College', 'should', 'be', 'abolished', 'while', 'others', 'believe', 'that', 'the', 'Electoral', 'College', 'should', 'stay', 'as', 'it', 'is', 'However', 'what', 'most', 'people', 'who', 'want', 'to', 'keep', 'the', 'electoral', 'college', 'do', 'not', 'know', 'is', 'that', 'when', 'you', 'vote', 'for', 'a', 'presidential', 'candidate', 'you', 'are', 'actually', 'voting', 'for', 'a', 'slate', 'of', 'electors', 'who', 'in', 'turn', 'elect', 'the', 'president', 'Which', 'means', 'that', 'the', 'people', 'do', 'not', 'get', 'a', 'direct', 'vote', 'towards', 'the', 'president', 'Therefore', 'it', 'can', 'cause', 'disinterest', 'in', 'people', 'who', 'are', 'eligible', 'to', 'vote', 'That', 'is', 'why', 'I', 'argue', 'in', 'favor', 'of', 'changing', 'to', 'election', 'by', 'popular', 'vote', 'for', 'the', 'president', 'of', 'the', 'United', 'States', 'The', 'first', 'reason', 'as', 'to', 'why', 'I', 'fav

In [72]:
# Display the essays frame: one row per essay with its id and list of words.
essays_df

Unnamed: 0,id,essay_text
0,3321A3E87AD3,"[I, do, agree, that, some, students, would, be..."
1,DFEAEC512BAB,"[Should, students, design, a, summer, project,..."
2,2E4AFCD3987F,"[Dear, State, Senator, In, the, ruels, of, vot..."
3,EB6C2AF20BFE,"[People, sometimes, have, a, different, opinio..."
4,A91A08E523D5,"[Dear, senator, As, you, know, the, Electoral,..."
...,...,...
15589,1C899F124FEB,"[While, some, students, may, think, its, a, be..."
15590,4453444AF383,"[There, has, been, a, strong, arguement, going..."
15591,EF0D75BF48DA,"[I, favor, in, to, changing, election, by, pop..."
15592,8FFDA5B9D359,"[Do, you, think, students, would, benefit, fro..."


In [73]:
# Load the annotation table; the tagging loop later reads its `id`,
# `predictionstring` and `discourse_type` columns.
train = pd.read_csv('../data/feedback-prize-2021/train.csv')

In [74]:
# Preview the first rows of the annotation table.
train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [75]:
# Create the 'tags' column up front (all None) so the tagging loop below can
# store one list of tags per row with `.at`.
essays_df['tags'] = None


def create_tag_array(text):
    """Build the initial tag list for an essay: one 'O' entry per element of `text`."""
    return ['O' for _ in range(len(text))]

def update_tags(tags, predictionstring, discourse_type, len_essay):
    """Write `discourse_type` into `tags` at every index listed in `predictionstring`.

    Args:
        tags: list of per-word tags; modified in place.
        predictionstring: whitespace-separated integer indices into `tags`.
        discourse_type: value stored at each listed index.
        len_essay: exclusive upper bound for a valid index.

    Returns:
        The same `tags` list. Processing stops at the first index that is
        >= `len_essay`; indices listed after it are not applied.
    """
    # Convert everything up front so a malformed index fails before any tag is touched.
    positions = [int(piece) for piece in predictionstring.split()]
    for position in positions:
        if position >= len_essay:
            break
        tags[position] = discourse_type
    return tags

# Index the annotations by essay id ONCE. Filtering `train` with a boolean mask
# inside the loop rescans the whole annotation table for every essay, which is
# what made this cell take over a minute. Row order within each essay is
# preserved, so later annotations still overwrite earlier ones.
annotations_by_essay = {}
for essay_id, predictionstring, discourse_type in zip(
    train['id'], train['predictionstring'], train['discourse_type']
):
    annotations_by_essay.setdefault(essay_id, []).append((predictionstring, discourse_type))

for index, row in tqdm(essays_df.iterrows(), total=essays_df.shape[0], desc="Tagging"):
    essay_words = row['essay_text']
    tags = create_tag_array(essay_words)

    # Essays without any annotation keep their all-'O' tag list.
    for predictionstring, discourse_type in annotations_by_essay.get(row['id'], []):
        tags = update_tags(tags, predictionstring, discourse_type, len(essay_words))

    if len(tags) != len(essay_words):
        print(f"Length mismatch: {len(tags)} tags, {len(row['essay_text'])} words")

    # `.at` stores the whole list in a single cell of the 'tags' column.
    essays_df.at[index, 'tags'] = tags


Tagging: 100%|██████████| 15594/15594 [01:11<00:00, 217.91it/s]


In [76]:
# Sanity check on the first essay: the word count and the tag count should be equal.
first_essay = essays_df.iloc[0]
print(len(first_essay["essay_text"]))
print(len(first_essay["tags"]))

471
471


### Tokenize essays and align tags correctly

In [77]:
# Checkpoint the essays frame (ids, word lists, per-word tags) to disk;
# it is reloaded from this path before tokenization.
essays_df.to_pickle('../data/preprocess_step1.pkl')

In [78]:
# Count essays whose tag list and word list differ in length.
mismatch_count = 0

for index, row in essays_df.iterrows():
    # Both 'essay_text' and 'tags' hold one list per row; matching lengths need no report.
    if len(row['tags']) == len(row['essay_text']):
        continue
    mismatch_count += 1
    print(f"Mismatch in row {index}: {len(row['tags'])} tags, {len(row['essay_text'])} words")

print(f"Total mismatches found: {mismatch_count}")

Total mismatches found: 0


In [79]:
from transformers import LongformerTokenizer

# Reload the step-1 checkpoint written earlier in this notebook.
# NOTE: pickle files can execute code on load; only read pickles you created yourself.
essays_df = pd.read_pickle('../data/preprocess_step1.pkl')

# Re-verify after the round trip that every essay still has one tag per word.
mismatch_count = 0

for index, row in essays_df.iterrows():
    lengths_agree = len(row['tags']) == len(row['essay_text'])
    if not lengths_agree:
        mismatch_count += 1
        print(f"Mismatch in row {index}: {len(row['tags'])} tags, {len(row['essay_text'])} words")

print(f"Total mismatches found: {mismatch_count}")

Total mismatches found: 0


In [80]:
# Tokenizer loaded from the 'allenai/longformer-base-4096' checkpoint.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Collects one dict per essay ('id', 'tokens', 'aligned_tags'), filled by the loop later in this cell.
tokenized_data = []

def align_tags_with_tokens(tags, words, tokenized_words):
    """Expand per-word tags to per-token tags.

    Every sub-word token receives the tag of the word it belongs to, so the
    result has one tag per token (unless the tokens describe more words than
    there are tags, in which case the surplus tokens are dropped).

    Two sub-word conventions are recognised:

    * byte-level BPE (GPT-2 / RoBERTa / Longformer): a token that begins a new
      whitespace-separated word starts with 'Ġ' (U+0120); tokens without it
      continue the previous word. This mode is used whenever any token carries
      the 'Ġ' marker.
    * WordPiece (BERT): a token starting with '##' continues the previous
      word; every other token begins a new word. Used when no 'Ġ' is present.

    Args:
        tags: list with one tag per word.
        words: the words that were joined with single spaces and tokenized.
            Only its length is consulted: with exactly one word, every token
            is attributed to that word. May be None.
        tokenized_words: list of token strings produced by the tokenizer.

    Returns:
        List of tags aligned with `tokenized_words`.
    """
    bpe_word_start = "\u0120"  # 'Ġ', the byte-level BPE encoding of a leading space
    wordpiece_continuation = "##"

    single_word = words is not None and len(words) == 1
    byte_level_bpe = any(token.startswith(bpe_word_start) for token in tokenized_words)

    aligned_tags = []
    word_index = -1  # index of the word the current token belongs to

    for token in tokenized_words:
        if word_index < 0:
            # The very first token always opens the first word, whatever its prefix.
            starts_word = True
        elif single_word:
            starts_word = False
        elif byte_level_bpe:
            starts_word = token.startswith(bpe_word_start)
        else:
            starts_word = not token.startswith(wordpiece_continuation)

        if starts_word:
            word_index += 1

        # Check bounds only after advancing, so continuation pieces of the
        # last tagged word are still emitted.
        if word_index >= len(tags):
            break

        aligned_tags.append(tags[word_index])

    return aligned_tags

# Tokenize each essay and spread its per-word tags over the resulting tokens.
for _, row in tqdm(essays_df.iterrows(), total=essays_df.shape[0], desc="Tokenizing and Aligning"):
    essay_words = row['essay_text']

    # The word list is re-joined with single spaces before it is handed to the tokenizer.
    tokens = tokenizer.tokenize(' '.join(essay_words))
    aligned_tags = align_tags_with_tokens(row['tags'], essay_words, tokens)

    record = {
        'id': row['id'],
        'tokens': tokens,
        'aligned_tags': aligned_tags
    }
    tokenized_data.append(record)

# One row per essay, with columns 'id', 'tokens' and 'aligned_tags'.
tokenized_df = pd.DataFrame(tokenized_data)



Tokenizing and Aligning: 100%|██████████| 15594/15594 [00:17<00:00, 884.04it/s]


In [81]:
# Display the tokenized frame: per essay, its tokens and the tags aligned to them.
tokenized_df

Unnamed: 0,id,tokens,aligned_tags
0,3321A3E87AD3,"[I, Ġdo, Ġagree, Ġthat, Ġsome, Ġstudents, Ġwou...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
1,DFEAEC512BAB,"[Should, Ġstudents, Ġdesign, Ġa, Ġsummer, Ġpro...","[O, O, O, O, O, O, O, O, Position, Position, P..."
2,2E4AFCD3987F,"[Dear, ĠState, ĠSenator, ĠIn, Ġthe, Ġru, els, ...","[O, O, O, O, Position, Position, Position, Pos..."
3,EB6C2AF20BFE,"[People, Ġsometimes, Ġhave, Ġa, Ġdifferent, Ġo...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
4,A91A08E523D5,"[Dear, Ġsenator, ĠAs, Ġyou, Ġknow, Ġthe, ĠElec...","[O, O, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
...,...,...,...
15589,1C899F124FEB,"[While, Ġsome, Ġstudents, Ġmay, Ġthink, Ġits, ...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
15590,4453444AF383,"[There, Ġhas, Ġbeen, Ġa, Ġstrong, Ġarg, u, eme...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
15591,EF0D75BF48DA,"[I, Ġfavor, Ġin, Ġto, Ġchanging, Ġelection, Ġb...","[Position, Position, Position, Position, Posit..."
15592,8FFDA5B9D359,"[Do, Ġyou, Ġthink, Ġstudents, Ġwould, Ġbenefit...","[Position, Position, Position, Position, Posit..."


In [82]:
# Checkpoint the tokenized essays with their aligned tags to disk.
tokenized_df.to_pickle('../data/preprocess_step2.pkl')