## Pre-processing

### Load essays and tag every word in the essay

In [23]:
import pandas as pd 
import os
import string
import re
from tqdm import tqdm

In [70]:
# Loading all essays/texts

essay_dir = '../data/feedback-prize-2021/train'

essays_data = []

def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

for filename in tqdm(os.listdir(essay_dir), desc="Processing essays"):
    if filename.endswith('.txt'):
        essay_id = filename[:-4]
        file_path = os.path.join(essay_dir, filename)
        with open(file_path, 'r') as file:
            essay_text = file.read()
            essay_text_no_punct = remove_punctuation(essay_text)
            essays_data.append((essay_id, essay_text_no_punct.split()))

essays_df = pd.DataFrame(essays_data, columns=['id', 'essay_text'])

Processing essays: 100%|██████████| 15594/15594 [00:03<00:00, 4661.47it/s]


In [71]:
text = (essays_df[essays_df["id"] == '0A0AA9C21C5D']["essay_text"]).iloc[0]
print(text)

['Dear', 'state', 'senator', 'Many', 'people', 'believe', 'that', 'the', 'Electoral', 'College', 'should', 'be', 'abolished', 'while', 'others', 'believe', 'that', 'the', 'Electoral', 'College', 'should', 'stay', 'as', 'it', 'is', 'However', 'what', 'most', 'people', 'who', 'want', 'to', 'keep', 'the', 'electoral', 'college', 'do', 'not', 'know', 'is', 'that', 'when', 'you', 'vote', 'for', 'a', 'presidential', 'candidate', 'you', 'are', 'actually', 'voting', 'for', 'a', 'slate', 'of', 'electors', 'who', 'in', 'turn', 'elect', 'the', 'president', 'Which', 'means', 'that', 'the', 'people', 'do', 'not', 'get', 'a', 'direct', 'vote', 'towards', 'the', 'president', 'Therefore', 'it', 'can', 'cause', 'disinterest', 'in', 'people', 'who', 'are', 'eligible', 'to', 'vote', 'That', 'is', 'why', 'I', 'argue', 'in', 'favor', 'of', 'changing', 'to', 'election', 'by', 'popular', 'vote', 'for', 'the', 'president', 'of', 'the', 'United', 'States', 'The', 'first', 'reason', 'as', 'to', 'why', 'I', 'fav

In [72]:
essays_df

Unnamed: 0,id,essay_text
0,3321A3E87AD3,"[I, do, agree, that, some, students, would, be..."
1,DFEAEC512BAB,"[Should, students, design, a, summer, project,..."
2,2E4AFCD3987F,"[Dear, State, Senator, In, the, ruels, of, vot..."
3,EB6C2AF20BFE,"[People, sometimes, have, a, different, opinio..."
4,A91A08E523D5,"[Dear, senator, As, you, know, the, Electoral,..."
...,...,...
15589,1C899F124FEB,"[While, some, students, may, think, its, a, be..."
15590,4453444AF383,"[There, has, been, a, strong, arguement, going..."
15591,EF0D75BF48DA,"[I, favor, in, to, changing, election, by, pop..."
15592,8FFDA5B9D359,"[Do, you, think, students, would, benefit, fro..."


In [15]:
import pandas as pd

# Load your dataset
df = pd.read_csv('../data/feedback-prize-2021/train.csv')


# Sort the DataFrame by 'id' and 'discourse_start' to ensure proper order
df = df.sort_values(by=['id', 'discourse_start'])

# Initialize a dictionary to hold the essays
essays = {}

# Process each essay
for _, row in df.iterrows():
    essay_id = row['id']
    discourse_words = row['discourse_text'].split()
    discourse_tags = [row['discourse_type']] * len(discourse_words)

    if essay_id not in essays:
        essays[essay_id] = {'words': [], 'tags': []}

    essays[essay_id]['words'].extend(discourse_words)
    essays[essay_id]['tags'].extend(discourse_tags)

# Convert the dictionary to a DataFrame
essays_df = pd.DataFrame([(essay_id, data['words'], data['tags']) for essay_id, data in essays.items()],
                         columns=['id', 'essay_text', 'tags'])

# Display the first few rows of the DataFrame
print(essays_df.head())



             id                                         essay_text  \
0  0000D23A521A  [Some, people, belive, that, the, so, called, ...   
1  00066EA9880D  [Driverless, cars, are, exaclty, what, you, wo...   
2  000E6DE9E817  [I, am, arguing, against, the, policy, change,...   
3  001552828BD0  [Would, you, be, able, to, give, your, car, up...   
4  0016926B079C  [I, think, that, students, would, benefit, fro...   

                                                tags  
0  [Position, Position, Position, Position, Posit...  
1  [Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea...  
2  [Position, Position, Position, Position, Posit...  
3  [Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea...  
4  [Position, Position, Position, Position, Posit...  


### Tokenize essays and align tags correctly

In [17]:
essays_df.to_pickle('../data/preprocess_step1.pkl')

In [18]:
mismatch_count = 0

for index, row in essays_df.iterrows():
    # Assuming 'essay_text' is an array of words and 'tags' is an array of tags
    if len(row['tags']) != len(row['essay_text']):
        mismatch_count += 1
        # Optionally, print details about the mismatches
        print(f"Mismatch in row {index}: {len(row['tags'])} tags, {len(row['essay_text'])} words")

print(f"Total mismatches found: {mismatch_count}")

Total mismatches found: 0


In [21]:
from transformers import LongformerTokenizer

essays_df = pd.read_pickle('../data/preprocess_step1.pkl')

mismatch_count = 0

for index, row in essays_df.iterrows():
    # Assuming 'essay_text' is an array of words and 'tags' is an array of tags
    if len(row['tags']) != len(row['essay_text']):
        mismatch_count += 1
        # Optionally, print details about the mismatches
        print(f"Mismatch in row {index}: {len(row['tags'])} tags, {len(row['essay_text'])} words")

print(f"Total mismatches found: {mismatch_count}")

Total mismatches found: 0


In [24]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

tokenized_data = []

def align_tags_with_tokens(tags, words, tokenized_words):
    aligned_tags = []
    word_index = 0 

    for token in tokenized_words:
        if word_index >= len(tags):  
            break 

        if token.startswith("##"):
            aligned_tags.append(tags[word_index - 1])
        else:
            aligned_tags.append(tags[word_index])
            word_index += 1

    return aligned_tags

for _, row in tqdm(essays_df.iterrows(), total=essays_df.shape[0], desc="Tokenizing and Aligning"):
    tokens = tokenizer.tokenize(' '.join(row['essay_text']))

    aligned_tags = align_tags_with_tokens(row['tags'], row['essay_text'], tokens)

    tokenized_data.append({
        'id': row['id'],
        'tokens': tokens,
        'aligned_tags': aligned_tags
    })

tokenized_df = pd.DataFrame(tokenized_data)



Tokenizing and Aligning: 100%|██████████| 15594/15594 [00:17<00:00, 884.20it/s]


In [25]:
tokenized_df

Unnamed: 0,id,tokens,aligned_tags
0,0000D23A521A,"[Some, Ġpeople, Ġbel, ive, Ġthat, Ġthe, Ġso, Ġ...","[Position, Position, Position, Position, Posit..."
1,00066EA9880D,"[Driver, less, Ġcars, Ġare, Ġex, acl, ty, Ġwha...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
2,000E6DE9E817,"[I, Ġam, Ġarguing, Ġagainst, Ġthe, Ġpolicy, Ġc...","[Position, Position, Position, Position, Posit..."
3,001552828BD0,"[Would, Ġyou, Ġbe, Ġable, Ġto, Ġgive, Ġyour, Ġ...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
4,0016926B079C,"[I, Ġthink, Ġthat, Ġstudents, Ġwould, Ġbenefit...","[Position, Position, Position, Position, Posit..."
...,...,...,...
15589,FFF1442D6698,"[Every, Ġstudent, Ġlooks, Ġforward, Ġto, Ġsumm...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
15590,FFF1ED4F8544,"[Many, Ġcitizens, Ġargue, Ġthat, Ġthe, ĠElecto...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
15591,FFF868E06176,"[Every, Ġsummer, Ġbreak, ,, Ġstudents, Ġare, Ġ...","[Lead, Lead, Lead, Lead, Lead, Lead, Lead, Lea..."
15592,FFFD0AF13501,"[they, Ġget, Ġto, Ġsee, Ġtons, Ġof, Ġawesome, ...","[Claim, Claim, Claim, Claim, Claim, Claim, Cla..."


In [27]:
tokenized_df.to_pickle('../data/preprocess_step2.pkl')

In [29]:
print(tokenized_df[tokenized_df['id']=='0000D23A521A'])

             id                                             tokens  \
0  0000D23A521A  [Some, Ġpeople, Ġbel, ive, Ġthat, Ġthe, Ġso, Ġ...   

                                        aligned_tags  
0  [Position, Position, Position, Position, Posit...  


In [None]:
''