# Preprocessing Data

## Imports

In [None]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import tostring
import matplotlib.pyplot as plt
import csv

## Reading Raw MATRES

In [None]:
# Tab-separated relation annotation files, one per corpus. Despite the
# `directory_path_` prefix these point at single files, not directories.
directory_path_timebank = 'data/raw/timebank.txt'
directory_path_platinum = 'data/raw/platinum.txt'
directory_path_aquaint = 'data/raw/aquaint.txt'

In [None]:
# Directories holding the per-document `<docid>.tml` XML files of each corpus;
# the document text is read from these files further below.
directory_path_timebank_tmlr = 'data/raw/timebank/'
directory_path_platinum_tmlr = 'data/raw/platinum/'
directory_path_aquaint_tmlr = 'data/raw/aquaint/'

In [None]:
def read_data(file_path):
    """Read a tab-separated relation file into a list of dictionaries.

    The first line is treated as a header and skipped. Only rows with exactly
    six fields are kept; rows of any other length are silently dropped.

    NOTE(review): read_matres_data_to_dataframe reads the same kind of file
    with ``header=None`` (first line is data) -- confirm whether the files
    really start with a header line.

    Args:
        file_path: Path to the UTF-8 encoded, tab-delimited file.

    Returns:
        A list of dicts with the keys 'docid', 'verb1', 'verb2', 'eiid1',
        'eiid2' and 'relation'; all values are strings. An empty file yields
        an empty list.
    """
    fields = ('docid', 'verb1', 'verb2', 'eiid1', 'eiid2', 'relation')
    # newline='' leaves line-ending handling to the csv module.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return [
            dict(zip(fields, row))
            for row in reader
            if len(row) == len(fields)  # keep only well-formed rows
        ]

In [None]:
def read_matres_data_to_dataframe(file_path):
    """Load a header-less, tab-separated relation file into a DataFrame.

    Every string cell is stripped of leading/trailing whitespace; non-string
    cells are left untouched.

    Args:
        file_path: Path to the UTF-8 encoded, tab-delimited file.

    Returns:
        A DataFrame with the columns 'docid', 'verb1', 'verb2', 'eiid1',
        'eiid2' and 'relation', or None if the file could not be read or
        processed (the error is printed, not raised).
    """
    columns = ['docid', 'verb1', 'verb2', 'eiid1', 'eiid2', 'relation']

    def strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    try:
        df = pd.read_csv(file_path, delimiter='\t', header=None, names=columns, encoding='utf-8')
        # Series.map is used per column instead of DataFrame.applymap, which
        # is deprecated in recent pandas releases; the result is the same
        # element-wise strip.
        for name in df.columns:
            df[name] = df[name].map(strip_if_str)

        return df
    except Exception as e:
        # Best-effort loader: report the problem and let the caller see None.
        print(f"Error reading or processing file into DataFrame: {e}")
        return None

In [None]:
# Load the relation tables of the three corpora, in the order
# TimeBank, Platinum, AQUAINT.
timebank, platinum, aquaint = (
    read_matres_data_to_dataframe(annotation_path)
    for annotation_path in (
        directory_path_timebank,
        directory_path_platinum,
        directory_path_aquaint,
    )
)

In [None]:
def extract_clean_text(root):
    """Return the plain text of the first ``TEXT`` element below ``root``.

    Markup nested inside ``TEXT`` is dropped: the character data of the
    element and all of its descendants is concatenated in document order.

    Raises:
        AttributeError: If ``root`` contains no ``TEXT`` element.
    """
    text_node = root.find('.//TEXT')
    fragments = text_node.itertext()
    return ''.join(fragments)

In [None]:
# Attach the plain document text to every TimeBank row as a 'context' column.
# Many rows share a docid, so each .tml file is parsed once and the result is
# mapped back onto the rows; rows whose document could not be loaded get NaN.
timebank_contexts = {}
for docid in timebank['docid'].unique():
    file_path = os.path.join(directory_path_timebank_tmlr, f"{docid}.tml")
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue
    try:
        root = ET.parse(file_path).getroot()
        # strip() removes leading/trailing whitespace around the document text
        timebank_contexts[docid] = extract_clean_text(root).strip()
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    except Exception as e:
        # Best effort: one unreadable document must not abort the whole run.
        print(f"Unexpected error: {e}")
timebank['context'] = timebank['docid'].map(timebank_contexts)

In [None]:
# Attach the plain document text to every Platinum row as a 'context' column.
# Many rows share a docid, so each .tml file is parsed once and the result is
# mapped back onto the rows; rows whose document could not be loaded get NaN.
platinum_contexts = {}
for docid in platinum['docid'].unique():
    file_path = os.path.join(directory_path_platinum_tmlr, f"{docid}.tml")
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue
    try:
        root = ET.parse(file_path).getroot()
        # strip() removes leading/trailing whitespace around the document text
        platinum_contexts[docid] = extract_clean_text(root).strip()
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    except Exception as e:
        # Best effort: one unreadable document must not abort the whole run.
        print(f"Unexpected error: {e}")
platinum['context'] = platinum['docid'].map(platinum_contexts)

In [None]:
# Attach the plain document text to every AQUAINT row as a 'context' column.
# Many rows share a docid, so each .tml file is parsed once and the result is
# mapped back onto the rows; rows whose document could not be loaded get NaN.
aquaint_contexts = {}
for docid in aquaint['docid'].unique():
    file_path = os.path.join(directory_path_aquaint_tmlr, f"{docid}.tml")
    if not os.path.exists(file_path):
        print(f"File does not exist: {file_path}")
        continue
    try:
        root = ET.parse(file_path).getroot()
        # strip() removes leading/trailing whitespace around the document text
        aquaint_contexts[docid] = extract_clean_text(root).strip()
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    except Exception as e:
        # Best effort: one unreadable document must not abort the whole run.
        print(f"Unexpected error: {e}")
aquaint['context'] = aquaint['docid'].map(aquaint_contexts)

## Processing Raw MATRES

In [None]:
# Augment Platinum with a mirrored copy of every EQUAL row: the same row with
# verb1 and verb2 exchanged.
# NOTE(review): eiid1/eiid2 keep their original order in the mirrored rows --
# confirm whether they should be exchanged together with the verbs.
platinum_equal_rows = platinum.loc[platinum['relation'] == 'EQUAL']
platinum_equal = platinum_equal_rows.assign(
    verb1=platinum_equal_rows['verb2'],
    verb2=platinum_equal_rows['verb1'],
)
platinum = pd.concat([platinum, platinum_equal], ignore_index=True)

In [None]:
# Augment AQUAINT with a mirrored copy of every EQUAL row: the same row with
# verb1 and verb2 exchanged.
# NOTE(review): eiid1/eiid2 keep their original order in the mirrored rows --
# confirm whether they should be exchanged together with the verbs.
aquaint_equal = aquaint[aquaint['relation'] == 'EQUAL'].copy()
aquaint_equal['verb1'], aquaint_equal['verb2'] = aquaint_equal['verb2'], aquaint_equal['verb1']
# Append AQUAINT's own mirrored rows. Appending another corpus's rows here
# would both lose this augmentation and, since Platinum is the test split,
# leak test examples into the training data.
aquaint = pd.concat([aquaint, aquaint_equal], ignore_index=True)

In [None]:
# Augment TimeBank with a mirrored copy of every EQUAL row: the same row with
# verb1 and verb2 exchanged.
# NOTE(review): eiid1/eiid2 keep their original order in the mirrored rows --
# confirm whether they should be exchanged together with the verbs.
timebank_equal = timebank[timebank['relation'] == 'EQUAL'].copy()
timebank_equal['verb1'], timebank_equal['verb2'] = timebank_equal['verb2'], timebank_equal['verb1']
# Append TimeBank's own mirrored rows. Appending another corpus's rows here
# would both lose this augmentation and, since Platinum is the test split,
# leak test examples into the training data.
timebank = pd.concat([timebank, timebank_equal], ignore_index=True)

In [None]:
# Show the relation-label distribution of each corpus
# (TimeBank, AQUAINT, Platinum).
for corpus in (timebank, aquaint, platinum):
    print(corpus['relation'].value_counts())

In [None]:
#platinum.to_csv('data/interim/platinum.csv', index=False)
#aquaint.to_csv('data/interim/aquaint.csv', index=False)
#timebank.to_csv('data/interim/timebank.csv', index=False)

## Split Data

In [None]:
# Training split: TimeBank and AQUAINT rows stacked with a fresh 0..n-1 index.
train = pd.concat([timebank, aquaint], ignore_index=True)
# Test split: Platinum. `test` is the same object as `platinum` (no copy is
# made), so columns added to `test` later also appear on `platinum`.
test = platinum

In [None]:
# Show the relation-label distribution of the train and test splits.
for split in (train, test):
    print(split['relation'].value_counts())

## Process Split Data

In [None]:
# Integer class id for each relation name: the ids follow the order in which
# the names are listed here.
label_mapping = {
    relation: class_id
    for class_id, relation in enumerate(('BEFORE', 'AFTER', 'EQUAL', 'VAGUE'))
}

In [None]:
# Add the integer 'label' column to both splits; relation names missing from
# label_mapping become NaN.
for split in (train, test):
    split['label'] = split['relation'].map(label_mapping)

In [None]:
def process(df):
    """Project a relation table onto the columns of the final dataset.

    Selects and renames columns as follows (output order as listed):
    docid -> id, context -> context, verb1 -> eventA, verb2 -> eventB,
    relation -> label_temp, label -> label. All other columns are dropped
    and the input frame is not modified.

    Args:
        df: DataFrame containing at least the columns 'docid', 'context',
            'verb1', 'verb2', 'relation' and 'label'.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the six output
        columns; an empty input yields an empty frame that still carries
        those columns.

    Raises:
        KeyError: If any of the required input columns is missing.
    """
    # source column -> output column; dict order defines the output order
    column_names = {
        'docid': 'id',
        'context': 'context',
        'verb1': 'eventA',
        'verb2': 'eventB',
        'relation': 'label_temp',
        'label': 'label',
    }
    selected = df[list(column_names)]
    return selected.rename(columns=column_names).reset_index(drop=True)

In [None]:
# Convert both splits to the final dataset layout (train first, then test).
processed_train, processed_test = map(process, (train, test))

In [None]:
# Write the final datasets without the index column. The output directory is
# created first so a fresh checkout without data/processed does not fail.
os.makedirs('data/processed', exist_ok=True)
processed_train.to_csv('data/processed/train.csv', index=False)
processed_test.to_csv('data/processed/test.csv', index=False)