### Data Augmentation and Oversampling

The idea is to leverage the metadata and to follow the algorithm proposed by Hashemi et al. (2023) to get pairs of non-consecutive paragraphs (with and
without style changes). Then, classes will be oversampled to obtain a balanced data set.

Description of the algorithm: " incorporate additional non-consecutive pairs of paragraphs
into our sample set and assign them labels based on the inferred relationships. For example, if
there are three consecutive paragraphs without a style change, we can infer that the first and
third paragraphs are written by the same author. Similarly, if there are style changes between
the first and second paragraphs and between the second and third paragraphs, we can deduce
that the authors of the first and third paragraphs are different, given that the number of authors
in the document exceeds the number of style changes by one." (Hashemi et al. 2023: 4).

In [1]:
import os, json
import pandas as pd

In [6]:
BASE_DIR = '../data_pipeline/'

# get data sets 
df_train = pd.read_csv(os.path.join(BASE_DIR, "df_train.csv"), index_col=0)
df_val = pd.read_csv(os.path.join(BASE_DIR, "df_validation.csv"), index_col=0)

# check distribution of labels
changes_train = len(df_train[df_train['label_author'] == 1])
no_changes_train = len(df_train[df_train['label_author'] == 0])

changes_val = len(df_val[df_val['label_author'] == 1])
no_changes_val = len(df_val[df_val['label_author'] == 0])

total_data = len(df_train) + len(df_val)
train_ratio = len(df_train) / total_data
val_ratio = len(df_val) / total_data

print(f"Train-Val split is: {train_ratio:.2f}-{val_ratio:.2f}")


print(f"Number of rows where label_author == 0 (training data): {no_changes_train}")
print(f"Number of rows where label_author == 1 (training data): {changes_train}")

print(f"Number of rows where label_author == 0 (validation data): {no_changes_val}")
print(f"Number of rows where label_author == 1 (validation data): {changes_val}")

# check for duplicates

duplicate_train = df_train.duplicated().sum()

# Check for duplicates in the balanced validation dataset
duplicate_val = df_val.duplicated().sum()

print(f"Number of duplicate rows in balanced training dataset: {duplicate_rows_train}")
print(f"Number of duplicate rows in balanced validation dataset: {duplicate_rows_val}")

Train-Val split is: 0.82-0.18
Number of rows where label_author == 0 (training data): 20485
Number of rows where label_author == 1 (training data): 31508
Number of rows where label_author == 0 (validation data): 4489
Number of rows where label_author == 1 (validation data): 6709
Number of duplicate rows in balanced training dataset: 7957
Number of duplicate rows in balanced validation dataset: 1526


In [3]:
def data_augmentation(data):
    augmented_rows = []
    # problem is that fileindexes start from 1 for every label_dataset --> iterate over label_dataset first
    unique_datasets = data['label_dataset'].unique()
    for dataset in unique_datasets:
        dataset_data = data[data['label_dataset'] == dataset] # get subset of data for easy, medium, and hard
        unique_fileindexes = data['fileindex'].unique() # get unique fileindexes
        for file_index in unique_fileindexes:
            file_data = dataset_data[dataset_data['fileindex'] == file_index] # get DataFrame for file
            
            if (file_data['label_author'] == 1).sum() == (file_data["n_authors"].iloc[0] - 1):
                for i in range(len(file_data)-1):
                    row = file_data.iloc[i]
                    j = i + 1 # set next paragraph index

                    while (j < len(file_data)) and (file_data["label_author"].iloc[j-1] == 0):# while same author
                        if j > i:
                            augmented_rows.append({
                        'paragraph1': row['paragraph1'],
                        'paragraph2': file_data['paragraph2'].iloc[j],
                        'label_author': 0, # same author
                        'label_dataset': row['label_dataset'],
                        'n_authors': row['n_authors'],
                        'fileindex': row['fileindex']
                    })   
                        j +=1 # move to next paragraph
                    while j < len(file_data):
                        if j > i:
                            augmented_rows.append({
                        'paragraph1': row['paragraph1'],
                        'paragraph2': file_data['paragraph2'].iloc[j],
                        'label_author': 1, # style change
                        'label_dataset': row['label_dataset'],
                        'n_authors': row['n_authors'],
                        'fileindex': row['fileindex']
                    })
                        j += 1 # move to next paragraph
            
    # Create a new DataFrame with augmented rows
    augmented_df = pd.DataFrame(augmented_rows)
    return augmented_df

augmented_df_train = data_augmentation(df_train)
print(f"n data_train: {len(df_train)}")
print(f"created {len(augmented_df_train)} additional training data.")
augmented_df_train[:10]

augmented_df_val = data_augmentation(df_val)
print(f"n data_val: {len(df_val)}")
print(f"created {len(augmented_df_val)} additional validation data.")
augmented_df_val[:10]

add_training_df = pd.concat([df_train, augmented_df_train], ignore_index=True)
add_val_df = pd.concat([df_val, augmented_df_val], ignore_index=True)


# write into new files and drop column for n_authors
add_training_df.drop(columns=['n_authors']).to_csv('training_data_augmented.csv', index=True)

add_val_df.drop(columns=['n_authors']).to_csv('validation_data_augmented.csv', index=True)

# new train-val split

total_data_new = len(add_training_df) + len(add_val_df)
train_ratio_new = len(add_training_df) / total_data_new
val_ratio_new = len(add_val_df) / total_data_new

print(f"New Train-Val split is: {train_ratio_new:.2f}-{val_ratio_new:.2f}")

n data_train: 51993
created 47598 additional training data.
n data_val: 11198
created 11532 additional validation data.
New Train-Val split is: 0.81-0.19


### Balancing data set (create two new files for balanced data sets)

In [4]:
augmented_train = pd.read_csv("training_data_augmented.csv", index_col=0)
augmented_val = pd.read_csv("validation_data_augmented.csv", index_col=0)


# check distribution of labels
changes_train_aug = len(add_training_df[add_training_df['label_author'] == 1])
no_changes_train_aug = len(add_training_df[add_training_df['label_author'] == 0])

changes_val_aug = len(add_val_df[add_val_df['label_author'] == 1])
no_changes_val_aug = len(add_val_df[add_val_df['label_author'] == 0])

print(f"Number of rows where label_author == 0 (training data): {no_changes_train_aug}")
print(f"Number of rows where label_author == 1 (training data): {changes_train_aug}")

print(f"Number of rows where label_author == 0 (validation data): {no_changes_val_aug}")
print(f"Number of rows where label_author == 1 (validation data): {changes_val_aug}")


# original train-validation split 0.82-0.12

def balance_data(train,val):
    '''balance data by swapping paragraph1 and paragraph2 and oversample the minority class'''
    
    total_data = len(train) + len(val)
    ratio_train = len(train) / total_data
    ratio_val = len(val) / total_data
    
    change_train = len(train[train['label_author'] == 1])
    no_change_train = len(train[train['label_author'] == 0]) # minority class
    change_val = len(val[val['label_author'] == 1])
    no_change_val = len(val[val['label_author'] == 0]) # minority class
    
    balanced_rows_train = []
    balanced_rows_val = []
    
    for i in range(change_train-no_change_train):
        row = train[train['label_author'] == 0].sample(n=1, replace=False).iloc[0] # take random sample without replacement from rows with no changes
        balanced_rows_train.append({
                        'paragraph1': row['paragraph2'],
                        'paragraph2': row['paragraph1'],
                        'label_author': 0, # no style change (minority class) 
                        'label_dataset': row['label_dataset'],
                        'fileindex': row['fileindex']
                    })
    
    for i in range(change_val-no_change_val):
        row = val[val['label_author'] == 0].sample(n=1, replace=False).iloc[0]
        balanced_rows_val.append({
                        'paragraph1': row['paragraph2'],
                        'paragraph2': row['paragraph1'],
                        'label_author': 0, # no style change (minority class) 
                        'label_dataset': row['label_dataset'],
                        'fileindex': row['fileindex']
                    })
    
    balanced_train = pd.DataFrame(balanced_rows_train)
    balanced_val = pd.DataFrame(balanced_rows_val)
    return balanced_train, balanced_val

balanced_train, balanced_val = balance_data(augmented_train, augmented_val)

balanced_train = pd.concat([augmented_train, balanced_train], ignore_index=True)
balanced_val = pd.concat([augmented_val, balanced_val], ignore_index=True)

# write into new files and drop column for n_authors
balanced_train.to_csv('balanced_train.csv', index=True)
balanced_val.to_csv('balanced_val.csv', index=True)

increase_train = (len(balanced_train) - len(df_train)) / len(df_train) * 100
increase_val = (len(balanced_val) - len(df_val)) / len(df_val) * 100

print()
print(f"Created {increase_train:.2f}% more training and {increase_val:.2f}% more validation data through creating new paragraph pairs and swapping paragraphs to oversample minority classes.")

total_data = len(balanced_train) + len(balanced_val)
train_ratio = len(balanced_train) / total_data
val_ratio = len(balanced_val) / total_data

print()
print(f"Train-Val split is: {train_ratio:.2f}-{val_ratio:.2f}") # ensure split stays the same (actually unnecessary)


Number of rows where label_author == 0 (training data): 36791
Number of rows where label_author == 1 (training data): 62800
Number of rows where label_author == 0 (validation data): 8599
Number of rows where label_author == 1 (validation data): 14131

Created 141.57% more training and 152.38% more validation data through creating new paragraph pairs and swapping paragraphs to oversample minority classes.

Train-Val split is: 0.82-0.18


In [5]:
# DELETE THIS
print(len(df_train)) # original training data
print(len(add_training_df)) # after adding new paragraphs 
print(len(balanced_train)) # after balancing classes by oversampling minority class (no change)

# Check for duplicates in the balanced training dataset
duplicate_rows_train = balanced_train.duplicated().sum()

# Check for duplicates in the balanced validation dataset
duplicate_rows_val = balanced_val.duplicated().sum()

print(f"Number of duplicate rows in balanced training dataset: {duplicate_rows_train}")
print(f"Number of duplicate rows in balanced validation dataset: {duplicate_rows_val}")

51993
99591
125600
Number of duplicate rows in balanced training dataset: 7957
Number of duplicate rows in balanced validation dataset: 1526
