### Data Loading, Splitting, and Saving Splits

In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the full set of extracted sentences with their definition labels.
# NOTE(review): the input is read from an absolute, machine-specific path while
# the splits below are written relative to the working directory — confirm the
# notebook is always run from that folder.
data = pd.read_csv("/home/roland/Projects/saras_folder/extracted_definitions.csv")

# First cut: hold out 20% as the test set, stratified on the label so the
# class ratio is preserved.
train_val_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['definition_sentence'],
)

# Second cut: 20% of the remaining 80% becomes validation
# (roughly 64% / 16% / 20% train / validation / test overall).
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.2,
    random_state=42,
    stratify=train_val_data['definition_sentence'],
)

# Persist every split so later cells can reload them from disk.
for split_frame, split_path in (
    (train_data, "train_split_original.csv"),
    (val_data, "val_split_original.csv"),
    (test_data, "test_split_original.csv"),
):
    split_frame.to_csv(split_path, index=False)


### Class Distribution Check

In [7]:
# Row counts for the full dataset and for each split.
for size_title, frame in (
    ("Total dataset size", data),
    ("Training set size", train_data),
    ("Validation set size", val_data),
    ("Test set size", test_data),
):
    print(f"{size_title}: {len(frame)}")

# Label counts per split, to confirm stratification kept the class ratio.
for heading, frame in (
    ("\nClass distribution in training set:", train_data),
    ("\nClass distribution in validation set:", val_data),
    ("\nClass distribution in test set:", test_data),
):
    print(heading)
    print(frame['definition_sentence'].value_counts())


Total dataset size: 31244
Training set size: 19996
Validation set size: 4999
Test set size: 6249

Class distribution in training set:
definition_sentence
False    18631
True      1365
Name: count, dtype: int64

Class distribution in validation set:
definition_sentence
False    4658
True      341
Name: count, dtype: int64

Class distribution in test set:
definition_sentence
False    5823
True      426
Name: count, dtype: int64


### Data Augmentation of Each Split Independently

##### This section defines the augmentation methods (synonym replacement and contextual embedding) applied to definitional sentences.

In [8]:
import spacy
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet

# English spaCy pipeline ("en_core_web_sm"), loaded once at module level;
# synonym_augmenter uses it to tokenize text and read each token's POS tag.
spacy_nlp = spacy.load("en_core_web_sm")

def synonym_augmenter(text):
    """Return `text` with content words replaced by WordNet synonyms.

    The text is tokenized with the module-level `spacy_nlp` pipeline. Each
    token tagged NOUN, VERB, ADJ or ADV is looked up in WordNet *using the
    same part of speech*, and replaced by the first lemma of its first
    (most frequent) synset that actually differs from the token. Tokens with
    no such lemma, and all other tokens, are kept unchanged.

    Args:
        text: The sentence to augment.

    Returns:
        The augmented sentence as a string, with the original inter-token
        whitespace preserved (no extra spaces before punctuation).

    Note:
        Replacements are WordNet base forms; inflection (plural, tense) of
        the replaced word is not restored.
    """
    # spaCy coarse POS tag -> WordNet POS constant, so that e.g. a noun is
    # never replaced using a verb sense of the same spelling.
    wordnet_pos = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
    }

    pieces = []
    for token in spacy_nlp(text):
        replacement = token.text
        pos = wordnet_pos.get(token.pos_)
        if pos is not None:
            synsets = wordnet.synsets(token.text, pos=pos)
            if synsets:
                # The first lemma of a synset is frequently the looked-up
                # word itself (or its base form); skip those so the token is
                # really substituted instead of silently left as-is.
                unchanged_forms = {token.text.lower(), token.lemma_.lower()}
                for lemma in synsets[0].lemmas():
                    candidate = lemma.name().replace('_', ' ')
                    if candidate.lower() not in unchanged_forms:
                        replacement = candidate
                        break
        # Re-attach the whitespace that followed the token in the input so
        # punctuation and contractions are not split apart by extra spaces.
        pieces.append(replacement + token.whitespace_)
    return "".join(pieces)

# nlpaug contextual word-embedding augmenter backed by `bert-base-uncased`.
# action="insert" asks it to insert words into the sentence; augment_split
# calls it once per definitional sentence.
# NOTE(review): no random seed is set here, so the augmented text presumably
# differs between runs — confirm whether reproducibility is required.
contextual_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

# Augmentation
def augment_split(data, split_name):
    """Augment the definitional sentences of one split and return the result.

    For every row whose `definition_sentence` equals 1, two new sentences are
    generated from `sentence_original`: one with the module-level
    `contextual_aug` augmenter and one with `synonym_augmenter`. The new
    sentences are labelled 1, appended to the untouched original rows, and
    the combined frame is shuffled with a fixed seed.

    Args:
        data: DataFrame with at least the columns `sentence_original` and
            `definition_sentence`.
        split_name: Label used only in the progress-bar description.

    Returns:
        A new shuffled DataFrame (fresh 0..n-1 index) containing the original
        rows plus two augmented rows per definitional sentence. The augmented
        rows carry only `sentence_original` and `definition_sentence`; any
        other columns of `data` are left missing for them.
    """
    definition_data = data[data['definition_sentence'] == 1]
    non_definition_data = data[data['definition_sentence'] == 0]

    augmented_sentences = []
    for text in tqdm(definition_data['sentence_original'], desc=f"Augmenting {split_name} definitions"):
        contextual = contextual_aug.augment(text)
        # Newer nlpaug releases return a list of strings even for a single
        # input string. Unwrap it so the column holds plain text rather than
        # list objects (which would be written to CSV as "['...']").
        if isinstance(contextual, (list, tuple)):
            # Fall back to the source sentence if nothing was produced, so
            # every definition still contributes exactly two new rows.
            contextual = contextual[0] if len(contextual) > 0 else text
        augmented_sentences.append(contextual)
        augmented_sentences.append(synonym_augmenter(text))

    augmented_definitions = pd.DataFrame({
        'sentence_original': augmented_sentences,
        'definition_sentence': 1
    })

    # Combine original and augmented rows, then shuffle deterministically.
    combined_data = pd.concat([non_definition_data, definition_data, augmented_definitions], ignore_index=True)
    return combined_data.sample(frac=1, random_state=42).reset_index(drop=True)


##### Each data split (train, validation, test) is augmented independently to ensure no data leakage between splits

In [10]:

# Reload the un-augmented splits from disk (paths are relative to the
# working directory, matching where the split cell wrote them).
train_data, val_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_split_original.csv",
        "val_split_original.csv",
        "test_split_original.csv",
    )
)

# Augment each split independently, so augmented variants of a sentence
# stay in the same split as the sentence they were derived from.
augmented_train_data = augment_split(train_data, "train")
augmented_val_data = augment_split(val_data, "validation")
augmented_test_data = augment_split(test_data, "test")

# Save the augmented splits next to the originals.
for augmented_frame, out_name in (
    (augmented_train_data, "train_split_augmented.csv"),
    (augmented_val_data, "val_split_augmented.csv"),
    (augmented_test_data, "test_split_augmented.csv"),
):
    augmented_frame.to_csv(out_name, index=False)

print("Augmentation complete. Augmented splits saved!")


Augmenting train definitions: 100%|██████████| 1365/1365 [06:15<00:00,  3.64it/s]
Augmenting validation definitions: 100%|██████████| 341/341 [01:31<00:00,  3.71it/s]
Augmenting test definitions: 100%|██████████| 426/426 [01:57<00:00,  3.62it/s]


Augmentation complete. Augmented splits saved!


### Augmented Data Validation: Class Distribution and Split Sizes

In [11]:
import pandas as pd

# Load augmented splits
# Reload the augmented splits from disk.
# NOTE(review): these are absolute paths, whereas the augmentation cell writes
# the files relative to the working directory — confirm both refer to the
# same folder, otherwise this cell validates stale files.
augmented_train_data = pd.read_csv("/home/roland/Projects/saras_folder/train_split_augmented.csv")
augmented_val_data = pd.read_csv("/home/roland/Projects/saras_folder/val_split_augmented.csv")
augmented_test_data = pd.read_csv("/home/roland/Projects/saras_folder/test_split_augmented.csv")

augmented_splits = (
    ("training", augmented_train_data),
    ("validation", augmented_val_data),
    ("test", augmented_test_data),
)

# Row count of each augmented split.
for split_label, split_frame in augmented_splits:
    print(f"Size of augmented {split_label} data: {len(split_frame)}")

# Label counts of each augmented split.
for split_label, split_frame in augmented_splits:
    print(f"\nClass distribution in augmented {split_label} data:")
    print(split_frame['definition_sentence'].value_counts())


Size of augmented training data: 22726
Size of augmented validation data: 5681
Size of augmented test data: 7101

Class distribution in augmented training data:
definition_sentence
0    18631
1     4095
Name: count, dtype: int64

Class distribution in augmented validation data:
definition_sentence
0    4658
1    1023
Name: count, dtype: int64

Class distribution in augmented test data:
definition_sentence
0    5823
1    1278
Name: count, dtype: int64
