## Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the preprocessing cells below read
# their input files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Train, Validation and Test Set Indices

In [None]:
# Integer indices assigned to each split. Later cells turn every index into a
# file name of the form "<folder_name>_<index>.txt" and read that file.

# Indices used for the training split.
train_set = [10, 18, 20, 22, 23, 24, 25, 26, 31, 32, 34, 37, 46, 48, 49, 50, 51, 58, 61, 73, 94, 101, 102, 108, 110, 114, 115, 123, 127, 129, 130, 140, 156, 170, 175, 180, 191, 211, 213, 216, 218, 223, 232, 237, 241, 243, 246, 247, 272, 273, 279, 291, 301, 310, 311, 317]

# Indices used for the test split.
test_set = [4, 30, 38, 52, 65, 71, 77, 91, 97, 100, 106, 107, 109, 119, 121, 149, 166, 172, 174, 184, 192, 199, 200, 215, 221, 227, 250, 253, 258, 259, 278, 286, 309]

# Indices used for the validation split.
validation_set = [45, 53, 56, 59, 76, 85, 88, 98, 103, 105, 142, 143, 157, 163, 164, 171, 217, 224, 229, 235, 308, 325, 330]

In [None]:
# Size of each split, kept in the same three module-level names.
train_set_length, test_set_length, validation_set_length = (
    len(split) for split in (train_set, test_set, validation_set)
)

# Report the sizes in train / test / validation order.
for label, size in (
    ("Length of the train set:", train_set_length),
    ("Length of the testing set:", test_set_length),
    ("Length of the validation set:", validation_set_length),
):
    print(label, size)

Length of the train set: 56
Length of the testing set: 33
Length of the validation set: 23


## Utility Functions

In [None]:
# Function to read contents of a file and filter out empty lines
def read_file_content(file_path):
    """Return the non-blank lines of a text file joined by single spaces.

    Every line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The remaining lines joined with ' ', or '' when the file has no
        non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Strip each line once, then keep only the non-empty results.
        stripped_lines = (line.strip() for line in file)
        return ' '.join(line for line in stripped_lines if line)

# Function to group files based on their indexes
def group_files_by_index(folder_path, indexes, folder_name):
    """Read one file per index and return the contents in index order.

    For every entry of ``indexes`` the file
    ``{folder_path}/{folder_name}_{index}.txt`` is read with
    ``read_file_content``.

    Args:
        folder_path: Directory that contains the files.
        indexes: Iterable of indices; one file is read per entry.
        folder_name: Prefix of the file names inside ``folder_path``.

    Returns:
        list[str]: One content string per entry of ``indexes``, in the same
        order. An empty ``indexes`` yields an empty list.

    Raises:
        OSError: Propagated from ``read_file_content`` when a file cannot be
            opened.
    """
    # Build the result positionally. Looking each slot up with
    # ``indexes.index(index)`` was quadratic and, for a repeated index, only
    # ever filled the first matching slot, leaving later ones empty.
    return [
        read_file_content(f"{folder_path}/{folder_name}_{index}.txt")
        for index in indexes
    ]

# Function to write the grouped contents to output files
def write_output_file(file_name, content):
    """Write ``content`` to ``file_name``, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        content: Text to write.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Encode explicitly as UTF-8 so the bytes on disk do not depend on the
    # platform's default locale encoding.
    with open(file_name, 'w', encoding='utf-8') as output_file:
        output_file.write(content)

## Preprocessing

In [None]:
# Source folder and file-name prefix for this cell's input files.
folder_path = "/content/drive/MyDrive/NLP_Project/PSAT/Original"
folder_name = 'original'

# Read all three splits first (train, test, validation), then write them out.
train_group, test_group, valid_group = (
    group_files_by_index(folder_path, split_indexes, folder_name)
    for split_indexes in (train_set, test_set, validation_set)
)

# Each output file holds one document per line.
for output_name, grouped_contents in (
    ('PSAT.train.complex', train_group),
    ('PSAT.test.complex', test_group),
    ('PSAT.valid.complex', valid_group),
):
    write_output_file(output_name, '\n'.join(grouped_contents))

In [None]:
# Source folder and file-name prefix for this cell's input files.
folder_path = "/content/drive/MyDrive/NLP_Project/PSAT/Simplified"
folder_name = 'simplified'

# Read all three splits first (train, test, validation), then write them out.
train_group, test_group, valid_group = (
    group_files_by_index(folder_path, split_indexes, folder_name)
    for split_indexes in (train_set, test_set, validation_set)
)

# Each output file holds one document per line.
for output_name, grouped_contents in (
    ('PSAT.train.simple', train_group),
    ('PSAT.test.simple', test_group),
    ('PSAT.valid.simple', valid_group),
):
    write_output_file(output_name, '\n'.join(grouped_contents))

## Download Preprocessed Dataset

In [None]:
import zipfile

# Files to bundle, in the order they are added to the archive.
file_names = ['PSAT.train.complex', 'PSAT.test.complex', 'PSAT.valid.complex','PSAT.train.simple', 'PSAT.test.simple', 'PSAT.valid.simple']

# Create a zip file. ZipFile defaults to ZIP_STORED (no compression), so
# request DEFLATE explicitly to actually shrink these plain-text files.
with zipfile.ZipFile('PSAT.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    # Add each file to the zip
    for file_name in file_names:
        zipf.write(file_name)

In [None]:
# Download PSAT.zip (the archive created in the previous cell) through
# Colab's `files` helper.
from google.colab import files
files.download('PSAT.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>