# Convert text files to a data frame and save as parquet
It is easier to perform one IO operation on a relatively small file (the output is ~19MB for the training dataset) than to read >10,000 very small text files. Admittedly, the benefit for data of this relatively small size is minimal, but having a single data frame of values makes it easier to perform further analysis, cleaning, and classification. For this purpose we'll read all the files and dump them into a parquet file.

In [None]:
import glob
import pandas as pd
import os

# Collect the paths of the raw text files for each split. glob returns
# matches in arbitrary (filesystem-dependent) order, so sort them to keep the
# row order of the resulting data frames reproducible between runs.
train_files = sorted(glob.glob('../input/feedback-prize-2021/train/*.txt'))
test_files = sorted(glob.glob('../input/feedback-prize-2021/test/*.txt'))

def read_txts(files):
    '''
    Helper function to store all text read into a dictionary.

    Parameters
    ----------
    files : iterable of str
        Paths of the text files to read.

    Returns
    -------
    dict
        Maps each file's base name (directory removed, trailing '.txt'
        extension removed) to a one-element list holding the file's full
        text. The one-element list lets each entry act as a single-column
        row, e.g. for `pd.DataFrame.from_dict(..., orient='index')`.

    Notes
    -----
    Files are decoded as UTF-8.
    '''
    suffix = '.txt'
    file_dict = {}
    for file in files:
        # basename copes with whichever separator(s) the path contains.
        filename = os.path.basename(file)
        # Remove the extension as a suffix. `str.strip('.txt')` would instead
        # strip any leading/trailing '.', 't' or 'x' characters and corrupt
        # names such as 'text.txt' (-> 'e').
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(file, 'r', encoding='utf-8') as f:
            file_dict[filename] = [f.read()]
    return file_dict

# Generate dataframes: one row per file, indexed by the file's base name,
# with the file's full contents in a single 'text' column.
train_df = pd.DataFrame.from_dict(read_txts(train_files), orient='index', columns=['text'])
test_df = pd.DataFrame.from_dict(read_txts(test_files), orient='index', columns=['text'])

# Save dataframes. exist_ok=True lets this cell be re-run without raising
# FileExistsError once the output directory has been created.
os.makedirs('./feedback-prize-2021/', exist_ok=True)
train_df.to_parquet('./feedback-prize-2021/train_all.parquet')
test_df.to_parquet('./feedback-prize-2021/test_all.parquet')