In [1]:
from wcmatch import wcmatch
import os
from tqdm import tqdm

In [6]:
# function to extract sentences from text files
def extract_sentences(file_path):
    # This is what will be returned from the function with Empty as a placeholder
    final_content = "Empty"
    # The files contents will be saved in the variable below for processing
    content = ''
    # Stride length is the maximum number of words we want to include in our sequence being generated
    stride = 500

    # Validate file path and return "Empty" if not valid
    if not os.path.isfile(file_path):
        print("{} does not exist ".format(file_path))
        return final_content

    # Read file and remove empty line and new lines
    with open(file_path, 'r') as file:
        for line in file.readlines():
            if line.strip():
                if len(line.strip()) > 2:
                    content += line.replace('\n','')

    # Create list of words and generate number of words
    split_content = content.split()
    seq_len = len(split_content)

    # Check that contents have been extracted and reset Empty flag
    if seq_len > 0:
        final_content = ""

    # Create the sequences
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + stride, seq_len)
        if len(split_content[begin_loc:end_loc]) != 0:
            # Include a new line at the end of the generated sequence
            final_content += "<s>" + ' '.join(split_content[begin_loc:end_loc]) + "</s> \n"

    # Return the sequences
    return final_content

In [7]:
def create_dataset(folder_path, file_ext, folder_destination, dataset_name="dataset.txt"):
    # folder_path is the source path
    # file_ext is the file formats to be matched
    # folder_destination is where to save the dataset
    # dataset_name is the name of the dataset (default will be dataset.txt)

    # Save file paths matched in files variable
    print("Start processing ...")
    files = wcmatch.WcMatch(root_dir=folder_path, file_pattern=file_ext, flags=wcmatch.RECURSIVE).match()
    print(str(len(files)) + " files to be processed!")

    # Loop through and process each file
    i = 0
    while i < len(files):
        try:
            # Get extracted sentences
            contents = extract_sentences(files[i])
            # Ignore "Empty" sentences
            if contents != 'Empty':
                # Open or create dataset in append and byte mode
                f = open(folder_destination + dataset_name, "ab")
                # Save contents in utf-8 encoding
                f.write(contents.encode('utf-8'))
                f.close()
        except Exception as e:
            # Log any issues encountered for further investigation
            print(files[i])
            print ("Error saving extraction to file " + str(e))

        # Increment counter
        i += 1

    print("Finished processing ...")

In [8]:
# Read from the following path
folder_path = "C:\\Users\\r.muema\\Documents\\Study\\KaggleX\\Project\\Extracted data\\ELO 23"
#Only process the following file formats, add more file extensions using comma separation
file_ext = "*.txt"
# Save to the following path
folder_destination = "C:\\Users\\r.muema\\Documents\\Study\\KaggleX\\Project\\Preprocessed data\\"
# Dataset name (can be omitted to use default values)
dataset_name="custom-llama2-dataset.txt"

In [9]:
create_dataset(folder_path, file_ext, folder_destination, dataset_name)

Start processing ...
94 files to be processed!


100%|██████████| 4/4 [00:00<00:00, 2000.14it/s]
100%|██████████| 3/3 [00:00<00:00, 3010.99it/s]
100%|██████████| 48/48 [00:00<00:00, 4797.95it/s]
100%|██████████| 2/2 [00:00<00:00, 1985.47it/s]
100%|██████████| 8/8 [00:00<00:00, 3991.72it/s]
100%|██████████| 4/4 [00:00<00:00, 1998.72it/s]
100%|██████████| 2/2 [00:00<00:00, 999.36it/s]
100%|██████████| 8/8 [00:00<00:00, 2666.65it/s]
100%|██████████| 1/1 [00:00<00:00, 500.04it/s]
100%|██████████| 1/1 [00:00<00:00, 979.29it/s]
100%|██████████| 1/1 [00:00<00:00, 999.36it/s]
100%|██████████| 22/22 [00:00<00:00, 4397.59it/s]
100%|██████████| 14/14 [00:00<00:00, 3498.38it/s]
100%|██████████| 1/1 [00:00<00:00, 997.22it/s]
100%|██████████| 12/12 [00:00<00:00, 3000.04it/s]
100%|██████████| 7/7 [00:00<00:00, 2334.99it/s]
100%|██████████| 21/21 [00:00<00:00, 4615.65it/s]
100%|██████████| 1/1 [00:00<00:00, 1014.83it/s]
100%|██████████| 15/15 [00:00<00:00, 3746.48it/s]
100%|██████████| 15/15 [00:00<00:00, 1670.24it/s]
100%|██████████| 2/2 [00:00<00:

C:\Users\r.muema\Documents\Study\KaggleX\Project\Extracted data\ELO 23\en-us_ops-onprem_installation.txt
Error saving extraction to file 'charmap' codec can't decode byte 0x9d in position 2628: character maps to <undefined>


100%|██████████| 8/8 [00:00<00:00, 4001.72it/s]
100%|██████████| 3/3 [00:00<00:00, 3003.08it/s]
100%|██████████| 18/18 [00:00<00:00, 6007.60it/s]
100%|██████████| 10/10 [00:00<00:00, 2500.78it/s]
100%|██████████| 10/10 [00:00<00:00, 2493.20it/s]
100%|██████████| 5/5 [00:00<00:00, 2501.37it/s]


C:\Users\r.muema\Documents\Study\KaggleX\Project\Extracted data\ELO 23\en-us_ops-sand_installation-operation.txt
Error saving extraction to file 'charmap' codec can't decode byte 0x9d in position 310: character maps to <undefined>


100%|██████████| 7/7 [00:00<00:00, 3482.81it/s]
100%|██████████| 15/15 [00:00<00:00, 7460.52it/s]
100%|██████████| 7/7 [00:00<00:00, 7008.86it/s]
100%|██████████| 10/10 [00:00<00:00, 5003.94it/s]
100%|██████████| 7/7 [00:00<00:00, 4609.85it/s]
100%|██████████| 7/7 [00:00<00:00, 1753.16it/s]
100%|██████████| 8/8 [00:00<00:00, 3999.34it/s]
100%|██████████| 8/8 [00:00<00:00, 4016.09it/s]
100%|██████████| 5/5 [00:00<00:00, 2501.97it/s]
100%|██████████| 13/13 [00:00<00:00, 6518.34it/s]
100%|██████████| 22/22 [00:00<00:00, 7325.13it/s]
100%|██████████| 11/11 [00:00<00:00, 1375.18it/s]
100%|██████████| 4/4 [00:00<00:00, 1000.97it/s]
100%|██████████| 12/12 [00:00<00:00, 3995.21it/s]
100%|██████████| 51/51 [00:00<00:00, 3916.82it/s]
100%|██████████| 21/21 [00:00<00:00, 5247.25it/s]
100%|██████████| 107/107 [00:00<00:00, 6296.78it/s]
100%|██████████| 10/10 [00:00<00:00, 4995.60it/s]
100%|██████████| 52/52 [00:00<00:00, 971.01it/s]
100%|██████████| 7/7 [00:00<00:00, 7008.86it/s]
100%|██████████| 

Finished processing ...





In [10]:
!pip install huggingface_hub

