In [1]:
import re
import os
from bs4 import BeautifulSoup
from underthesea import text_normalize, word_tokenize
from deep_translator import GoogleTranslator
from transformers import pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import html
import json
from dotenv import load_dotenv


In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
project_directory = os.getenv("PROJECT_DIR")
if not project_directory:
    # Fail here with a clear message; otherwise the path-building cell below
    # dies with an opaque TypeError from os.path.join(None, ...).
    raise RuntimeError("PROJECT_DIR is not set; define it in the environment or in a .env file")
# NOTE(review): MAX_LENGTH is not referenced by any active code visible in this notebook - confirm it is still needed.
MAX_LENGTH = 512
# NOTE(review): this is set after the imports cell; if TensorFlow has already
# been imported by then the setting may come too late - confirm.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [None]:
# Build every path component-by-component: no backslash escapes ('\c', '\p')
# end up inside string literals and the separator follows the host OS.
cleaned_data_directory = os.path.join(project_directory, 'Data', 'cleaned_data')
page_directory = os.path.join(project_directory, 'Data', 'pages')
url_file = os.path.join(project_directory, 'Data', 'urls.json')
# Read the JSON explicitly as UTF-8 instead of relying on the platform default encoding.
# Downstream code (find_title) treats `urls` as an iterable of dicts with 'url' and 'title' keys.
with open(url_file, 'r', encoding='utf-8') as file:
    urls = json.load(file)

# urls

In [None]:
def join_tokens(tokens):
    """Glue a sequence of string tokens back into one sentence.

    Punctuation tokens are attached directly to the text built so far; any
    other token is preceded by a single space unless the text is still empty
    or already ends with a space.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The reassembled sentence as a single string.
    """
    punctuation = (',', '.', '?', ':', '!', ';', '..', '...')
    result = ""
    for tok in tokens:
        needs_space = (
            tok not in punctuation
            and result != ""
            and not result.endswith(' ')
        )
        if needs_space:
            result += ' '
        result += tok

    return result

In [None]:
def find_title(href, urls):
    """Look up the title recorded for a link target.

    The link is compared both as given and with the explicit ``:443`` port
    removed from the ``community.canvaslms.com`` host. An exact match against
    an entry's URL wins immediately; otherwise the title of the last entry
    whose URL contains the link as a substring is returned.

    Args:
        href: Link target to look up.
        urls: Iterable of dicts, each with ``'url'`` and ``'title'`` keys.

    Returns:
        The matching title, or ``''`` when nothing matches or ``href`` is empty.
    """
    if not href:
        # An empty string is a substring of every URL, so without this guard
        # it would "match" every entry and return the last entry's title.
        return ''

    # Normalise once instead of twice per loop iteration.
    candidates = (
        href,
        href.replace('community.canvaslms.com:443', 'community.canvaslms.com'),
    )
    title = ''
    for entry in urls:
        known_url = entry['url']
        if known_url in candidates:
            return entry['title']
        if any(candidate in known_url for candidate in candidates):
            # Partial match: remember it, but keep looking for an exact one.
            title = entry['title']
    return title

In [None]:
def remove_html_elements(text, urls):
    """Reduce an HTML document to its visible text.

    Elements whose class attribute equals the introduction-block class string
    below, and all ``img`` tags, are removed before the text is extracted.
    HTML entities are then unescaped and non-breaking spaces (``\\xa0``) are
    deleted.

    Args:
        text: HTML markup to clean.
        urls: Not used by this function; kept so the signature stays the same
            for existing callers.

    Returns:
        The extracted plain text.
    """
    intro_class = 'step-instructions screensteps-textblock screensteps-wrapper--introduction screensteps-wrapper'
    soup = BeautifulSoup(text, "html.parser")

    # Remove the introduction blocks first, then whatever images remain.
    for intro_block in soup.find_all(class_=intro_class):
        intro_block.decompose()
    for img_tag in soup.find_all('img'):
        img_tag.decompose()

    plain = html.unescape(soup.get_text())
    return plain.replace('\xa0', '')

In [None]:
def clean_data(name, indir, outdir, urls):
    """Convert one saved HTML page into a plain-text file.

    Reads ``indir/name`` as UTF-8, strips the markup with
    ``remove_html_elements``, drops blank lines, removes numeric bracket
    markers such as ``[12]`` and writes the result to ``outdir``. The first
    line of the output is the input file's stem (name without extension);
    the output file is named after that stem, minus a few characters that
    are stripped from it, plus ``.txt``.

    Args:
        name: File name (not a full path) of the page inside ``indir``.
        indir: Directory holding the raw pages.
        outdir: Directory receiving the cleaned text; this function does not
            create it.
        urls: Passed through unchanged to ``remove_html_elements``.
    """
    source_path = os.path.join(indir, name)
    with open(source_path, 'r', encoding='utf-8') as file:
        text = file.read()

    text = remove_html_elements(text, urls)
    # Keep only lines that contain something other than whitespace.
    text = '\n'.join(line for line in text.split('\n') if line.strip())

    # Strip "[digits]" markers together with any whitespace in front of them.
    # A single pass suffices: the leading "\s*" is optional, so markers with
    # no preceding whitespace are removed as well.
    text = re.sub(r'\s*\[\d+\]', '', text)

    stem, _ext = os.path.splitext(name)
    text = stem + '\n' + text

    # Drop these characters from the stem before using it as the output file name.
    out_name = stem.translate(str.maketrans('', '', '\n\t?!:*'))
    write_file = os.path.join(outdir, out_name + '.txt')
    with open(write_file, 'w', encoding='utf-8') as text_file:
        text_file.write(text)

In [None]:
def clean_all_files(indir, outdir, urls):
    """Run ``clean_data`` on every regular file found directly in ``indir``.

    Args:
        indir: Directory holding the raw pages.
        outdir: Directory receiving the cleaned text files; created if missing.
        urls: Passed through unchanged to ``clean_data``.
    """
    # clean_data opens its output for writing and does not create the directory.
    os.makedirs(outdir, exist_ok=True)
    for name in os.listdir(indir):
        # os.listdir also yields sub-directories (e.g. .ipynb_checkpoints);
        # clean_data would crash trying to open one as a file.
        if not os.path.isfile(os.path.join(indir, name)):
            continue
        clean_data(name, indir, outdir, urls)
    print('Done cleaning all files')

In [None]:
# Run the pipeline: clean every entry of page_directory and write the results into cleaned_data_directory.
clean_all_files(page_directory, cleaned_data_directory, urls)

Done cleaning all files
