### Read the Excel file and extract column A into a list, treating the first row as the header

In [None]:
import sys
sys.path.append('../..')
from py3810.myUtils import pickle_dump, pickle_load

# Directory passed as `path_pickle_dump` to the pickle helpers below.
# The backslashes are escaped explicitly: the previous literal relied on
# invalid escape sequences (`\l`, `\d`), which Python warns about. The
# resulting string value is unchanged (Windows-style separators).
path_lumen_docs = '..\\langchain\\docs\\lumen\\docs\\'

In [None]:
def remove_trailing_chars(items, chars):
  """Strip unwanted trailing characters from every string in ``items``.

  Every trailing character found in ``chars`` is removed, whatever order the
  characters appear in (``'word.,'`` becomes ``'word'`` for
  ``chars = ['.', ',']``). Characters elsewhere in a string are kept.

  Args:
    items: List of strings; it is updated in place.
    chars: Iterable of characters to strip from the end of each string.

  Returns:
    The same ``items`` list, after stripping.
  """
  # A single rstrip() over the whole character set: stripping one character
  # at a time in list order would leave e.g. the '.' of 'word.,' behind.
  strip_set = "".join(chars)
  for i, item in enumerate(items):
    items[i] = item.rstrip(strip_set)

  return items

# Example usage: only trailing characters listed in `chars` are stripped.
# 'cherry"' stays as it is because '"' is not in `chars`, and the dots in
# 'a.apple' and '.apple' are not at the end of the string.
chars = [".", ",", ":", "'"]
items = ['apple.', 'banana:', 'cherry"', 'a.apple', '.apple']
print(remove_trailing_chars(items, chars))

In [None]:
def flatten_list(nested_list):
    """Flatten one level of nesting.

    Args:
        nested_list: Iterable whose elements are themselves iterable.

    Returns:
        A new list holding the inner elements in their original order.
    """
    return [element for inner in nested_list for element in inner]

In [None]:
def separate_item_on_chars(items, chars):
  """Split items on separator characters, keeping the separator on the first piece.

  For every item and every character of ``chars`` contained in that item, the
  item is split on the character and the character is re-attached to the end
  of the first piece (``'apple.app'`` gives ``'apple.'`` and ``'app'``).

  Items that contain none of the characters contribute nothing to the result;
  an item that contains several different characters is split once per
  character.

  Args:
    items: Iterable of strings to split.
    chars: Iterable of separator characters.

  Returns:
    A flat list with all the pieces, in processing order.
  """
  pieces_per_split = []
  for item in items:
    for sep in chars:
      if sep not in item:
        continue
      head, *tail = item.split(sep)
      pieces_per_split.append([head + sep, *tail])

  return flatten_list(pieces_per_split)

# Example usage: each item is split on the separator it contains and the
# separator stays attached to the first piece, e.g. 'apple.app' is returned
# as 'apple.' and 'app', and '.apple' as '.' and 'apple'.
chars = [".", ",", ":", '"']
items = ['apple.app', 'banana:ban', 'cherry"che', 'a.apple', '.apple']

print(f'chars: {chars}')
print(f'items: {items}')
print(f'rtn_list: {separate_item_on_chars(items, chars)}')

In [None]:
# C:\Users\ping\AppData\Roaming\nltk_data\corpora\words\en
import nltk
# nltk.download('words')
from nltk.corpus import words

# Lower-cased copy of the NLTK word list, built on first use and then reused
# so repeated calls do not rebuild the set from the corpus.
_english_vocabulary = None


def find_misspelled_words(text):
  """Return the tokens of ``text`` that are not in the NLTK ``words`` corpus.

  ``text`` is split on whitespace and duplicates are dropped. The comparison
  is case-insensitive: both the token and the corpus entries are lower-cased,
  so capitalised corpus entries are recognised as well.

  Args:
    text: String to check.

  Returns:
    Sorted list of the unique tokens not found in the corpus. Tokens keep
    their original spelling, including any attached punctuation.
  """
  global _english_vocabulary
  if _english_vocabulary is None:
    _english_vocabulary = {entry.lower() for entry in words.words()}

  my_words = set(text.split())
  # sorted() gives a stable order; iterating the set directly does not.
  return sorted(word for word in my_words if word.lower() not in _english_vocabulary)

#### Process/Clean documents downloaded using LangChain WebBaseLoader 

In [None]:
from langchain.schema.document import Document, BaseDocumentTransformer
from typing import Any, Sequence
import re

class PreprocessTransformer(BaseDocumentTransformer):
    """Document transformer that normalises whitespace in ``page_content``.

    The documents are modified in place and the same sequence is returned.
    """

    @staticmethod
    def _normalise_whitespace(text: str) -> str:
        """Return ``text`` with newline runs and redundant spaces collapsed."""
        text = re.sub(r'\n+', '\n', text)     # runs of newlines -> one newline
        text = re.sub(r'\xa0', ' ', text)     # non-breaking space -> space
        text = re.sub(r' +(?=\s)', '', text)  # drop spaces followed by whitespace
        text = re.sub(r'\n+', '\n', text)     # newline runs exposed by the step above
        return text.strip()                   # trim leading/trailing whitespace

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Clean the ``page_content`` of every document.

        Args:
            documents: Documents to clean; each one is updated in place.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The ``documents`` sequence that was passed in.
        """
        for doc in documents:
            doc.page_content = self._normalise_whitespace(doc.page_content)

        return documents

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronous entry point; delegates to ``transform_documents``."""
        return self.transform_documents(documents, **kwargs)

In [None]:
# Load the object stored under 'lumen_docs_raw_videos' in path_lumen_docs.
# NOTE(review): pickle_load is a project helper; presumably it unpickles the
# file, so it should only be pointed at trusted files.
docs = pickle_load(filename_pickle='lumen_docs_raw_videos', path_pickle_dump=path_lumen_docs)
# Bare expression: shown as the cell output.
docs

In [None]:
import pandas as pd

# Set the path to the directory containing the Excel file
path_lumen_docs = "../langchain/docs/lumen/docs/"
excel_filename = "lumen_sitemapXML.xlsx"


# Helper that flattens one level of nesting. The name is kept as it was so
# that any other cell calling it keeps working.
def fatten_list(nested_list):
    """Return the items of every sub-list of ``nested_list`` as one flat list."""
    return [item for sublist in nested_list for item in sublist]


# Construct the file path of the Excel file
file = path_lumen_docs + excel_filename

# Read column A only; the first row is used as the header
df = pd.read_excel(file, usecols='A', header=0)

# Convert the data frame to a list of one-element rows. Blank cells are read
# as NaN; they are dropped because they cannot be sorted together with strings.
column_A = df.dropna().values.tolist()

# Create a sorted list of unique values: flatten the rows, then remove
# duplicates with set() before sorting.
lumen_urls = sorted(set(fatten_list(column_A)))

# Print the length of the lumen_urls list
print(f'len(lumen_urls): {len(lumen_urls)}')

# Print the lumen_urls list
print(lumen_urls)

In [None]:
from langchain.document_loaders import WebBaseLoader

# Build a loader for every URL in lumen_urls and load the documents.
# NOTE(review): presumably this downloads each page over the network, so the
# cell can be slow -- the result is pickled in the next cell for reuse.
loader = WebBaseLoader(lumen_urls)
data = loader.load()

In [None]:
# Save the loaded documents under 'lumen_website_docs_raw' in path_lumen_docs.
pickle_dump(file_to_pickle=data, filename_pickle='lumen_website_docs_raw', path_pickle_dump=path_lumen_docs)

In [None]:
# Reload the documents saved under 'lumen_website_docs_raw'.
docs_raw = pickle_load(filename_pickle='lumen_website_docs_raw', path_pickle_dump=path_lumen_docs)

In [None]:
# Bare expression: shows the reloaded raw documents as the cell output.
docs_raw

In [None]:
transformer = PreprocessTransformer()
# Clean the raw documents reloaded from 'lumen_website_docs_raw'. The name
# used here before, `data_raw`, is not assigned anywhere in this notebook;
# `docs_raw` is the variable that holds the reloaded raw documents.
# transform_documents updates the documents in place, so `docs_raw` refers to
# the cleaned documents afterwards as well.
lumen_website_docs_processed = transformer.transform_documents(docs_raw)
pickle_dump(file_to_pickle=lumen_website_docs_processed, filename_pickle='lumen_website_docs_processed', path_pickle_dump=path_lumen_docs)

In [None]:
# Bare expression: shows the processed documents as the cell output.
lumen_website_docs_processed

In [None]:
# Concatenate the page_content of every document in `data` into one string
str_text = ''.join(page.page_content for page in data)

In [None]:
# Bare expression: shows the concatenated text as the cell output.
str_text

In [None]:
import re

print(f'len(str_text) before replacement: {len(str_text)}')

# Whitespace clean-up, applied in this order (the same substitutions as in
# PreprocessTransformer.transform_documents).
_cleanup_steps = (
    (r'\n+', '\n'),     # runs of newlines -> one newline
    (r'\xa0', ' '),     # non-breaking space -> space
    (r' +(?=\s)', ''),  # drop spaces that are followed by more whitespace
    (r'\n+', '\n'),     # newline runs exposed by the previous step
)
for _pattern, _replacement in _cleanup_steps:
    str_text = re.sub(_pattern, _replacement, str_text)
str_text = str_text.strip()  # trim leading and trailing whitespace

print(f'len(str_text) after  replacement: {len(str_text)}')

# Alternatives that are not enabled:
# str_text = re.sub(r'\n+', ' ', str_text)  # newline runs -> a single space
# str_text = re.sub(r'\n', ' ', str_text)   # every newline -> a space
# # Optionally, you can also remove any stopwords or punctuation
# from nltk.corpus import stopwords
# str_text = ' '.join([word for word in str_text.split() if word.lower() not in stopwords.words('english')])
# str_text = str_text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Bare expression: shows the cleaned text as the cell output.
str_text

In [None]:
# First pass: collect the tokens of str_text that find_misspelled_words does
# not find in the word list. Tokens still carry any attached punctuation.
l_misspelled_words = find_misspelled_words(str_text)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

In [None]:
# Strip the listed punctuation characters from the end of each flagged token
# so the tokens can be checked against the word list again.
chars = [".", ",", ":", "'", '"', '”', '?', '(', ')']
l_misspelled_words = remove_trailing_chars(l_misspelled_words, chars)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

In [None]:
# find_misspelled_words takes a string, so join the tokens with spaces.
str_misspelled_words = " ".join(l_misspelled_words)  # turn it into a string

In [None]:
# Second pass: re-check the tokens now that trailing punctuation is removed.
l_misspelled_words = find_misspelled_words(str_misspelled_words)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

In [None]:
# Join the remaining flagged tokens into one string and show it as the
# cell output.
str_misspelled_words = " ".join(l_misspelled_words)  # turn it into a string
str_misspelled_words

In [None]:
# Scratch check: copy the items of one list into another.
# extend() adds each item; append() with a generator expression would store
# the generator object itself as a single element.
_ = ['apple.', 'app']
_emty = []
_emty.extend(_)
_emty

In [None]:
# Scratch check of str.split: 'Appointment?Do' split on '?' gives
# ['Appointment', 'Do'] (the separator itself is dropped).
sep_char = '?'
my_list = ['Appointment?Do']
my_list[0].split(sep_char)
# Disabled follow-up: re-attach the separator to the first piece.
# my_list = my_list[0].split(sep_char)
# my_list[0] = my_list[0] + sep_char
# print(my_list)

In [None]:
# Build the two sets used by the manual spell check below: the unique
# whitespace-separated tokens of `text` and the NLTK word list.
# NOTE(review): `text` is not assigned in any cell above this one; presumably
# it comes from a cell further down that was run first -- confirm.
my_words = set(text.split())
english_words = set(words.words())

In [None]:
# Collect every token whose lower-cased form is missing from english_words.
misspelled_w = [token for token in my_words if token.lower() not in english_words]

print(f'len(misspelled_w): {len(misspelled_w)}')
# Bare expression: shows the list as the cell output.
misspelled_w

In [None]:
# Check one word against the 'en' file of the NLTK words corpus; the word is
# lower-cased before the lookup.
word = 'By'
english_words = set(words.words('en'))
if word.lower() in english_words:
  print(f'{word} IN english_words')
else:
  print(f'{word} NOT in english_words')

In [None]:
# Print the size of misspelled_words and show the list as the cell output.
# NOTE(review): `misspelled_words` is not assigned above this cell (the loop
# above fills `misspelled_w`) -- confirm which list is meant.
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

In [None]:
# Strip trailing '.' from every flagged word. The name called here before,
# `remove_trailing_char`, is not defined in this notebook; the helper that
# exists is `remove_trailing_chars(items, chars)`.
misspelled_words = remove_trailing_chars(misspelled_words, ['.'])

In [None]:
def combine_items(items):
  """Join the strings in ``items`` into one space-separated string."""
  separator = " "
  return separator.join(items)

# Example usage: the two words are joined with a single space.
items = ["hello", "world"]
print(combine_items(items))  # Output: "hello world"

In [None]:
# Join the flagged words into one string and run the spell check on it again,
# then print the new count and show the list as the cell output.
str_misspelled_words = combine_items(misspelled_words)
misspelled_words = find_misspelled_words(str_misspelled_words)
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

In [None]:
# Unique whitespace-separated tokens of `text`, shown as the cell output.
my_text = set(text.split())
my_text

In [None]:
# Replace every newline in `text` with a space. The labels used to talk about
# removing HTML tags, which this cell does not do. Each newline becomes
# exactly one space, so the two lengths printed are equal.
print(f'len(text) before replacing newlines: {len(text)}')
text = re.sub(r'\n', ' ', text)  # each newline -> one space
print(f'len(text) after  replacing newlines: {len(text)}')

In [None]:
# Small demo: a newline that is directly followed by a word character is
# replaced with a space, then the surrounding whitespace is trimmed.
text = "  hello\nworld  "
text = re.sub(r'\n(?=\w)', ' ', text)
text = text.strip()
print(text)  # Output: "hello world"

In [None]:
import re

# Find every word that is directly followed by whitespace and a number.
# The (?=...) part is a positive lookahead: it only checks what comes after
# the word and is not part of the match. The inner group is non-capturing,
# because with a capturing group re.findall() returns the group's text
# (here ' ') instead of the matched word.
pattern = r'\b\w+(?=\s+\d+(?:\W+|$))'

text = 'The quick brown fox jumps over the 12345 lazy dog.'

matches = re.findall(pattern, text)

print(matches)  # Output: ['the']

In [None]:
# Load the data from a website
loader = WebBaseLoader('https://example.com')
pages = loader.load()

# Concatenate all the pages into a single string
text = ''.join(page.page_content for page in pages)

# Clean the text by removing any HTML tags
# ('.' does not match a newline, so only tags written on one line are removed)
import re
text = re.sub(r'<.*?>', '', text)

# Optionally, you can also remove any stopwords or punctuation
import string  # needed for string.punctuation below; it was never imported
from nltk.corpus import stopwords
# Build the stop-word set once instead of calling stopwords.words() per word
english_stopwords = set(stopwords.words('english'))
text = ' '.join(word for word in text.split() if word.lower() not in english_stopwords)
text = text.translate(str.maketrans('', '', string.punctuation))