### Read Excel file, extract the data from column A into a list, and treat first line as a header

In [None]:
import pandas as pd


# Set the path to the directory containing the Excel file
path_lumen_docs = "../langchain/docs/lumen/docs/"
excel_filename = "lumen_sitemapXML.xlsx"

# Define a helper function to flatten a nested list
def fatten_list(nested_list):
    return [item for sublist in nested_list for item in sublist]

# Construct the file path of the Excel file
file = path_lumen_docs + excel_filename

# Extract the data from Excel file column A, first line as a header
df = pd.read_excel(file, usecols='A', header=0)

# Convert the data frame to a list
column_A = df.values.tolist()

# Create a list of unique URLs by flattening the list and removing duplicates
lumen_urls = sorted(fatten_list(column_A))

# Print the length of the lumen_urls list
print(f'len(lumen_urls): {len(lumen_urls)}')

# Print the lumen_urls list
print(lumen_urls)

https://colab.research.google.com/drive/1f_1HeD1mK_wXfjgvY4VGNFKSQBE5Imeh?usp=sharing#scrollTo=jm2nTHTtVTp0

In [None]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(lumen_urls)
data = loader.load()

In [None]:
data[0:3]
_data = data

In [1]:
import sys
sys.path.append('../..')
from py3810.myUtils import pickle_dump, pickle_load

path_lumen_docs = '..\langchain\docs\lumen\docs\\'

In [2]:
data = pickle_load(filename_pickle='lumen_data', path_pickle_dump=path_lumen_docs)
# pickle_dump(file_to_pickle=data, filename_pickle='lumen_data', path_pickle_dump=path_lumen_docs)


In [4]:
# Concatenate all the pages into a single string
str_text = ''
for page in data:
  str_text += page.page_content

In [5]:
# C:\Users\ping\AppData\Roaming\nltk_data\corpora\words\en
import nltk
nltk.download('words')
from nltk.corpus import words

def find_misspelled_words(text):
  my_words = set(text.split())
  english_words = set(words.words())
  return [word for word in my_words if word.lower() not in english_words]

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ping\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
l_misspelled_words = find_misspelled_words(str_text)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

l_misspelled_words, len=5755:
['source.', 'control?”', 'Dry,', 'Stybel', 'ceiling.', 'thicker', 'difference.', 'Brides', 'adolescents', 'swings,', 'intervals', 'PATIENTS?', 'dysphoriaPrimary', 'Individuals:', 'At-Risk', 'Previously,', 'Thursday', 'anyone.', '78%.', 'centers', 'healthier', 'clearest', 'Retinoscopy/autorefractor', 'participated', 'Makeup:', 'alignment,', 'issues,', 'touchpoints', 'significant,', 'Health,', 'Britannica,', 'became,', 'drops.', 'side,', 'ADULTS:', 'eyelidDecreased', 'Blinking.', 'scheduled', 'bigger,', 'holds', 'beat.', 'headlights.', 'NeurolensesAs', 'Conversely,', 'replacing', 'dark,', 'myopic),', 'Ortho-K?”', 'UsThis', 'normal”', 'interests.', 'focusing', 'solutions.', 'millimeter.', 'Essentially,', 'Curable?', 'evaluations.', 'options,', 'objectives', 'variations', '“Nature', 'Hands', 'accomplishes', 'stagnant,', 'technique.', 'disease,', 'menu.', 'vision-threatening', 'school,', 'dosed', 'surfaces.', 'Dr.', 'Notably,', 'friends', '(grows', '21', '*For'

In [7]:
def remove_trailing_chars(items, chars):
  for i, item in enumerate(items):
    for char in chars:
      item = item.rstrip(char)
    items[i] = item

  return items

# Example usage
chars = [".", ",", ":", "'"]
items = ['apple.', 'banana:', 'cherry"', 'a.apple', '.apple']
print(remove_trailing_chars(items, chars))

['apple', 'banana', 'cherry"', 'a.apple', '.apple']


In [11]:
chars = [".", ",", ":", "'", '"', '”', '?', ')']
l_misspelled_words = remove_trailing_chars(l_misspelled_words, chars)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

l_misspelled_words, len=3233:
['control', 'Grains', 'low-light', '20', 'carotenoids', 'Stybel', 'Appointment?Do', 'Beijing', 'modifying', 'ChoiceApril', 'thicker', 'membranes', 'Disturbances', 'frame-fitting', 'Cosmetics', 'Brides', 'GlaucomaEssential', 'occurring', 'seconds', 'adolescents', 'typing', 'intervals', 'fruits', 'dysphoriaPrimary', '91024Call', 'At-Risk', 'MadeUnderstanding', 'recipes', 'Wraparound', 'Google', 'Thanh', 'goodies', 'Thursday', 'develops', '40', 'life-changing', 'TV', 'Aimee', 'centers', 'Preservatives', 'Forms', 'healthier', '10-12', 'clearest', 'Retinoscopy/autorefractor', 'won’t', '“the', 'Concerns', 'dissolves', 'participated', 'jobs', 'K.G', 'rubs', '10-20%', 'touchpoints', 'indicates', 'encouraged', 'Embedded', '50', '5-year-olds', 'it.', 'safest', 'problemsPrimary', 'elongates', 'Long-Term', 'hygienePrimary', 'keratitisRedness', 'treats', 'paramedics', 'eyelidDecreased', 'environments', 'Limits', 'scheduled', 'letting', '(Ortho-K', 'adjusting', 'Types',

In [12]:
str_misspelled_words = " ".join(l_misspelled_words)

In [13]:
l_misspelled_words = find_misspelled_words(str_misspelled_words)
print(f'l_misspelled_words, len={len(l_misspelled_words)}:\n{l_misspelled_words}')

l_misspelled_words, len=3199:
['Grains', 'low-light', '20', 'carotenoids', 'Stybel', 'Appointment?Do', 'Beijing', 'modifying', 'ChoiceApril', 'thicker', 'membranes', 'Disturbances', 'frame-fitting', 'Cosmetics', 'Brides', 'GlaucomaEssential', 'occurring', 'seconds', 'adolescents', 'typing', 'intervals', 'fruits', 'dysphoriaPrimary', '91024Call', 'At-Risk', 'MadeUnderstanding', 'recipes', 'Wraparound', 'Google', 'Thanh', 'goodies', 'Thursday', 'develops', '40', 'life-changing', 'TV', 'Aimee', 'centers', 'Preservatives', 'Forms', 'television.', 'healthier', '10-12', 'clearest', 'Retinoscopy/autorefractor', 'won’t', '“the', 'Concerns', 'dissolves', 'participated', 'jobs', 'K.G', 'rubs', '10-20%', 'touchpoints', 'indicates', 'encouraged', 'Embedded', '50', '5-year-olds', 'safest', 'problemsPrimary', 'elongates', 'Long-Term', 'wraparound', 'environments', 'hygienePrimary', 'keratitisRedness', 'treats', 'paramedics', 'adjusting', 'Limits', 'eyelidDecreased', 'letting', '(Ortho-K', 'scheduled

In [30]:
def separate_item_on_chars(items, chars):
  rtn_list = []
  for i, item in enumerate(items):
    print(f'item: {item}')
    for char in chars:
      if char in item:
        my_list = item.split(char)
        print(f'my_list before split: {my_list}')        
        my_list[0] = my_list[0] + char
        print(f'char: {char}')
        print(f'my_list[0]: {my_list[0]}')
        print(f'my_list after split: {my_list}')      
        rtn_list.append(my_list)
        print(' ')

  return rtn_list

# Example usage
chars = [".", ",", ":", "'"]
items = ['apple.app', 'banana:ban', 'cherry"che', 'a.apple', '.apple']
print(separate_item_on_chars(items, chars))



item: apple.app
my_list before split: ['apple', 'app']
char: .
my_list[0]: apple.
my_list after split: ['apple.', 'app']
 
item: banana:ban
my_list before split: ['banana', 'ban']
char: :
my_list[0]: banana:
my_list after split: ['banana:', 'ban']
 
item: cherry"che
item: a.apple
my_list before split: ['a', 'apple']
char: .
my_list[0]: a.
my_list after split: ['a.', 'apple']
 
item: .apple
my_list before split: ['', 'apple']
char: .
my_list[0]: .
my_list after split: ['.', 'apple']
 
[['apple.', 'app'], ['banana:', 'ban'], ['a.', 'apple'], ['.', 'apple']]


In [None]:
separate

In [25]:
sep_char = '?'
my_list = ['Appointment?Do']
my_list[0].split(sep_char)
# my_list = my_list[0].split(sep_char)
# my_list[0] = my_list[0] + sep_char
# print(my_list)

['Appointment', 'Do']

In [None]:
my_words = set(text.split())
english_words = set(words.words())

In [None]:
misspelled_w = []
for word in my_words:
  if word.lower() not in english_words:
    misspelled_w.append(word)

print(f'len(misspelled_w): {len(misspelled_w)}')                        
# print(f'len(misspelled_w): {len(misspelled_w)}')
misspelled_w

In [None]:
word = 'By'
english_words = set(words.words('en'))
if word.lower() not in english_words:
  print(f'{word} NOT in english_words')
else:
  print(f'{word} IN english_words')    

In [None]:
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

In [None]:
misspelled_words = remove_trailing_char(misspelled_words, '.')

In [None]:
def combine_items(items):
  return " ".join(items)

# Example usage
items = ["hello", "world"]
print(combine_items(items))  # Output: "hello world"

In [None]:
str_misspelled_words = combine_items(misspelled_words)
misspelled_words = find_misspelled_words(str_misspelled_words)
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

In [None]:
my_text = set(text.split())
my_text

In [None]:
# Concatenate all the pages into a single string
text = ''
for page in data:
  text += page.page_content

print(f'len(text) before removing HTML tangs: {len(text)}')
# Clean the text by removing any HTML tags
import re

text = re.sub(r'\n+', '\n', text)  # replace \n\n\n..\n with \n
text = re.sub(r'\xa0', ' ', text)# replace '\xa0' with ' '
# replace all occurrences of the newline character \n that are followed by
# a word character (\w) with a space character " ". The strip() method
# remove any leading or trailing whitespace from the string.
text = re.sub(r'\n(?=\w)', ' ', text).strip()
text = re.sub(r'\n', ' ', text)  
text = re.sub(r' +(?=\s)', ' ', text)  

print(f'len(text) after  removing HTML tangs: {len(text)}')

# # Optionally, you can also remove any stopwords or punctuation
# from nltk.corpus import stopwords
# text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])
# text = text.translate(str.maketrans('', '', string.punctuation))

In [None]:
print(f'len(text) before removing HTML tangs: {len(text)}')
text = re.sub(r'\n', ' ', text)  # replace \n\n\n..\n with \n
print(f'len(text) after  removing HTML tangs: {len(text)}')

In [None]:
text = "  hello\nworld  "
text = re.sub(r'\n(?=\w)', ' ', text).strip()
print(text)  # Output: "hello world"

In [None]:
import re

# The (?=\w) pattern is a positive lookahead assertion that matches a position in the string
# where the next character is a word character.
pattern = r'\b\w+(?=\s+\d+(\W+|$))'

text = 'The quick brown fox jumps over the 12345 lazy dog.'

matches = re.findall(pattern, text)

print(matches)  # Output: ['jumps', 'over', 'the']

In [None]:
# Load the data from a website
loader = WebBaseLoader('https://example.com')
pages = loader.load()

# Concatenate all the pages into a single string
text = ''
for page in pages:
    text += page.page_content

# Clean the text by removing any HTML tags
import re
text = re.sub(r'<.*?>', '', text)

# Optionally, you can also remove any stopwords or punctuation
from nltk.corpus import stopwords
text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])
text = text.translate(str.maketrans('', '', string.punctuation))