### Read Excel file, extract the data from column A into a list, and treat first line as a header

In [7]:
import pandas as pd


# Set the path to the directory containing the Excel file
path_lumen_docs = "../langchain/docs/lumen/docs/"
excel_filename = "lumen_sitemapXML.xlsx"

# Define a helper function to flatten a nested list
def fatten_list(nested_list):
    return [item for sublist in nested_list for item in sublist]

# Construct the file path of the Excel file
file = path_lumen_docs + excel_filename

# Extract the data from Excel file column A, first line as a header
df = pd.read_excel(file, usecols='A', header=0)

# Convert the data frame to a list
column_A = df.values.tolist()

# Create a list of unique URLs by flattening the list and removing duplicates
lumen_urls = sorted(fatten_list(column_A))

# Print the length of the lumen_urls list
print(f'len(lumen_urls): {len(lumen_urls)}')

# Print the lumen_urls list
print(lumen_urls)

len(lumen_urls): 173
['https://www.lumenoptometric.com/', 'https://www.lumenoptometric.com/about-us/', 'https://www.lumenoptometric.com/about-us/appointments-forms/', 'https://www.lumenoptometric.com/about-us/gallery/', 'https://www.lumenoptometric.com/about-us/office/', 'https://www.lumenoptometric.com/about-us/testimonials/', 'https://www.lumenoptometric.com/blog/', 'https://www.lumenoptometric.com/blog/amd-awareness-month/3-big-amd-myths-debunked/', 'https://www.lumenoptometric.com/blog/covid-19/contact-lens-wear-during-covid-19/', 'https://www.lumenoptometric.com/blog/covid-19/how-to-stop-masks-from-fogging-glasses/', 'https://www.lumenoptometric.com/blog/eye-care/10-essential-rules-for-contact-lens-wearers/', 'https://www.lumenoptometric.com/blog/eye-care/3-reasons-why-kids-with-myopia-need-to-spend-more-time-outdoors/', 'https://www.lumenoptometric.com/blog/eye-care/3-workplace-eye-wellness-tips-for-employees/', 'https://www.lumenoptometric.com/blog/eye-care/4-effective-methods-f

https://colab.research.google.com/drive/1f_1HeD1mK_wXfjgvY4VGNFKSQBE5Imeh?usp=sharing#scrollTo=jm2nTHTtVTp0

In [14]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(lumen_urls)
data = loader.load()

In [17]:
data[0:3]
_data = data

In [9]:
import sys
sys.path.append('../..')
from py3810.myUtils import pickle_dump, pickle_load

path_lumen_docs = '..\langchain\docs\lumen\docs\\'

In [91]:
data = pickle_load(filename_pickle='lumen_data', path_pickle_dump=path_lumen_docs)
# pickle_dump(file_to_pickle=data, filename_pickle='lumen_data', path_pickle_dump=path_lumen_docs)


In [None]:
data

In [93]:
# Concatenate all the pages into a single string
text = ''
for page in data:
  text += page.page_content

print(f'len(text) before removing HTML tangs: {len(text)}')
# Clean the text by removing any HTML tags
import re

# text = re.sub(r'\n+', '.', text)  # replace \n\n\n..\n with \n
# text = re.sub(r'\xa0', ' ', text)# replace '\xa0' with ' '
# # replace all occurrences of the newline character \n that are followed by
# # a word character (\w) with a space character " ". The strip() method
# # remove any leading or trailing whitespace from the string.
# text = re.sub(r'\n(?=\w)', ' ', text).strip()
# text = re.sub(r'\n', ' ', text)  
# text = re.sub(r' +(?=\s)', ' ', text)  

print(f'len(text) after  removing HTML tangs: {len(text)}')

# # Optionally, you can also remove any stopwords or punctuation
# from nltk.corpus import stopwords
# text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])
# text = text.translate(str.maketrans('', '', string.punctuation))

len(text) before removing HTML tangs: 945316
len(text) after  removing HTML tangs: 945316


In [95]:
import nltk
nltk.download('words')
from nltk.corpus import words

def find_misspelled_words(text):
    my_words = set(text.split())
    english_words = set(words.words())
    return [word for word in my_words if word.lower() not in english_words]

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ping\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [96]:
text



In [97]:
misspelled_words = find_misspelled_words(text)

In [None]:
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

In [100]:
def remove_trailing_char(items, char):
  return [item.rstrip(char) for item in items]

# Example usage
items = ['apple.', 'banana', 'cherry.']
print(remove_trailing_char(items, '.'))
# Output: ['apple', 'banana', 'cherry']

# Example usage
items = ['file1.txt', 'file2', 'file3.txt']
print(remove_trailing_char(items, '.txt'))
# Output: ['file1', 'file2', 'file3']

['apple', 'banana', 'cherry']
['file1', 'file2', 'file3']


In [109]:
misspelled_words = remove_trailing_char(misspelled_words, '.')

In [110]:
def combine_items(items):
  return " ".join(items)

# Example usage
items = ["hello", "world"]
print(combine_items(items))  # Output: "hello world"

hello world


In [112]:
str_misspelled_words = combine_items(misspelled_words)
misspelled_words = find_misspelled_words(str_misspelled_words)
print(f'len(misspelled_words): {len(misspelled_words)}')
misspelled_words

len(misspelled_words): 4706


['Compromised',
 'Minimally,',
 'prevents',
 'testtest',
 'ExamsYour',
 'sunglasses',
 'Tuesday',
 'We’d',
 'U.S',
 'retainers',
 'that’s',
 'considered:',
 'worldwide,',
 'circumstances',
 'COVID-19?Is',
 'growth,',
 'television.”',
 'EYES?',
 'osmolarity',
 'anyone,',
 'researchers',
 'doses',
 'temperatures,',
 'peppers,',
 'Treehouse',
 'team,',
 'prescription’s',
 'ornaments,',
 'often,',
 '9.20.20',
 'Vision5',
 'pleased',
 'weren’t',
 'Exams',
 'diseases3',
 '(Figure',
 '2050!',
 'required',
 'frame-fitting',
 'LinksHome',
 'injuries',
 'done,',
 'LCD',
 'places',
 'controllers',
 'respected',
 'permeability,',
 'dollars!',
 '40s,',
 'Emily',
 'rule,',
 'mirror,',
 'OD,',
 'GP',
 'Newborn',
 'not,',
 '135',
 'Moreover,',
 'try:',
 "Child's",
 'fantastic!!!',
 '2022Your',
 'Mountains',
 'exhibits',
 'scheduler]',
 'more?',
 'walls',
 'shadows!',
 '26,',
 '“yes.”',
 'jobs',
 'arts',
 'after-school',
 'experience:',
 '60,',
 'overlapping',
 'info@lumenoptometric.com,',
 'grains,',


['Compromised',
 'Minimally,',
 'California',
 'prevents',
 'testtest',
 'ExamsYour',
 'sunglasses',
 'treatments',
 'safety',
 'Tuesday',
 'We’d',
 'retainers',
 'that’s',
 'waking',
 'UV',
 'considered:',
 'worldwide,',
 'COVID-19?Is',
 'growth,',
 'television.”',
 'EYES?',
 'osmolarity',
 'anyone,',
 'researchers',
 'doses',
 'temperatures,',
 'peppers,',
 'Treehouse',
 'team,',
 'ensemble',
 'page',
 'prescription’s',
 'ornaments,',
 'often,',
 '9.20.20',
 'grows',
 'age',
 'Vision5',
 'pleased',
 'adapts',
 'weren’t',
 'Exams',
 '(Figure',
 '2050!',
 'required',
 '“acquired”)',
 'frame-fitting',
 'LinksHome',
 'injuries',
 'children',
 'done,',
 'LCD',
 'places',
 'controllers',
 'respected',
 'permeability,',
 'dollars!',
 'carbohydrates',
 'Myopia',
 '40s,',
 'Emily',
 'rule,',
 'performed',
 'OD,',
 'mirror,',
 'GP',
 'Newborn',
 'trust',
 'not,',
 'etc',
 '135',
 'Moreover,',
 'try:',
 "Child's",
 'fantastic!!!',
 '2022Your',
 'Mountains',
 'exhibits',
 'scheduler]',
 'more?',

In [70]:
my_text = set(text.split())
my_text

{'This',
 'a',
 'an',
 'example',
 'here:',
 'is',
 'misspelling',
 'sentence',
 'with',
 'womans'}

In [50]:
# Concatenate all the pages into a single string
text = ''
for page in data:
  text += page.page_content

print(f'len(text) before removing HTML tangs: {len(text)}')
# Clean the text by removing any HTML tags
import re

text = re.sub(r'\n+', '\n', text)  # replace \n\n\n..\n with \n
text = re.sub(r'\xa0', ' ', text)# replace '\xa0' with ' '
# replace all occurrences of the newline character \n that are followed by
# a word character (\w) with a space character " ". The strip() method
# remove any leading or trailing whitespace from the string.
text = re.sub(r'\n(?=\w)', ' ', text).strip()
text = re.sub(r'\n', ' ', text)  
text = re.sub(r' +(?=\s)', ' ', text)  

print(f'len(text) after  removing HTML tangs: {len(text)}')

# # Optionally, you can also remove any stopwords or punctuation
# from nltk.corpus import stopwords
# text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])
# text = text.translate(str.maketrans('', '', string.punctuation))

len(text) before removing HTML tangs: 945316
len(text) after  removing HTML tangs: 919068


In [34]:
print(f'len(text) before removing HTML tangs: {len(text)}')
text = re.sub(r'\n', ' ', text)  # replace \n\n\n..\n with \n
print(f'len(text) after  removing HTML tangs: {len(text)}')

len(text) before removing HTML tangs: 919850
len(text) after  removing HTML tangs: 919850


In [38]:
text = "  hello\nworld  "
text = re.sub(r'\n(?=\w)', ' ', text).strip()
print(text)  # Output: "hello world"

hello world


In [35]:
import re

# The (?=\w) pattern is a positive lookahead assertion that matches a position in the string
# where the next character is a word character.
pattern = r'\b\w+(?=\s+\d+(\W+|$))'

text = 'The quick brown fox jumps over the 12345 lazy dog.'

matches = re.findall(pattern, text)

print(matches)  # Output: ['jumps', 'over', 'the']

[' ']


In [None]:
# Load the data from a website
loader = WebBaseLoader('https://example.com')
pages = loader.load()

# Concatenate all the pages into a single string
text = ''
for page in pages:
    text += page.page_content

# Clean the text by removing any HTML tags
import re
text = re.sub(r'<.*?>', '', text)

# Optionally, you can also remove any stopwords or punctuation
from nltk.corpus import stopwords
text = ' '.join([word for word in text.split() if word.lower() not in stopwords.words('english')])
text = text.translate(str.maketrans('', '', string.punctuation))