In [82]:
# imports
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.llms import Ollama
from langchain_community.document_transformers import Html2TextTransformer


from bs4 import BeautifulSoup
import re
import json
from urllib.parse import urlparse
import csv

In [107]:
html2text = Html2TextTransformer()

In [130]:
# From URLs, load HTML content async
urls = ["https://neurograd.ucsf.edu/faculty"]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.82it/s]


In [131]:
# Split the string on the literal "\n" and then join it with actual newlines
soup = BeautifulSoup(docs[0].page_content, 'html.parser')
body_content = soup.body

for header in body_content.find_all('header'):
        header.decompose()

# Remove <footer> tags
for footer in body_content.find_all('footer'):
    footer.decompose()

body_string = str(body_content)
print(body_string)

<body class="html not-front not-logged-in page-node page-node- page-node-86481 node-type-page slider-secondary ucsf-b1gfoot font--header--helveticaneue slider-secondary-no-image profile--grid sidebar sidebar--right sidebar-color--white">
<div id="skip-link">
<a class="element-invisible element-focusable" href="#main-content">Skip to main content</a>
</div>
<div id="page-wrapper">
<div id="page">
<div class="ucsf-top-header row-full-width logo">
<div class="inside">
<ul class="menu">
<li class="first"><a href="https://www.ucsf.edu">University of California San Francisco</a></li>
<li><a href="https://www.ucsfhealth.org/">UCSF Health</a></li>
<li><a href="https://www.ucsf.edu/search" title="">Search UCSF</a></li>
<li><a href="https://www.ucsf.edu/about">About UCSF</a></li>
</ul>
</div>
</div>
<div class="row-full-width" id="header">
<div class="inside clearfix">
<a href="/" id="logo" rel="home" title="Home">
<img alt="Home" src="https://neurograd.ucsf.edu/sites/g/files/tkssra4316/f/Neuros

In [132]:
# Using LLAMA 2 Language model
llm = Ollama(model="llama3")

In [133]:
# Phase 1
# send input in chunks
chunk_size = 16000
step_size = 15000

instruction_to_llm = "From the above HTML content, extract name of professor and URL. Write professor name and URL side by side in format of PROFESSOR NAME: URL. If either professor or URL is missing, print ---. Also don't write anything else. Only name and URL."
output_phase1 = ''
for i in range(0, len(body_string), step_size):
    chunk = body_string[i:i + chunk_size]
    # Process the chunk here (e.g., print or handle the chunk as needed)
    input_to_llm = "===== \n" + chunk + "\n ===== \n" + instruction_to_llm  # Example action: printing the chunk
    for chunks in llm.stream(input_to_llm):
        output_phase1 += chunks
        print(chunks, end = '')

    
    print('#^^^^^^^^^^^^^^^^^^^^^^^^#')
    

Here are the extracted professor names and URLs:

1. Karunesh Ganguly: https://neurograd.ucsf.edu/people/karunesh-ganguly-md-phd
2. Adam Gazzaley: https://neurograd.ucsf.edu/people/adam-gazzaley-md-phd
3. Walter Gonzalez: https://neurograd.ucsf.edu/people/walter-gonzalez-phd
4. Su Guo: ---
5. Corey Harwell: https://neurograd.ucsf.edu/people/corey-harwell-phd
6. Andrea Hasenstaub: ---
7. Stephen Hauser: ---
8. Jonathan Horton: https://neurograd.ucsf.edu/people/jonathan-horton-md-phd
9. Eric J. Huang: https://neurograd.ucsf.edu/people/eric-j-huang-md-phd
10. Yadong Huang: https://neurograd.ucsf.edu/people/yadong-huang-md-phd
11. Holly Ingraham: ---
12. Lily Jan: https://neurograd.ucsf.edu/people/lily-jan-phd
13. Yuh Nung Jan: https://neurograd.ucsf.edu/people/yuh-nung-jan-phd
14. David Julius: https://neurograd.ucsf.edu/people/david-julius-phd
15. Martin Kampmann: https://neurograd.ucsf.edu/people/martin-kampmann-phd
16. Aimee Kao: https://neurograd.ucsf.edu/people/aimee-kao-md-phd
17. S

In [134]:
phase2_input = output_phase1
instruction_to_llm = "From the above text, make a JSON with keyname 'data' and value is an array of objects. Each object has professor name and their href. Ignore the ones where the name of professor is '---'. Example: {'data': [ {'name': 'John', 'url': '/research/faculty/45'}  ]  }. Don't write code. Don't write anything else. Give me JSON only"
output_phase2 = ''
input_to_llm = "=== \n" + phase2_input + "\n === \n" + instruction_to_llm
for chunks in llm.stream(input_to_llm):
        output_phase2 += chunks
        print(chunks, end = '')

{
"data": [
{"name": "Karunesh Ganguly", "url": "https://neurograd.ucsf.edu/people/karunesh-ganguly-md-phd"},
{"name": "Adam Gazzaley", "url": "https://neurograd.ucsf.edu/people/adam-gazzaley-md-phd"},
{"name": "Walter Gonzalez", "url": "https://neurograd.ucsf.edu/people/walter-gonzalez-phd"},
{"name": "Corey Harwell", "url": "https://neurograd.ucsf.edu/people/corey-harwell-phd"},
{"name": "Jonathan Horton", "url": "https://neurograd.ucsf.edu/people/jonathan-horton-md-phd"},
{"name": "Eric J. Huang", "url": "https://neurograd.ucsf.edu/people/eric-j-huang-md-phd"},
{"name": "Yadong Huang", "url": "https://neurograd.ucsf.edu/people/yadong-huang-md-phd"},
{"name": "Lily Jan", "url": "https://neurograd.ucsf.edu/people/lily-jan-phd"},
{"name": "Yuh Nung Jan", "url": "https://neurograd.ucsf.edu/people/yuh-nung-jan-phd"},
{"name": "David Julius", "url": "https://neurograd.ucsf.edu/people/david-julius-phd"},
{"name": "Martin Kampmann", "url": "https://neurograd.ucsf.edu/people/martin-kampmann-

In [135]:
# Regex to find the content between the first { and the last }
match = re.search(r'\{.*\}', output_phase2, re.DOTALL) 
if match:
    # Extract the matched substring
    json_str = match.group(0)
    
    # Convert the string to a dictionary
    try:
        data_dict = json.loads(json_str.replace("'", '"'))
        print(data_dict)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
else:
    print("No match found")


{'data': [{'name': 'Karunesh Ganguly', 'url': 'https://neurograd.ucsf.edu/people/karunesh-ganguly-md-phd'}, {'name': 'Adam Gazzaley', 'url': 'https://neurograd.ucsf.edu/people/adam-gazzaley-md-phd'}, {'name': 'Walter Gonzalez', 'url': 'https://neurograd.ucsf.edu/people/walter-gonzalez-phd'}, {'name': 'Corey Harwell', 'url': 'https://neurograd.ucsf.edu/people/corey-harwell-phd'}, {'name': 'Jonathan Horton', 'url': 'https://neurograd.ucsf.edu/people/jonathan-horton-md-phd'}, {'name': 'Eric J. Huang', 'url': 'https://neurograd.ucsf.edu/people/eric-j-huang-md-phd'}, {'name': 'Yadong Huang', 'url': 'https://neurograd.ucsf.edu/people/yadong-huang-md-phd'}, {'name': 'Lily Jan', 'url': 'https://neurograd.ucsf.edu/people/lily-jan-phd'}, {'name': 'Yuh Nung Jan', 'url': 'https://neurograd.ucsf.edu/people/yuh-nung-jan-phd'}, {'name': 'David Julius', 'url': 'https://neurograd.ucsf.edu/people/david-julius-phd'}, {'name': 'Martin Kampmann', 'url': 'https://neurograd.ucsf.edu/people/martin-kampmann-ph

In [136]:
original_url = urls[0]
parsed_url = urlparse(original_url)
hostname = f"{parsed_url.scheme}://{parsed_url.netloc}"

next_set_of_urls = []
for idx, name_url_dict in enumerate(data_dict['data']):
    prof_name = name_url_dict['name']
    prof_url = name_url_dict['url']
    
    if not prof_url.startswith('http'):
        prof_web_page = hostname + prof_url
    else:
        prof_web_page = prof_url
    
    # change to complete URL
    data_dict['data'][idx]['url'] = prof_web_page
    next_set_of_urls.append(prof_web_page)

In [137]:
# From URLs, load HTML content async
next_loader = AsyncHtmlLoader(next_set_of_urls)
prof_website_docs = next_loader.load()


Fetching pages: 100%|##########| 24/24 [00:21<00:00,  1.13it/s]


In [138]:
for idx, prof_doc in enumerate(prof_website_docs):
    soup = BeautifulSoup(prof_doc.page_content, 'html.parser')
    body_content = soup.body

    # Remove <header> tags
    for header in body_content.find_all('header'):
        header.decompose()

    # Remove <footer> tags
    for footer in body_content.find_all('footer'):
        footer.decompose()

    data_dict['data'][idx]['html_body'] = body_content
    


In [147]:
## ROUGH ###
extract_urls(trimmed_str)

['https://www.ucsf.edu',
 'https://www.ucsfhealth.org',
 'https://www.ucsf.edu/search',
 'https://www.ucsf.edu/about',
 'https://neurograd.ucsf.edu/sites/g/files/tkssra4316/f/Neuroscience%20Grad%20Prog%20Web_9x.jpg',
 'https://twitter.com/UCSFNSGrad',
 'https://neurograd.ucsf.edu/outreach',
 'http://neurograd.ucsf.edu/admissions',
 'https://neurograd.ucsf.edu/graduate-student-fair',
 'https://ucsf.box.com/s/0abqv41cnuhj1uiyc0rxouuses7kzvz0',
 'https://ucsf.box.com/s/6xmev6ik2tkxo3sqmbmv39vuwg0vt877',
 'https://ucsf.box.com/s/n3gln76kvc9i5zc9a10px4b22na65zdg',
 'https://ucsf.box.com/s/rafrpdub9v27uwm4ksxj65f62riqun2w',
 'https://ucsf.box.com/s/i1to9s5k34sdznm175caz2wwrj3lh55p',
 'http://coursecatalog.ucsf.edu',
 'http://registrar.ucsf.edu/registration/study-list-filing',
 'http://registrar.ucsf.edu/academic-calendar',
 'http://neurograd.ucsf.edu/annual-neuroscience-retreat',
 'https://neurograd.ucsf.edu/neuroscience-gladstone-and-aarg-seminar-series-2023-24',
 'https://kavliifn.ucsf.edu

In [146]:
# phase 3
phase3_query = """
    From the above HTML content, extract the following:
    1) Lab webpage or professor's personal website
    2) Summary of professor's research under 100 words
    3) Email address of professor

    The extract content should be output in the following format only as in the below example.
    
    LABPAGE: { https://example_webpage_of_prof.com } 
     
    SUMMARY: { The example prof works on example topic in less than 100 words }
    EMAIL: { example_prof@gmail.com }

    ALWAYS PUT EACH INFORMATION IN CURLY BRACES { } as shown in the example. If any piece of information is missing, put "---" instead.
"""

for idx, prof_info in enumerate(data_dict['data']):
    prof_html_body = prof_info['html_body']
    str_html_body = str(prof_html_body)
    trimmed_str = str_html_body[0:min(len(str_html_body), 20000)]
    input_to_llm = "=== \n" + trimmed_str + "\n === \n" + phase3_query
    
    # print(input_to_llm)
    # break
    output_single_prof = ''
    for chunks in llm.stream(input_to_llm):
        output_single_prof += chunks
        print(chunks, end = '')

    
    pattern = r"(?<=PERSONAL_URL:\s)(\{[^\}]*\}|[^\n]*)|(?<=SUMMARY:\s)(\{[^\}]*\}|[^\n]*)|(?<=EMAIL:\s)(\{[^\}]*\}|[^\n]*)"

    
    extracted_content_corrected = re.findall(pattern, output_single_prof)

    
    processed_content = [max(item, key=len).strip('{} ').strip() for item in extracted_content_corrected]

    # Assigning extracted contents to variables
    personal_url, summary, email = processed_content



    data_dict['data'][idx]['personal_url'] = personal_url
    data_dict['data'][idx]['work_summary'] = summary
    data_dict['data'][idx]['email'] = email
    data_dict['data'][idx]['webpage_info_by_LLM'] = output_single_prof 


    print('\n  ====== \n')
    break


    

Based on the provided HTML content, I extracted the following:

LABPAGE: --- (no webpage specified)

SUMMARY: The professor's research focuses on developing neuroprosthetic systems to improve motor control and speech decoding in individuals with neurological disorders. Their work involves advancing neural interfaces, brain-computer interfaces, and prosthetic devices to enhance movement and communication abilities.

EMAIL: --- (no email address specified)

Let me know if you need any further assistance!

ValueError: not enough values to unpack (expected 3, got 2)

In [123]:
def extract_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    return emails

In [125]:
def extract_urls(text):
    url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[\w/\-?=%.]+\b'
    urls = re.findall(url_pattern, text)
    return urls

In [149]:


def extract_urls_with_clean_labels(text):
    soup = BeautifulSoup(text, 'html.parser')
    urls_labels = []
    for tag in soup.find_all('a', href=True):
        url = tag['href']
        label = tag.get_text()  # Cleanly extracts text, stripping out HTML
        urls_labels.append((label, url))
    return urls_labels


In [155]:
urls_and_text = extract_urls_with_clean_labels(trimmed_str)
urls_text = ''
for u in urls_and_text:
    urls_text += str(u) + '\n'

In [156]:
input_to_llm = "=== \n" + urls_text + ' \n === \n ' + "From the above pairs of anchor tag inner text and URL, which URL might be professor's lab website or personal website"
for chunks in llm.stream(input_to_llm):
        output_phase1 += chunks
        print(chunks, end = '')

The pair that stands out is:

('Lab Website', 'https://www.gangulylab.org/') 

This suggests that Professor Ganguly's laboratory website can be found at this URL.

In [142]:
headers = ["Name", "Instiute Webpage URL", "Personal webpage", "email", "Work Summary"]
keys = ["name", "url", "personal_url", "email", "work_summary"]

# Create a CSV file
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)

    # Write headers
    writer.writeheader()

    # Write data rows
    for item in data_dict['data']:
        row = {header: item[key] for header, key in zip(headers, keys)}
        writer.writerow(row)



KeyError: 'personal_url'

In [1]:
import re

def is_valid_url(url):
    # Regex pattern for a simple URL
    pattern = re.compile(
        r'^https?://'  # Start with http:// or https://
        r'[\w.-]+'     # Domain name part allows alphanumeric, dot, and hyphen
        r'\.\w+'       # TLD part, expects at least one dot followed by alphanumeric characters
        r'\/[\w\/.-]*'  # Path that can include alphanumeric, slash, dot, and hyphen
        r'$',          # End of the string
        re.IGNORECASE) # Case insensitive match

    # Match the URL against the pattern
    if pattern.match(url):
        return True
    return False

# Test URL
test_url = "https://research/faculty/9"

# Check if the URL is valid
is_valid = is_valid_url(test_url)

# Print result
print("Is the URL valid?", is_valid)


Is the URL valid? False
