# Imports and API Keys here

In [1]:
import os
import requests
import openai
import tiktoken
from langchain.text_splitter import TokenTextSplitter
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd
import pprint
import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

## Base hyperparameters here (No need to adjust)

# No information default
NO_INFO = "No info"

# maximum token length for ChatGPT
MAX_TOKENS = 3000

# max timeout to wait for url opening
timeout = 2

# whether or not to run unit tests. Once the notebook is stable can set this to False
unit_tests = False

#API Keys
os.environ['OPENAI_API_TOKEN'] = 'YOUR_API_KEY_HERE'
api_key = 'YOUR_API_KEY_HERE'
cx = 'YOUR_ENGINE_TOKEN_HERE'

## Tune your search terms here
- modify the query
- modify the constraint of the type of websites you want
- modify the output format you would want

In [None]:
# Query format
query = 'Large Language Model courses'

# Keywords to be contained in secondary sites. 
# Will add url to list if any one matches. Secondary sites defined as links that are on the urls from Google Search
keyword_list = ['course', 'about', 'learn']

# Unwanted url segments. Any url with these inside usually means proprietary sites, or irrelevant (job) sites
unwanted_url = []

# Number of secondary urls to extract each site. GPT will pick the most relevant ones
max_url_per_site = 3

# Number of google queries to search for. Increasing this increases the breadth of search, but you only have 100 searches a day
num_google_queries = 3

# Whether to take top 20 search results (default is 10)
twenty = False

# Output format. Key is the name of the spreadsheet header, Value is the description of what is to be there
output_format = {"Course Name": "Name of course", "Course Website": "Website of course", 
                     "Broad Course Description": "Description of course",
                     "Cost": "Amount of Money for Course",
                     "Time": "Expected amount of time",
                     "Venue": ["Physical <location>","Online <website>"],
                     "Company": "Company offering the course"}

# Base index. When doing the final table, what will be the reference to merge entries
base_index = "Course Website"

## Start a headless browser
- This will be used for scraping all web pages

In [5]:
def start_driver():
    try:
        driver.quit()
        del driver
        print('Deleting existing driver')
    except Exception as e:
        print('No existing driver to delete')
        
    print('Initializing new driver')
    options = Options()
    options.add_argument("--headless")
    
    # Set the User-Agent
    options.add_argument("user-agent=Mozilla")

    # Set the option to accept all SSL certificates by default
    options.add_argument('--ignore-certificate-errors')

    # initialize a browser
    driver = webdriver.Firefox(options=options)
    print('Driver initialized:', driver)
    return driver

In [6]:
driver = start_driver()

No existing driver to delete
Initializing new driver
Driver initialized: <selenium.webdriver.firefox.webdriver.WebDriver (session="8e4cb1c6-576b-4868-9983-f02dca16399c")>


# Helper Functions

In [7]:
def strict_output(system_prompt, user_prompt, output_format, default_category = "", output_value_only = False,
                  model = 'gpt-3.5-turbo', temperature = 0, num_tries = 2, verbose = False):
    ''' Ensures that OpenAI will always adhere to the desired output json format. 
    Uses rule-based iterative feedback to ask GPT to self-correct.
    Keeps trying up to num_tries it it does not. Returns empty json if unable to after num_tries iterations.
    If output field is a list, will treat as a classification problem and output best classification category.
    Text enclosed within < > will generated by GPT accordingly'''

    # if the user input is in a list, we also process the output as a list of json
    list_input = isinstance(user_prompt, list)
    # if the output format contains dynamic elements of < or >, then add to the prompt to handle dynamic elements
    dynamic_elements = '<' in str(output_format)
    # if the output format contains list elements of [ or ], then we add to the prompt to handle lists
    list_output = '[' in str(output_format)
    
    # start off with no error message
    error_msg = ''
    
    for i in range(num_tries):
        
        output_format_prompt = f'''\nYou are to output the following in json format: {output_format}. 
Do not put quotation marks or escape character \ in the output fields.'''
        
        if list_output:
            output_format_prompt += f'''\nIf output field is a list, classify output into the best element of the list.'''
        
        # if output_format contains dynamic elements, process it accordingly
        if dynamic_elements: 
            output_format_prompt += f'''
Any text enclosed by < and > indicates you must generate content to replace it. Example input: Go to <location>, Example output: Go to the garden
Any output key containing < and > indicates you must generate the key name to replace it. Example input: {{'<location>': 'description of location'}}, Example output: {{school: a place for education}}'''

        # if input is in a list format, ask it to generate json in a list
        if list_input:
            output_format_prompt += '''\nGenerate a list of json, one json for each input element.'''
            
        # Use OpenAI to get a response
        response = openai.ChatCompletion.create(
          temperature = temperature,
          model=model,
          messages=[
            {"role": "system", "content": system_prompt + output_format_prompt + error_msg},
            {"role": "user", "content": str(user_prompt)}
          ]
        )

        res = response['choices'][0]['message']['content'].replace('\'', '"')
        
        # ensure that we don't replace away aprostophes in text 
        res = re.sub(r"(\w)\"(\w)", r"\1'\2", res)

        if verbose:
            print('System prompt:', system_prompt + output_format_prompt + error_msg)
            print('\nUser prompt:', str(user_prompt))
            print('\nGPT response:', res)
        
        # try-catch block to ensure output format is adhered to
        try:
            output = json.loads(res)
            if isinstance(user_prompt, list):
                if not isinstance(output, list): raise Exception("Output format not in a list of json")
            else:
                output = [output]
                
            # check for each element in the output_list, the format is correctly adhered to
            for index in range(len(output)):
                for key in output_format.keys():
                    # unable to ensure accuracy of dynamic output header, so skip it
                    if '<' in key or '>' in key: continue
                    # if output field missing, raise an error
                    if key not in output[index]: raise Exception(f"{key} not in json output")
                    # check that one of the choices given for the list of words is an unknown
                    if isinstance(output_format[key], list):
                        choices = output_format[key]
                        # ensure output is not a list
                        if isinstance(output[index][key], list):
                            output[index][key] = output[index][key][0]
                        # output the default category (if any) if GPT is unable to identify the category
                        if output[index][key] not in choices and default_category:
                            output[index][key] = default_category
                        # if the output is a description format, get only the label
                        if ':' in output[index][key]:
                            output[index][key] = output[index][key].split(':')[0]
                            
                # if we just want the values for the outputs
                if output_value_only:
                    output[index] = [value for value in output[index].values()]
                    # just output without the list if there is only one element
                    if len(output[index]) == 1:
                        output[index] = output[index][0]
                    
            return output if list_input else output[0]

        except Exception as e:
            error_msg = f"\n\nResult: {res}\n\nError message: {str(e)}"
            print("An exception occurred:", str(e))
            print("Current invalid json format:", res)
         
    return {}

In [8]:
def extract_unique_urls(url, unwanted_url = [], keyword_list=[]):
    ''' Given the original url, find out the urls containted on that page which are unique and contain the keyword '''
    try:
        response = requests.get(url, timeout = timeout)
    except Exception as e:
        print(f'Unable to access {url}')
        return []
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the HTML elements that contain the URLs you want to extract
    url_elements = soup.select('a[href]')  # Select all anchor tags with an href attribute

    # Extract the URLs from the HTML elements
    url_list = []
    url_by_keyword = {}
    for element in url_elements:
        url = element['href']
        if 'https://' not in url: continue
        
        # if url is not relevant, skip it
        relevant = True
        for each in unwanted_url:
            if each in url: relevant = False
        if not relevant: continue
        
        relevant = False
        for keyword in keyword_list:
            if keyword in url: relevant = True
        if not relevant: continue
        url_list.append(url)

    # Process the URLs or print the list
    return list(set(url_list))

In [9]:
def get_root_url(url):
    parsed_url = urlparse(url)
    root_url = parsed_url.scheme + '://' + parsed_url.netloc
    return root_url

In [10]:
def spaced_text(soup):
    return " ".join(t.strip() for t in soup.findAll(string=True) if t.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]'])

def view_url(url, driver=driver, timeout = timeout):
    ''' Views how GPT would see a webpage '''
    try:
        # Get the webpage
        driver.get(url)

        # Let's wait for the JavaScript to run with an implicit wait
        # Here we wait up to timeout seconds for the elements to become available
        driver.implicitly_wait(timeout)

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Send a GET request
        # response = requests.get(url, timeout = timeout)
        # Parse the HTML content
        # soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract metadata information
        text_content = 'Metadata:\n'
        meta_tags = soup.find_all("meta")
        for meta_tag in meta_tags:
            # Get the 'name' and 'content' attributes if present
            name = meta_tag.get("name")
            contents = meta_tag.get("content")
            if name and contents:
                text_content += f"{name}: {contents}\n"
                
        text_content = text_content + '\nMain Text:\n' + spaced_text(soup)

        # Do space processing
        text_content = re.sub('\n+', '\n', text_content)
        text_content = re.sub('\ {2,}', ' ', text_content)

    except Exception as e:
        print(e)
        return 'Unable to retrieve data'

    return text_content

## Unit Tests

In [13]:
url = "https://learn.deeplearning.ai/"
out = extract_unique_urls(url, keyword_list = ['course', 'about', 'learn'])
print(out)

[]


In [14]:
url = 'https://towardsdatascience.com/stable-diffusion-as-an-api-5e381aec1f6'
root_url = get_root_url(url)
print(root_url)

https://towardsdatascience.com


In [15]:
driver = start_driver()
# always do this to test. This website needs security clearance
url = "https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/web_base"
out = view_url(url, driver = driver)
print(out)

No existing driver to delete
Initializing new driver
Driver initialized: <selenium.webdriver.firefox.webdriver.WebDriver (session="d1c27335-d010-46ec-b4c7-ddc196c76152")>
Metadata:
generator: Docusaurus v2.4.0
twitter:card: summary_large_image
twitter:image: https://python.langchain.com/img/parrot-chainlink-icon.png
docusaurus_locale: en
docsearch:language: en
docusaurus_version: current
docusaurus_tag: docs-default-current
docsearch:version: current
docsearch:docusaurus_tag: docs-default-current
description: This covers how to use WebBaseLoader to load all text from HTML webpages into a document format that we can use downstream. For more custom logic for loading webpages look at some child class examples such as IMSDbLoader, AZLyricsLoader, and CollegeConfidentialLoader
viewport: width=device-width, initial-scale=1.0
Main Text:


# Main Program

In [21]:
# Use GPT-3.5-turbo to get good queries for Google Search
res = strict_output(system_prompt = f'''You are a helpful assistant meant to design google web queries to find information. Give {num_google_queries} suitable queries to get information corresponding to what the user wants.''',
                    user_prompt = f'''Base Query: {query}, Output Format: {output_format}''', 
                    output_format = {"query"+str(i):"query text" for i in range(num_google_queries)},
                    output_value_only = True)

search_terms = res
print(search_terms)

['Large Language Model courses online', 'Language Model training courses', 'Best courses for Large Language Models']


In [22]:
# Get the search results from google 
# NOTE: Do not run this cell too often, you only have 100 search API calls a day!

datalist = []
# this is for the first 10 sites for each search term
for search_term in search_terms:
    # Send a GET request to the Custom Search API
    response = requests.get(f'https://www.googleapis.com/customsearch/v1?key={api_key}&cx={cx}&q={search_term}')
    
    # Extract the relevant information from the response
    data = response.json()
    
    datalist.append(data)

# this one is for the next 10 sites for each search term
if twenty:
    for search_term in search_terms:
        # Send a GET request to the Custom Search API
        response = requests.get(f'https://www.googleapis.com/customsearch/v1?key={api_key}&cx={cx}&q={search_term}&start=10')

        # Extract the relevant information from the response
        data = response.json()

        datalist.append(data)

In [28]:
# Use the Google results and populate the URL dictionary
urldict = {}
mainurllist = []
for num, data in enumerate(datalist):
    print(f'Doing split {num+1} out of {len(datalist)}, search term: {search_terms[num%len(search_terms)]}')
    # Process the search results and get list of secondary sources
    items = data.get('items', [])
    for item in items:
        title = item.get('title')
        snippet = item.get('snippet')
        url = item.get('link')
        # if url has been done before, skip it
        if url in urldict: continue
        
        # if url is not relevant, skip it
        relevant = True
        for each in unwanted_url:
            if each in url: relevant = False
        if not relevant: continue
            
        urldict[url] = title
        mainurllist.append(url)
        
        ## Secondary Links from primary link
        new_urls = extract_unique_urls(url, unwanted_url = unwanted_url, keyword_list = keyword_list)
        print(f'Main url: {url}, Secondary url: {new_urls}')
        for new_url in new_urls:
            # if url has been done before, skip it
            if new_url in urldict: continue
            # populate the new url dictionary
            urldict[new_url] = title

mainurllist = list(set(mainurllist))
print(urldict)

Doing split 1 out of 3, search term: Large Language Model courses online
Main url: https://www.kdnuggets.com/2023/03/top-free-courses-large-language-models.html, Secondary url: ['https://www.cs.princeton.edu/courses/archive/fall22/cos597G/', 'https://www.kdnuggets.com/courses/index.html', 'https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Fwww.kdnuggets.com%2F2023%2F03%2Ftop-free-courses-large-language-models.html&linkname=Top%20Free%20Courses%20on%20Large%20Language%20Models', 'https://www.kdnuggets.com/2023/07/always-learning-ai-prevents-data-breaches.html', 'https://www.addtoany.com/add_to/facebook?linkurl=https%3A%2F%2Fwww.kdnuggets.com%2F2023%2F03%2Ftop-free-courses-large-language-models.html&linkname=Top%20Free%20Courses%20on%20Large%20Language%20Models', 'https://www.kdnuggets.com/2021/03/3-more-free-nlp-courses.html', 'https://www.kdnuggets.com/2023/07/5-free-courses-chatgpt.html', 'https://www.kdnuggets.com/tag/machine-learning', 'https://www.addtoany.com/add_to/linke

In [31]:
print('Original main search url numbers:', len(mainurllist))

Original main search url numbers: 17


In [32]:
print('Original total search url numbers:', len(urldict))

Original total search url numbers: 296


## Filter
- Get at most max_url_per_site sites per main site (title as key)
- Use GPT to decide which sites are more important

In [33]:
## Get new dictionary with title as key and list of urls as value
titledict = {}
for key, value in urldict.items():
    value = value.replace('’','\'')
    if value not in titledict:
        titledict[value] = [key]
    else:
        titledict[value].append(key)

In [34]:
newurldict = {}
for key, value in titledict.items():
    ## curate each site to only top max_url_per_site entries
    if len(value) > max_url_per_site:
        res = strict_output(system_prompt = f'''You are a helpful assistant given the urls and title of the urls and are meant to filter the urls which match the query "{query}" into the given output format: {output_format}
Return the {max_url_per_site} most relevant urls for the query "{query}"''',
            user_prompt = f'''Title: {key}, URLs: {value[:50]}''',
            output_format = {"URL_List": "URL list in a string separated by space"})
        # if fail, just pick first max_url_per_site
        if res == {}:
            print('GPT filtering failed, doing manual filtering')
            value = value[:max_url_per_site]
            for each in value:
                if 'http' in each: newurldict[each] = key
        else:
            print(f'Original list: {value}')
            value = res['URL_List'].split(' ')
            print(f'Curated list: {value}')
            for each in value:
                if 'http' in each: newurldict[each] = key
    else:
        for each in value:
            if 'http' in each: newurldict[each] = key

Original list: ['https://www.kdnuggets.com/2023/03/top-free-courses-large-language-models.html', 'https://www.cs.princeton.edu/courses/archive/fall22/cos597G/', 'https://www.kdnuggets.com/courses/index.html', 'https://www.addtoany.com/add_to/email?linkurl=https%3A%2F%2Fwww.kdnuggets.com%2F2023%2F03%2Ftop-free-courses-large-language-models.html&linkname=Top%20Free%20Courses%20on%20Large%20Language%20Models', 'https://www.kdnuggets.com/2023/07/always-learning-ai-prevents-data-breaches.html', 'https://www.addtoany.com/add_to/facebook?linkurl=https%3A%2F%2Fwww.kdnuggets.com%2F2023%2F03%2Ftop-free-courses-large-language-models.html&linkname=Top%20Free%20Courses%20on%20Large%20Language%20Models', 'https://www.kdnuggets.com/2021/03/3-more-free-nlp-courses.html', 'https://www.kdnuggets.com/2023/07/5-free-courses-chatgpt.html', 'https://www.kdnuggets.com/tag/machine-learning', 'https://www.addtoany.com/add_to/linkedin?linkurl=https%3A%2F%2Fwww.kdnuggets.com%2F2023%2F03%2Ftop-free-courses-large-

In [35]:
print(f'Initial curated number: {len(newurldict)}')
impturl = list(newurldict.keys())

Initial curated number: 41


In [51]:
## Add back urls from main google search results, as they are most beneficial
for url in mainurllist:
    if url not in newurldict:
        newurldict[url] = urldict[url]

In [52]:
print(f'Final curated number: {len(newurldict)}')
impturl = list(newurldict.keys())

Final curated number: 42


In [53]:
impturl

['https://www.kdnuggets.com/2023/03/top-free-courses-large-language-models.html',
 'https://www.cs.princeton.edu/courses/archive/fall22/cos597G/',
 'https://huggingface.co/course/chapter1/1',
 'https://www.databricks.com/blog/enroll-our-new-expert-led-large-language-models-llms-courses-edx',
 'https://www.edx.org/course/large-language-models-foundation-models-from-the-ground-up',
 'https://www.edx.org/course/large-language-models-application-through-production',
 'https://medium.com/geekculture/top-resoruces-to-learn-understand-large-language-models-4d339f7b685d',
 'https://medium.com/about?autoplay=1&source=post_page-----4d339f7b685d--------------------------------',
 'https://about.medium.com/creators/?source=post_page-----4d339f7b685d--------------------------------',
 'https://www.edx.org/professional-certificate/databricks-large-language-models',
 'https://www.coursera.org/learn/introduction-to-large-language-models',
 'https://stanford-cs324.github.io/winter2022/',
 'https://canv

## Get the data from the curated list

In [54]:
driver = start_driver()

No existing driver to delete
Initializing new driver
Driver initialized: <selenium.webdriver.firefox.webdriver.WebDriver (session="79a19270-846e-44e3-8bd7-8a78437bf251")>


In [55]:
impturl = impturl[:10]

In [56]:
# if we start from scatch, reset content
content = {}
irrelevant_url = []
currentnum = 0

In [58]:
## Get the data from the websites
for num, url in enumerate(impturl):
    
    # to help in case we have runtime error, simply continue from those that have been done
    if num < currentnum: continue
    currentnum = max(num, currentnum)
    
    print(url, f'(URL #{num+1} of {len(impturl)})')
    text_content = view_url(url, driver=driver)
    
    # filter the text into manageable bits for the parser
    text_splitter = TokenTextSplitter(chunk_size=MAX_TOKENS, chunk_overlap=100)
    texts = text_splitter.split_text(text_content)
    
    root_url = get_root_url(url)
    
    # We only cap the number of chunks per site to be 4
    for text in texts[:4]:
        existing_entry = 'None'
        if root_url in content:
            existing_entry = content[root_url]

        # Use GPT-3.5-turbo to get information from website
        res = strict_output(system_prompt = f'''You are a helpful assistant meant to extract information from text that is related to {query}. If there is existing data, add on to it.
Each field in Existing Data should have a maximum of 50 words. If you are unsure about any of the output fields, output {NO_INFO}''',
                            user_prompt = f'''Url: {url}, Existing Data: {existing_entry}, Text: {text}''', 
                            output_format = output_format)
        
        if res=={}:
            print('Empty JSON output'); break
        if res[base_index] == NO_INFO: 
            print('Information not relevant')
            irrelevant_url.append(url)
            break
            
        content[root_url] = res
        print(res)

https://medium.com/about?autoplay=1&source=post_page-----4d339f7b685d-------------------------------- (URL #8 of 10)
{'Course Name': 'Top Resoruces to Learn & Understand Large Language Models', 'Course Website': 'https://medium.com/geekculture/top-resoruces-to-learn-understand-large-language-models-4d339f7b685d', 'Broad Course Description': 'Large Language Models (LLMs) have revolutionized the field of Natural Language Processing (NLP) by providing highly accurate and efficient ways of understanding and generating human language.', 'Cost': 'No info', 'Time': '10 min read', 'Venue': 'Online https', 'Company': 'Medium'}
https://about.medium.com/creators/?source=post_page-----4d339f7b685d-------------------------------- (URL #9 of 10)
Information not relevant
https://www.edx.org/professional-certificate/databricks-large-language-models (URL #10 of 10)
{'Course Name': 'Large Language Models: Application through Production', 'Course Website': 'https://www.edx.org/course/large-language-model

In [59]:
# make the content such that it merges all the available content together
final_content = {}
final_content_sources = {}
for key, value in content.items():
    name = value[base_index]
    # add the webpage to the source
    if name not in final_content_sources:
        final_content_sources[name] = [key]
    else:
        final_content_sources[name].append(key)
    
    if name not in final_content:
        final_content[name] = value
    else:
        # use GPT to match both outputs together
        existingvalue = final_content[name]
        currentvalue = name
        res = strict_output(system_prompt = f'''You are a helpful assistant assistant meant to combine two sources of information together factually.
If you are not sure about any of the output fields, output {NO_INFO}"''',
            user_prompt = f'''Source 1: {existingvalue}, Source 2: {currentvalue}''', 
            output_format = output_format)
        print(f'Combining outputs for {name}\nConsolidated Output: {res}')
        final_content[name] = res

In [60]:
# update the main dictionary with the information sources
for key in final_content.keys():
    final_content[key]['info_sources'] = str(final_content_sources[key]).replace('\'',' ').replace('[','').replace(']','').replace(',','')

## Curate the final output to see if the sources are relevant

In [61]:
curated_final_content = {}
for key, value in final_content.items():
    res = strict_output(system_prompt = f'''You are a helpful assistant assistant meant to see if a user input is relevant for the query: "{query}"
Output whether or not it is relevant.''',
        user_prompt = f'''{value}''', 
        output_format = {"Relevance": ["3: user input matches almost all of the query", 
                         "2: user input matches more than half of the query",
                         "1: user input matches at least one part of the query",
                         "0: user input does not any part of the query"]})
    if res == {}: continue
    value['Relevance'] = res['Relevance']
    curated_final_content[key] = value
    print(value)
    print(res['Relevance'])

{'Course Name': 'Top Free Courses on Large Language Models', 'Course Website': 'https://www.kdnuggets.com/2023/03/top-free-courses-large-language-models.html', 'Broad Course Description': 'Free courses and resources on large language models from Stanford, Princeton, ETH, and more.', 'Cost': 'Free', 'Time': 'No info', 'Venue': 'No info', 'Company': 'KDnuggets', 'info_sources': ' https://www.kdnuggets.com ', 'Relevance': '3'}
3
{'Course Name': 'COS 597G: Understanding Large Language Models', 'Course Website': 'https://www.cs.princeton.edu/courses/archive/fall22/cos597G/', 'Broad Course Description': 'This course covers cutting-edge research topics centering around pre-trained language models, including technical foundations, emerging capabilities, fine-tuning and adaptation, system design, and security and ethics.', 'Cost': 'No info', 'Time': 'No info', 'Venue': 'Physical Sherrerd Hall 101', 'Company': 'Princeton University', 'info_sources': ' https://www.cs.princeton.edu ', 'Relevance':

In [62]:
# Convert it into a document
# Convert dictionary to Excel spreadsheet
file_path = f'{query}.xlsx'
df = pd.DataFrame.from_dict(curated_final_content, orient = 'index')
df = df.sort_values(by='Relevance', ascending=False)
df.to_excel(file_path, index = False)

In [72]:
df

Unnamed: 0,Course Name,Course Website,Broad Course Description,Cost,Time,Venue,Company,info_sources,Relevance
https://www.kdnuggets.com/2023/03/top-free-courses-large-language-models.html,Top Free Courses on Large Language Models,https://www.kdnuggets.com/2023/03/top-free-cou...,Free courses and resources on large language m...,Free,No info,No info,KDnuggets,https://www.kdnuggets.com,3
https://www.cs.princeton.edu/courses/archive/fall22/cos597G/,COS 597G: Understanding Large Language Models,https://www.cs.princeton.edu/courses/archive/f...,This course covers cutting-edge research topic...,No info,No info,Physical Sherrerd Hall 101,Princeton University,https://www.cs.princeton.edu,3
https://www.databricks.com/blog/enroll-our-new-expert-led-large-language-models-llms-courses-edx,Large Language Models (LLMs) Courses on edX,https://www.databricks.com/blog/enroll-our-new...,The Large Language Models (LLMs) Courses on ed...,"Free for anyone to audit, nominal fee for acce...",Courses will begin Summer 2023,Online edX platform,Databricks,https://www.databricks.com,3
https://medium.com/geekculture/top-resoruces-to-learn-understand-large-language-models-4d339f7b685d,Top Resoruces to Learn & Understand Large Lang...,https://medium.com/geekculture/top-resoruces-t...,Large Language Models (LLMs) have revolutioniz...,No info,10 min read,Online https,Medium,https://medium.com,3
https://huggingface.co/course,The 🤗 Course,https://huggingface.co/course,This course teaches natural language processin...,Free,Approximately 6-8 hours per week,Online https,Hugging Face,https://huggingface.co,2
https://www.edx.org/course/large-language-models-application-through-production,Large Language Models: Application through Pro...,https://www.edx.org/course/large-language-mode...,"This course is aimed at developers, data scien...",Free,"6 weeks, 4-10 hours per week",Online edx.org,Databricks,https://www.edx.org,2


## Shut down browser

In [73]:
driver.quit()
del driver