In [None]:
from langchain.document_loaders import UnstructuredURLLoader, UnstructuredFileLoader
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.docstore.document import Document

from unstructured.cleaners.core import remove_punctuation, clean, clean_extra_whitespace
from urllib.parse import urlparse

def is_url(url):
  try:
    result = urlparse(url)
    return all([result.scheme, result.netloc])
  except ValueError:
    return False
  
def generate_document(url):
    
    fake_head = {
              'User-Agent': 'My User Agent 1.0',
              "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*"
              ";q=0.8",
              "Accept-Language": "en-US,en;q=0.5",
              "Referer": "https://www.google.com/",
              "DNT": "1",
              "Connection": "keep-alive",
              "Upgrade-Insecure-Requests": "1",
          }
    try:
      if is_url(url):
          loader = UnstructuredURLLoader(urls=[url],
                                        mode="elements",
                                        post_processors=[clean,remove_punctuation,clean_extra_whitespace], 
                                        headers=fake_head)
      elif url.endswith('.jpg'):
        loader = UnstructuredImageLoader(url,
                                        mode="elements",
                                        post_processors=[clean,remove_punctuation,clean_extra_whitespace], 
                                        headers=fake_head)
      else:
          loader = UnstructuredFileLoader(url, 
                                  strategy="fast", 
                                  mode="elements",
                                  post_processors=[clean,remove_punctuation,clean_extra_whitespace], 
                                  headers=fake_head)
      elements = loader.load()
      # print(f'elements {elements}')
      selected_elements = [e for e in elements if e.metadata['category']=="NarrativeText" or e.metadata['category']=="Title"]
      # print(f'selected_elements {selected_elements}')
      full_clean = " ".join([e.page_content for e in selected_elements])
      return Document(page_content=full_clean, metadata={"source":url})
    except:
       print(f'*ERROR* {url}')

In [None]:
import pandas as pd

df = pd.read_csv('url_department_2.csv')
df.tail()

In [4]:
from tqdm import tqdm
import time

content_dict = {'url': [],'timestamp': [], 'content': []}

for url in tqdm(df['url']):
    if url.endswith('zip'):
        continue
    content = generate_document(url)
    if content is None:
        continue
    content_dict['url'].append(url)
    content_dict['timestamp'].append(time.time())
    content_dict['content'].append(content.page_content)

  0%|          | 8/2510 [00:55<1:48:09,  2.59s/it]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/dTVOC36yz3o?fs=1&autoplay=1&rel=1


  1%|          | 14/2510 [00:56<37:06,  1.12it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //rx.mc.ntu.edu.tw/myDOP/SCENE/ABOUT/mainabout.php?rub=about//9


  3%|▎         | 64/2510 [01:24<42:55,  1.05s/it]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
  3%|▎         | 66/2510 [01:24<25:35,  1.59it/s]

*ERROR* //youtube.com/embed/WkNbq-r3Kvs?fs=1&autoplay=1&rel=1


  5%|▌         | 132/2510 [02:21<17:31,  2.26it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
  5%|▌         | 136/2510 [02:21<08:40,  4.56it/s]

*ERROR* //youtube.com/embed/EBmgbL7Q4vU?fs=1&autoplay=1&rel=1


  7%|▋         | 186/2510 [02:42<20:50,  1.86it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPage.php?malangue=&myrub=introduction//1-4


  8%|▊         | 212/2510 [03:48<22:57,  1.67it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
  9%|▊         | 214/2510 [03:48<14:39,  2.61it/s]

*ERROR* //youtube.com/embed/NWvvS07UCio?fs=1&autoplay=1&rel=1


 10%|█         | 259/2510 [04:35<41:48,  1.11s/it]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/PsxjFZodeQY?fs=1&autoplay=1&rel=1


 16%|█▌        | 399/2510 [06:11<25:17,  1.39it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/PRIRS/


 18%|█▊        | 440/2510 [07:15<19:35,  1.76it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/tBMxbwHXN44?fs=1&autoplay=1&rel=1


 18%|█▊        | 447/2510 [07:19<24:30,  1.40it/s]Error fetching or processing https://ems.ntu.edu.tw/infosession/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /infosession/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7b32750>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 19%|█▊        | 465/2510 [07:27<25:46,  1.32it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/rxc_VIGDY-0?fs=1&autoplay=1&rel=1


 19%|█▉        | 484/2510 [07:35<13:43,  2.46it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPageII.php?malangue=&myrub=featured%20courses//1-2


 21%|██        | 527/2510 [07:57<17:05,  1.93it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 21%|██        | 530/2510 [07:57<09:11,  3.59it/s]

*ERROR* //sp.mc.ntu.edu.tw/mySOP/


 22%|██▏       | 544/2510 [08:08<18:03,  1.81it/s]Error fetching or processing https://ems.ntu.edu.tw/campus_introduction/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /campus_introduction/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7b28550>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 23%|██▎       | 589/2510 [08:53<11:52,  2.69it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPage.php?malangue=&myrub=introduction//1-1


 24%|██▎       | 595/2510 [08:59<26:13,  1.22it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPage.php?malangue=&myrub=introduction//1-3


 26%|██▌       | 641/2510 [09:32<06:34,  4.73it/s]  Error fetching or processing https://ems.ntu.edu.tw/core/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /core/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e35550>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 26%|██▋       | 659/2510 [09:37<06:34,  4.69it/s]Error fetching or processing https://ems.ntu.edu.tw/faq/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /faq/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7f0b1d0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 28%|██▊       | 705/2510 [10:20<16:11,  1.86it/s]  Error fetching or processing https://ems.ntu.edu.tw/alliance/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /alliance/ (Caused by

*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPage.php?malangue=&myrub=location


 30%|███       | 755/2510 [11:11<28:46,  1.02it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //rx.mc.ntu.edu.tw/myDOP/INNERPAGE/


 30%|███       | 759/2510 [11:12<16:48,  1.74it/s]Error fetching or processing https://ems.ntu.edu.tw/forum_debate/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /forum_debate/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e2e810>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 33%|███▎      | 819/2510 [12:15<16:25,  1.72it/s]  javaldx: Could not find a Java Runtime Environment!
Please ensure that a JVM and the package libreoffice-java-common
is installed.
If it is already installed then try removing ~/.config/libreoffice/4/user/config/javasettings_Linux_*.xml
 34%|███▍      | 857/2510 [12:41<10:52,  2.53it/s]Error fetching or processing https://ems.ntu.edu.tw/news/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /news/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33bc7d2bd0>: Faile

*ERROR* //youtube.com/embed/CtuRPXfUPNg?fs=1&autoplay=1&rel=1


 43%|████▎     | 1084/2510 [15:34<22:28,  1.06it/s]  Error fetching or processing https://ems.ntu.edu.tw/visiting_activities/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /visiting_activities/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e6eed0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 44%|████▍     | 1105/2510 [15:41<11:49,  1.98it/s]Error fetching or processing http://tul.blog.ntu.edu.tw/archives/20907, exception: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
 44%|████▍     | 1112/2510 [15:41<04:02,  5.77it/s]Error fetching or processing https://ems.ntu.edu.tw/entrepreneurship_innovation_forum/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /entrepreneurship_innovation_forum/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e32190>: Faile

*ERROR* //rx.mc.ntu.edu.tw/NMRbooking/


 59%|█████▉    | 1482/2510 [21:36<15:46,  1.09it/s]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPage.php?&myrub=donation


 61%|██████▏   | 1543/2510 [23:27<26:30,  1.65s/it]  The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 62%|██████▏   | 1546/2510 [23:28<12:16,  1.31it/s]

*ERROR* //rx.mc.ntu.edu.tw/alumni/


 63%|██████▎   | 1591/2510 [23:54<06:51,  2.23it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 64%|██████▎   | 1594/2510 [23:55<03:45,  4.06it/s]

*ERROR* //youtube.com/embed/4UuLsctO-1M?fs=1&autoplay=1&rel=1


 67%|██████▋   | 1677/2510 [24:57<17:59,  1.30s/it]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 67%|██████▋   | 1680/2510 [24:57<08:41,  1.59it/s]

*ERROR* //youtube.com/embed/aA37XK2qO1s?fs=1&autoplay=1&rel=1


 67%|██████▋   | 1683/2510 [24:58<06:08,  2.24it/s]Error fetching or processing https://ems.ntu.edu.tw/concept/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /concept/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7d21f10>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 70%|███████   | 1757/2510 [25:25<03:26,  3.65it/s]Error fetching or processing https://ems.ntu.edu.tw/admissions_activities/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /admissions_activities/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7ec2790>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 71%|███████   | 1775/2510 [25:52<41:34,  3.39s/it]Error fetching or processing https://ems.ntu.edu.tw/visit/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries ex

*ERROR* //rx.mc.ntu.edu.tw/MRBS/


 75%|███████▍  | 1876/2510 [26:39<06:34,  1.61it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //sp.mc.ntu.edu.tw/mySOP/sopPageII.php?malangue=&myrub=calendar


 78%|███████▊  | 1962/2510 [27:44<09:23,  1.03s/it]Error fetching or processing https://ems.ntu.edu.tw/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e6d250>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 80%|███████▉  | 1999/2510 [28:08<05:48,  1.47it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/yfOOUw_uSgM?fs=1&autoplay=1&rel=1


 81%|████████  | 2035/2510 [28:20<04:26,  1.78it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/JpNoV3WQs1A?fs=1&autoplay=1&rel=1


 82%|████████▏ | 2050/2510 [28:24<02:16,  3.37it/s]Error fetching or processing https://ems.ntu.edu.tw/group/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /group/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e6c350>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 91%|█████████ | 2272/2510 [30:22<02:34,  1.54it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 91%|█████████ | 2274/2510 [30:22<01:42,  2.29it/s]

*ERROR* //youtube.com/embed/rvO5EmhkaDg?fs=1&autoplay=1&rel=1


 93%|█████████▎| 2323/2510 [31:16<09:40,  3.10s/it]Error fetching or processing https://ems.ntu.edu.tw/activities/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /activities/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7cf9590>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 94%|█████████▍| 2367/2510 [31:48<00:27,  5.29it/s]Error fetching or processing https://ems.ntu.edu.tw/alumni/, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: /alumni/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7e46750>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 96%|█████████▌| 2399/2510 [32:15<00:21,  5.19it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/kHwllIhjScI?fs=1&autoplay=1&rel=1


 96%|█████████▋| 2416/2510 [32:27<01:07,  1.39it/s]Error fetching or processing https://ems.ntu.edu.tw, exception: HTTPSConnectionPool(host='ems.ntu.edu.tw', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f33d7bd6650>: Failed to establish a new connection: [Errno -2] Name or service not known'))
 98%|█████████▊| 2456/2510 [32:44<00:21,  2.52it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.
 98%|█████████▊| 2458/2510 [32:44<00:13,  3.74it/s]

*ERROR* //youtube.com/embed/lWL8bcnFo3c?fs=1&autoplay=1&rel=1


 99%|█████████▉| 2496/2510 [33:34<00:05,  2.53it/s]The headers kwarg is set but the url kwarg is not. The headers kwarg will be ignored.


*ERROR* //youtube.com/embed/VwJkXbHUfO0?fs=1&autoplay=1&rel=1


100%|██████████| 2510/2510 [33:58<00:00,  1.23it/s]


In [5]:
df = pd.DataFrame(content_dict)
df.head()

Unnamed: 0,url,timestamp,content
0,https://webpageprodvm.ntu.edu.tw/SOTRC/News_Li...,1703259000.0,跳到主要內容區塊 船舶及海洋技術研究中心 首頁大圖 手機版選單 search 搜尋 搜尋 分...
1,https://www.lis.ntu.edu.tw/?p=11315,1703259000.0,English Version 國立臺灣大學圖書資訊學系 Department and G...
2,https://studyabroad.ntu.edu.tw/exchange-studen...,1703259000.0,Skip to content 國際交換學生甄選系統登入 國際交換學生甄選系統登入 Main...
3,https://rsprc.ntu.edu.tw/zh-tw/m06-3/upcoming-...,1703259000.0,【活動報名】國際學者訪問交流講座 The Electrification of Mobili...
4,https://ord.ntu.edu.tw/w/ordNTUplan/Login,1703259000.0,跳到主要內容 文字icon - 會員登入 關閉 搜尋 本校教職員生 校外人士


In [8]:
sum(df['content']=='')

348

In [7]:
df.to_csv('document/department_2_content.csv', index=False)

In [None]:
# import pandas as pd

# df2_urls = pd.read_csv('url_department_2_update.csv')
# df2_urls.head()

In [None]:
# from tqdm import tqdm
# import time

# content_dict = {'url': [],'timestamp': [], 'content': []}

# for url in tqdm(df2_urls['url']):
#     if url.endswith('zip'):
#         continue
#     content = generate_document(url)
#     if content is None:
#         continue
#     content_dict['url'].append(url)
#     content_dict['timestamp'].append(time.time())
#     content_dict['content'].append(content.page_content)

In [None]:
# df2 = pd.DataFrame(content_dict)
# df2.tail()

In [None]:
# df2.to_csv('document/department_2_content.csv', index=False)

In [None]:
# len(df2)

In [None]:
# sum([1 for i in df2['content'] if i==''])

In [None]:
# def summarize_document(url):
#  "Given an URL return the summary from OpenAI model"
#  openai_key = ""
#  llm = OpenAI(model_name='ada',temperature=0,openai_api_key=openai_key)
#  chain = load_summarize_chain(llm, chain_type="stuff")
#  tmp_doc = generate_document(url)
#  summary = chain.run([tmp_doc])
#  return clean_extra_whitespace(summary)

In [5]:
from bs4 import BeautifulSoup

html = 'http://ntumisrc.blogspot.com/feeds/posts/default'
soup = BeautifulSoup(html,"html.parser")
text = soup.get_text(strip=True)
print(text)

http://ntumisrc.blogspot.com/feeds/posts/default


  soup = BeautifulSoup(html,"html.parser")
