<a href="https://colab.research.google.com/github/rdiliberto77/web_crawler_project/blob/main/chatgpt-web-crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

class WebCrawler:
   def __init__(self, start_url, visiting_strategy='preorder'):
       self.start_url = start_url
       self.visiting_strategy = visiting_strategy.lower()
       self.visited_urls = set()
       self.corpus = {}
       self.main_domain = urlparse(start_url).netloc

   def crawl(self, url, depth=0):
       if depth > 10:  # Limiting depth to avoid potential infinite loops
           print(f"Reached maximum depth for {url}")
           return

       if url not in self.visited_urls and self.is_same_domain(url):
           print(f"Visiting: {url}")
           self.visited_urls.add(url)
           try:
               response = requests.get(url)
               soup = BeautifulSoup(response.content, 'html.parser')
               title = soup.title.string.strip() if soup.title else 'Untitled'
               text_content = self.extract_text_content(soup)
               self.corpus[title] = text_content
               print(f"Text Content: {text_content[:100]}...")  # Output a snippet of text

               if self.visiting_strategy == 'preorder':
                   links = self.extract_links(soup)
                   for link in links:
                       self.crawl(link, depth + 1)

               # Additional visiting strategies (inorder, postorder) can be implemented here

           except Exception as e:
               print(f"Error crawling {url}: {e}")

   def extract_text_content(self, soup):
       # Extract text content only from the body of the HTML
       text_content = ' '.join([p.get_text(separator=' ', strip=True) for p in soup.body.find_all('p')])
       return text_content

   def extract_links(self, soup):
       # Extract all links from the page
       links = [link.get('href') for link in soup.find_all('a', href=True)]
       # Filter internal links only
       links = [urljoin(self.start_url, link) for link in links if link.startswith(('http', 'https'))]
       # Exclude PDF links
       links = [link for link in links if not link.endswith('.pdf')]
       # Filter out external links
       links = [link for link in links if self.is_same_domain(link)]
       # Exclude links with 'resources' in the URL
       links = [link for link in links if 'resources' not in link.lower()]
       return links

   def is_same_domain(self, url):
       return urlparse(url).netloc == self.main_domain

   def start_crawling(self):
       self.crawl(self.start_url)
       self.save_corpus()

   def save_corpus(self):
       # Save the extracted corpus to text files
       for title, content in self.corpus.items():
           with open(f"{title}.txt", 'w', encoding='utf-8') as file:
               file.write(content)

   def save_corpus(self):
        # Create a directory named 'corpus' if it doesn't exist
        if not os.path.exists('corpus'):
            os.makedirs('corpus')

        # Save the extracted corpus to text files in the 'corpus' directory
        for title, content in self.corpus.items():
            with open(os.path.join('corpus', f"{title}.txt"), 'w', encoding='utf-8') as file:
                file.write(content)

if __name__ == "__main__":
   # Get the starting URL from the user
   start_url = input("Enter the website's URL: ")

   # Instantiate the WebCrawler with the provided URL and visiting strategy
   crawler = WebCrawler(start_url=start_url, visiting_strategy='preorder')

   # Start crawling
   crawler.start_crawling()

Enter the website's URL: https://www.stjohns.edu
Visiting: https://www.stjohns.edu
Text Content: See how your journey aligns with what drives you. Lucas Shears Warren, RI Hannah M. Queens, NY Maria...
Visiting: https://www.stjohns.edu/who-we-are/faith-and-mission/campus-ministry/opportunities/plunge-program
Text Content: Plunges, or service immersion, are weeklong experiences where students are given the opportunity to ...
Visiting: https://www.stjohns.edu/who-we-are/history-and-facts/vincentian-heritage
Text Content: St. John’s University looks to St. Vincent de Paul (1581–1660), founder of the Congregation of the M...
Visiting: https://www.stjohns.edu/who-we-are/campus-sustainability
Text Content: Sustainability is a long-term responsibility to meet the needs of the present without compromising t...
Visiting: https://www.stjohns.edu/life-st-johns/career-services
Text Content: University Career Services is committed to ensuring student and alumni success. Our dedicated team o...
Visit



Error crawling https://www.stjohns.edu/files/phd-curriculum-and-instruction-essay-prompt-instructions: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/law/academics
Text Content: J.D. Programs LL.M. Programs Clinics Centers Co-Curricular Programs Study Abroad Course Catalog Acad...
Visiting: https://www.stjohns.edu/law/apply
Text Content: We seek to identify and admit a diverse group of talented J.D. students who will succeed at St. John...
Visiting: https://www.stjohns.edu/law/give
Text Content: Please note that this form is optimized for use with the current version of the Chrome browser. If y...
Visiting: https://www.stjohns.edu/law/law-career-development
Text Content: Current Students Alumni Employers Externship Program Career Development Team Graduate Employment Dat...
Reached maximum depth for https://www.stjohns.edu/law/apply
Reached maximum depth for https://www.stjohns.edu/law/give
Reached maximum depth for https://www.stjohns.edu/law/law-career



Error crawling https://www.stjohns.edu/sites/default/files/2020-03/M1-12853%20MA%20Chinese%20%26%20EAS.PDF: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/media/64841
Text Content: 8000 Utopia Parkway Queens NY 11439 718-990-2000 St. John’s University does not discriminate on the ...
Visiting: https://www.stjohns.edu/academics/schools/college-professional-studies
Text Content: The Lesley H. and William L. Collins College of Professional Studies is a launchpad for innovators, ...
Visiting: https://www.stjohns.edu/academics/faculty/wendell-cruz
Text Content: 8000 Utopia Parkway Queens NY 11439 718-990-2000 St. John’s University does not discriminate on the ...
Visiting: https://www.stjohns.edu/events?cals=3508970&start=2023-10-22&end=2023-11-22
Text Content: Please consult this calendar frequently as events are subject to change.  Loading upcoming events......
Visiting: https://www.stjohns.edu/about/news/all-news?school=31
Text Content: The competition wa



Error crawling https://www.stjohns.edu/sites/default/files/2023-10/School%20of%20Education%20Visiting%20Scholar%20Review%20Checklist.docx: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/academics/global-programs/office-international-education-inbound-programs/english-language-institute
Text Content: Join us on campus in New York City (NYC) for our innovative program that connects the classroom to t...
Visiting: https://www.stjohns.edu/academics/global-programs/english-language-and-american-culture-programs/educationusa-academy-and-academy-connects
Text Content: St. John's University is proud to offer two programs for secondary school students interested in lea...
Visiting: https://www.stjohns.edu/academics/global-programs/english-language-and-american-culture-programs/educationusa-academy-academy-connects#EdUSA
Text Content: St. John's University is proud to offer two programs for secondary school students interested in lea...
Visiting: https://www.stjo



Error crawling https://www.stjohns.edu/sites/default/files/2022-04/SJU%202022%20Instructions%20for%20Completing%20Summer%20Program%20Form.docx: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/sites/default/files/2022-04/SJU%202022%20Camp%20Emergency%20Contact%20%26%20Consent%20to%20Treat%20Form.docx




Error crawling https://www.stjohns.edu/sites/default/files/2022-04/SJU%202022%20Camp%20Emergency%20Contact%20%26%20Consent%20to%20Treat%20Form.docx: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/sites/default/files/2022-04/SJU%202022%20Meningitis%20Vaccination%20Parent%20Information%20%26%20Response%20Form.docx




Error crawling https://www.stjohns.edu/sites/default/files/2022-04/SJU%202022%20Meningitis%20Vaccination%20Parent%20Information%20%26%20Response%20Form.docx: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/life-st-johns?utm_source=TD%20Student%20Life&utm_medium=website&utm_campaign=2022%20Admissions%20Changes&utm_id=UGAdmissions
Text Content: New York is just one of the remarkable places the St. John’s community calls home.  With a lively Re...
Visiting: https://www.stjohns.edu/life-st-johns/residence-life?utm_source=TD%20Residence%20Life&utm_medium=website&utm_campaign=2022%20Admissions%20Changes&utm_id=UGAdmissions
Text Content: Residence Life aims to develop a residential community that supports and enhances the academic missi...
Visiting: https://www.stjohns.edu/life-st-johns/public-safety#contact
Text Content: St. John’s University Department of Public Safety provides safety and security services to our commu...
Visiting: https://www.stjohns.edu/abo



Error crawling https://www.stjohns.edu/files/2023-annual-security-and-fire-safety-report: 'NoneType' object has no attribute 'find_all'
Visiting: https://www.stjohns.edu/life-st-johns/public-safety
Text Content: St. John’s University Department of Public Safety provides safety and security services to our commu...
Visiting: https://www.stjohns.edu/life-st-johns/student-services/career-services/employers
Text Content: Our Mission: Preparing and empowering all students for their career journey, creating connections wi...
Visiting: https://www.stjohns.edu/life-st-johns/student-services/career-services
Text Content: University Career Services is committed to ensuring student and alumni success. Our dedicated team o...
Visiting: https://www.stjohns.edu/life-st-johns/student-services/bookstore
Text Content: Show your St. John's pride all year long with the University Bookstore's authentic assortment of col...
Visiting: https://www.stjohns.edu/about/leadership-and-administration/administrativ

FileNotFoundError: [Errno 2] No such file or directory: "corpus/Bachelor of Science / Juris Doctor | St. John's University.txt"