In [13]:
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import urljoin
import logging

# log to a file
logging.basicConfig(filename='smart_stories.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
log = logging.getLogger(__name__)

def get_smart_stories():
    # get the page
    base_url = 'https://www.thesmartfactory.io/'
    url = urljoin(base_url, 'smart-stories.html')
    log.info(f'Getting page {url}')
    r = requests.get(url)
    log.info(f'Got page {url} with status code {r.status_code}')
    log.info(f'Page content: {r.text}')
    r.raise_for_status()
    log.info('Parsing the page')
    soup = BeautifulSoup(r.text, 'html.parser')
    log.info(f'Page Parsed: {soup.contents}')
    log.info('Finding all the links')
    links = soup.find_all('a', href=re.compile('^smart-stories'))
    log.info(f'Found {len(links)} links')
    # get the href and the text
    log.info('If the link text is "Learn more", get the linked page')
    links = [(link['href'], link.text) for link in links if link.text.startswith('Learn more')]
    log.info(f'Found {len(links)} links with text "Learn more"')
    # get the text from the linked page
    stories = []
    for link in links:
        full_url = urljoin(base_url, link[0])
        log.info(f'Getting page {full_url}')
        r = requests.get(full_url)
        log.info(f'Got page {full_url} with status code {r.status_code}')
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        stories.append(soup.get_text())
        time.sleep(2)
    log.info(f'Collected {len(stories)} stories')
    return stories

if __name__ == '__main__':
    stories = get_smart_stories()
    # Save the stories to a txt file
    with open('smart_stories.txt', 'w', encoding='utf-8') as f:
        for story in stories:
            f.write(story)
            f.write('\n')
    log.info('Stories saved to smart_stories.txt')

2024-09-30 15:46:19,301 - INFO - Getting page https://www.thesmartfactory.io/smart-stories.html
2024-09-30 15:46:19,874 - INFO - Got page https://www.thesmartfactory.io/smart-stories.html with status code 200
2024-09-30 15:46:19,874 - INFO - Page content: <!DOCTYPE html>
<html lang="en">

<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>The Smart Factory - Smart Stories</title>

	<!-- Google Tag Manager -->
	<script>(function (w, d, s, l, i) {
			w[l] = w[l] || []; w[l].push({
				'gtm.start':
					new Date().getTime(), event: 'gtm.js'
			}); var f = d.getElementsByTagName(s)[0],
				j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : ''; j.async = true; j.src =
					'https://www.googletagmanager.com/gtm.js?id=' + i + dl; f.parentNode.insertBefore(j, f);
		})(window, document, 'script', 'dataLayer', 'GTM-KHMV3Z7J');</script>
	<!-- End Google Tag Manager -->

	
	
	<!-- OneTrust Cookies Consent Notice start for www.the