In [1]:
import csv
import time
from urllib.parse import urlparse, urljoin
import requests

# Input CSV listing the URLs to crawl. Raw string so the Windows backslashes
# are never interpreted as escape sequences (`\D` and `\i` are invalid escapes
# that newer Python versions warn about).
file_location = r'D:\Downloads\input-pages-blog-majestic-com.csv'
# Directory that receives the downloaded pages and the index file
output_dir = r"C:\Users\alpra\Documents\blog-down"
# CSV index describing every page that was processed
index_csv_location = '{}/index-file.csv'.format(output_dir)

# Read the input CSV and collect the distinct page URLs worth fetching
with open(file_location, encoding='utf-8-sig') as source_csv:
    # Paths ending in one of these are feeds or images, not pages
    skipped_suffixes = ('/feed/', '.png', '.jpg', '.gif')

    # Candidate pages to download later; a set removes duplicates
    prospects = set()
    for record in csv.DictReader(source_csv):
        raw_url = record['URL']
        page_path = urlparse(raw_url).path

        if page_path.endswith(skipped_suffixes):
            continue

        # Re-join the URL with its own path component; for a non-empty path
        # this drops any query string or fragment
        prospects.add(urljoin(raw_url, page_path))

    print('There are {} prospective pages'.format(len(prospects)))

# Seconds to wait for a server before giving up; without a timeout `requests`
# can block forever on a stalled connection
request_timeout = 30

# Open index file to write output
with open(index_csv_location, mode='w', newline='', encoding='utf-8-sig') as index_csv:
    fieldnames = ['id', 'url', 'filename', 'status', 'location']
    writer = csv.DictWriter(index_csv, fieldnames=fieldnames)
    writer.writeheader()

    # Number of HTML pages recorded in the index so far
    page_id = 0

    for url in prospects:
        print('Processing {}'.format(url))

        try:
            # Fetch the headers first to check the content type
            response = requests.head(url, timeout=request_timeout)
            # Content type can contain encoding information after a semi-colon (`;`), which we're not interested in.
            # The header may be missing altogether, hence the empty-string default.
            content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

            # Media types are case-insensitive
            if content_type.lower() == 'text/html':
                cache_filename = ''
                location = ''
                if response.is_redirect:
                    # Record where the redirect points instead of downloading it
                    location = response.headers.get('Location')
                elif response.status_code == 200:
                    response = requests.get(url, timeout=request_timeout)

                    cache_filename = '{}/page-{}.html'.format(output_dir, page_id + 1)
                    with open(cache_filename, mode='w', encoding='utf-8') as cache:
                        cache.write(response.text)

                # Only consume an id once the page was handled successfully,
                # so a failed download leaves no gap in the numbering
                page_id += 1
                writer.writerow({
                    'id': page_id,
                    'url': url,
                    'filename': cache_filename,
                    'status': response.status_code,
                    'location': location
                })
            else:
                print('Ignoring non-HTML content type "{}"'.format(content_type))
        except requests.RequestException as error:
            # One unreachable URL must not abort the whole crawl
            print('Failed to fetch {}: {}'.format(url, error))

        # Pause the execution of the script to prevent an aggressive spam of requests
        time.sleep(1)

There are 32 prospective pages
Processing https://blog.majestic.com/general/track-competitors-subdomain-root-domain-home-page/
Processing https://blog.majestic.com/training/a-guide-to-the-majestic-site-explorer/embed/
Processing https://blog.majestic.com/general/majestic-seo-fo-facebook/
Processing https://blog.majestic.com/pl/sieci-spolecznosciowe/wywiad-bartosz-szlachetka-moja-kariera-w-seo/
Processing https://blog.majestic.com/training/how-to-optimise-your-site-for-yandex-part-2/
Processing https://blog.majestic.com/majestic-seo-partners/
Processing https://blog.majestic.com/development/compare-backlinks/
Processing https://blog.majestic.com/pl/uncategorized-pl/wskazniki-trust-i-citation-flow/
Processing https://blog.majestic.com/general/3d-printing-in-space/
Processing https://blog.majestic.com/general/site-explorer-tutorial-2/
Processing https://blog.majestic.com/wp-content/uploads/2013/10/randomurlgenerator.xlsx
Ignoring non-HTML content type "application/vnd.openxmlformats-offic

In [7]:
import requests
from bs4 import BeautifulSoup
from os import path
import time
import csv
import urllib
from urllib.parse import urlparse, urljoin
from urllib.request import urlopen



# Directory that receives the downloaded pages and the index file
output_dir = r"C:\Users\alpra\Documents\blog-down"
# CSV index describing every page that was processed
index_csv_location = '{}/index-file.csv'.format(output_dir)

# Download the front page; the timeout keeps a stalled connection from
# hanging the cell indefinitely
response = requests.get('https://thehackernews.com/', timeout=30)

soup = BeautifulSoup(response.text, 'html.parser')

# `soup.title` is None when the document has no <title> element
page_title = soup.title.string if soup.title is not None else None
print('title: {}'.format(page_title))

print('links:')

# Seconds to wait for a server before giving up; without a timeout `requests`
# can block forever on a stalled connection
request_timeout = 30

with open(index_csv_location, mode='w', newline='', encoding='utf-8-sig') as index_csv:
    fieldnames = ['id', 'url', 'filename', 'status', 'location']
    writer = csv.DictWriter(index_csv, fieldnames=fieldnames)
    writer.writeheader()

    # Number of HTML pages recorded in the index so far
    page_id = 0
    for link in soup.find_all('a', class_="story-link"):
        url = link.get('href')
        if not url:
            # Anchor without an href: nothing to download
            continue
        print('Processing {}'.format(url))

        # NOTE: the page used to be downloaded twice more via `urlopen` here;
        # the results were never used and the connections never closed, so
        # those requests were removed.
        try:
            # Fetch the headers first to check the content type
            response = requests.head(url, timeout=request_timeout)
            # Content type can contain encoding information after a semi-colon (`;`), which we're not interested in.
            # The header may be missing altogether, hence the empty-string default.
            content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

            # Media types are case-insensitive
            if content_type.lower() == 'text/html':
                cache_filename = ''
                location = ''
                if response.is_redirect:
                    # Record where the redirect points instead of downloading it
                    location = response.headers.get('Location')
                elif response.status_code == 200:
                    response = requests.get(url, timeout=request_timeout)

                    cache_filename = '{}/page-{}.html'.format(output_dir, page_id + 1)
                    with open(cache_filename, mode='w', encoding='utf-8') as cache:
                        cache.write(response.text)

                # Only consume an id once the page was handled successfully,
                # so a failed download leaves no gap in the numbering
                page_id += 1
                writer.writerow({
                    'id': page_id,
                    'url': url,
                    'filename': cache_filename,
                    'status': response.status_code,
                    'location': location
                })
            else:
                print('Ignoring non-HTML content type "{}"'.format(content_type))
        except requests.RequestException as error:
            # One unreachable or malformed link must not abort the whole run
            print('Failed to fetch {}: {}'.format(url, error))


title: The Hacker News - Cybersecurity News and Analysis
links:
Processing https://thehackernews.com/2021/06/hackers-breached-colonial-pipeline.html
Processing https://thehackernews.com/2021/06/latvian-woman-charged-for-her-role-in.html
Processing https://thehackernews.com/2021/06/github-updates-policy-to-remove-exploit.html
Processing https://thehackernews.com/2021/06/break-into-ethical-hacking-with-18.html
Processing https://thehackernews.com/2021/06/tiktok-quietly-updated-its-privacy.html
Processing https://thehackernews.com/2021/06/alert-critical-rce-bug-in-vmware.html
Processing https://thehackernews.com/2021/06/google-to-let-android-users-opt-out-to.html
Processing https://thehackernews.com/2021/06/10-critical-flaws-found-in-codesys.html


In [1]:
import requests
from bs4 import BeautifulSoup
import time
import urllib
from urllib.parse import urlparse, urljoin
from urllib.request import urlopen



# Directory that receives the downloaded pages
output_dir = r"C:\Users\alpra\Documents\blog-down"

# Download the front page; the timeout keeps a stalled connection from
# hanging the cell indefinitely
response = requests.get('https://thehackernews.com/', timeout=30)

soup = BeautifulSoup(response.text, 'html.parser')

# `soup.title` is None when the document has no <title> element
page_title = soup.title.string if soup.title is not None else None
print('title: {}'.format(page_title))

print('links:')

# Seconds to wait for a server before giving up; without a timeout `requests`
# can block forever on a stalled connection
request_timeout = 30

# Number of HTML pages saved so far; also used to name the cache files
page_id = 0
for link in soup.find_all('a', class_="story-link"):
    url = link.get('href')
    if not url:
        # Anchor without an href: nothing to download
        continue
    print('Processing {}'.format(url))

    # NOTE: the page used to be downloaded twice more via `urlopen` here; the
    # results were never used and the connections never closed, so those
    # requests were removed.
    try:
        # Fetch the headers first to check the content type
        response = requests.head(url, timeout=request_timeout)
        # Content type can contain encoding information after a semi-colon (`;`), which we're not interested in.
        # The header may be missing altogether, hence the empty-string default.
        content_type = response.headers.get('Content-Type', '').split(';')[0].strip()

        # Media types are case-insensitive; anything but HTML is skipped
        if content_type.lower() != 'text/html':
            continue

        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as error:
        # One unreachable or malformed link must not abort the whole run
        print('Failed to fetch {}: {}'.format(url, error))
        continue

    # Only consume an id once the download succeeded
    page_id += 1
    cache_filename = '{}/page-{}.html'.format(output_dir, page_id)
    with open(cache_filename, mode='w', encoding='utf-8') as cache:
        cache.write(response.text)

title: The Hacker News - Cybersecurity News and Analysis
links:
Processing https://thehackernews.com/2021/06/shifting-focus-from-reactive-to.html
Processing https://thehackernews.com/2021/06/researchers-discover-first-known.html
Processing https://thehackernews.com/2021/06/hackers-breached-colonial-pipeline.html
Processing https://thehackernews.com/2021/06/latvian-woman-charged-for-her-role-in.html
Processing https://thehackernews.com/2021/06/github-updates-policy-to-remove-exploit.html
Processing https://thehackernews.com/2021/06/break-into-ethical-hacking-with-18.html
Processing https://thehackernews.com/2021/06/tiktok-quietly-updated-its-privacy.html
Processing https://thehackernews.com/2021/06/alert-critical-rce-bug-in-vmware.html
