<a href="https://colab.research.google.com/github/riti215/FSN_Web_Scrapping/blob/main/Ritika_Parashar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import shutil
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Removing directory if exists
def remove_directory(path):
    """Delete the directory tree rooted at ``path`` if there is one.

    Prints "Directory removed" after a successful delete, otherwise prints
    "Directory does not exist.".

    Args:
        path: Location of the directory to delete.
    """
    # isdir rather than exists: shutil.rmtree raises NotADirectoryError when
    # handed a regular file, so a stray file at ``path`` is left untouched.
    if os.path.isdir(path):
        shutil.rmtree(path)
        print("Directory removed")
    else:
        print("Directory does not exist.")

# Start from a clean slate: drop the output folders left by a previous run
for stale_dir in ('./text_data', './image_data'):
    remove_directory(stale_dir)

Directory removed
Directory removed


In [3]:
# Scrape every same-site page linked from the home page. Each page is fetched
# once; its text is saved, its images are downloaded, and its summary row is
# recorded in the same pass.
base_url = 'https://franchisesuppliernetwork.com'
SITE_PREFIX = base_url + '/'
REQUEST_HEADERS = {'User-agent': 'your bot 0.1'}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the cell
SECTION_LABELS = {
    'header': "HEADER TEXT: \n",
    'main': "MAIN TEXT: \n",
    'footer': "FOOTER TEXT: \n",
}
SUMMARY_COLUMNS = [
    'Web Page URL',
    'Number of Images',
    ' List of URLs referenced on the webpage',
]


def fetch(url):
    """Request ``url`` and return the response, or ``None`` if it is unusable.

    Request errors (including malformed URLs and timeouts) and non-200
    statuses are reported with a printed message instead of raising, so a
    single bad link does not abort the whole crawl.
    """
    try:
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as error:
        print("Skipping", url, "-", error)
        return None
    if response.status_code != 200:
        print("Skipping", url, "- Error Status: ", response.status_code)
        return None
    return response


def site_links(soup):
    """Return the distinct hrefs in ``soup`` that start with SITE_PREFIX, in document order."""
    hrefs = (a.get('href') for a in soup.find_all('a'))
    return list(dict.fromkeys(
        href for href in hrefs if href and href.startswith(SITE_PREFIX)
    ))


def page_slug(url):
    """Return the last non-empty path segment of ``url``, or its host for the root page.

    Working on the parsed path (rather than splitting the raw URL) keeps the
    result correct for URLs without a trailing slash or with a query string.
    """
    parsed = urlparse(url)
    segments = [segment for segment in parsed.path.split('/') if segment]
    return segments[-1] if segments else parsed.netloc


def page_text(url, soup):
    """Return ``url`` followed by the labelled text of each header/main/footer tag."""
    chunks = [url + '\n\n']
    for tag in soup.find_all(['header', 'main', 'footer']):
        # Collapse runs of blank lines down to a single blank line
        body = re.sub(r'\n{2,}', '\n\n', tag.get_text())
        chunks.append(SECTION_LABELS[tag.name] + body + '\n')
    return ''.join(chunks)


def download_images(page_url, img_tags, img_dir):
    """Save each distinct image referenced by ``img_tags`` into ``img_dir``.

    ``src`` values are resolved against ``page_url`` so relative paths work.
    Tags without a ``src``, non-HTTP(S) sources such as ``data:`` URIs, URLs
    with no file name, and failed downloads are skipped.
    """
    resolved = (
        urljoin(page_url, img.get('src')) for img in img_tags if img.get('src')
    )
    for img_url in dict.fromkeys(resolved):
        if urlparse(img_url).scheme not in ('http', 'https'):
            continue  # e.g. inline data: URIs cannot be requested
        img_name = img_url.split('/')[-1]
        if not img_name:
            continue  # URL ends in '/', so there is no file name to save under
        img_response = fetch(img_url)
        if img_response is None:
            continue
        os.makedirs(img_dir, exist_ok=True)
        with open(img_dir + '/' + img_name, 'wb') as img_file:
            img_file.write(img_response.content)


# Sending a request to the base URL
base_response = requests.get(base_url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)

# Creating Directory if does not exists
os.makedirs('text_data', exist_ok=True)

# Defined before the status check so the summary below still works (as an
# empty report) when the home page cannot be fetched.
unique_urls = []
summary_report = []

# Check if base URL response is successful
if base_response.status_code == 200:
    unique_urls = site_links(BeautifulSoup(base_response.content, 'html.parser'))

    for unique_url in unique_urls:
        sub_response = fetch(unique_url)
        if sub_response is None:
            continue  # unreachable pages get no text file, images or summary row
        sub_soup = BeautifulSoup(sub_response.content, 'html.parser')

        # End of URL text is used for naming the text file and image folder
        end_of_url = page_slug(unique_url)

        # Append rather than overwrite: two URLs can share the same final
        # segment, and appending keeps both pages' text in the file.
        file_path = 'text_data/' + end_of_url + '.txt'
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(page_text(unique_url, sub_soup))

        img_tags = sub_soup.find_all('img')
        download_images(unique_url, img_tags, 'image_data/{}'.format(end_of_url))

        # The image count is the number of <img> tags, not distinct sources
        summary_report.append({
            'Web Page URL': unique_url,
            'Number of Images': len(img_tags),
            ' List of URLs referenced on the webpage': site_links(sub_soup),
        })

    print("Text files saved")
    print("Images download complete")
    print("Please check the directory")
else:
    print("Error Status: ", base_response.status_code)

# Creating DataFrame from summary report; explicit columns keep the table
# well-formed even when no page could be scraped.
print("Summary Overview: ")
summary_df = pd.DataFrame(summary_report, columns=SUMMARY_COLUMNS)
summary_df.head()

Text files saved
Images download complete
Please check the directory
Summary Overview: 


Unnamed: 0,Web Page URL,Number of Images,List of URLs referenced on the webpage
0,https://franchisesuppliernetwork.com/,114,"[https://franchisesuppliernetwork.com/, https:..."
1,https://franchisesuppliernetwork.com/assessment/,2,"[https://franchisesuppliernetwork.com/, https:..."
2,https://franchisesuppliernetwork.com/fsn-suppl...,14,"[https://franchisesuppliernetwork.com/, https:..."
3,https://franchisesuppliernetwork.com/fsn-suppl...,2,"[https://franchisesuppliernetwork.com/, https:..."
4,https://franchisesuppliernetwork.com/fsn-suppl...,2,"[https://franchisesuppliernetwork.com/, https:..."


In [4]:
# Write the summary table to disk as CSV, leaving out the row index
summary_file_path = 'summary.csv'
summary_df.to_csv(path_or_buf=summary_file_path, index=False)
print("Summary Report saved")

Summary Report saved
