In [1]:
import requests
from bs4 import BeautifulSoup
import json
import markdown
import os

# Listing URL that gets paginated, plus the folder and file name
# the scraped results are written to.
base_url = 'https://www.legalbites.in/topics/articles'
output_folder = 'output'
output_file = 'webscraping_output.json'

# Create the output folder. exist_ok=True makes re-runs a no-op and avoids
# the race between a separate "exists?" check and the directory creation.
os.makedirs(output_folder, exist_ok=True)

# Function to convert HTML tables to Markdown
def html_table_to_markdown(table):
    """Render an HTML table element as a Markdown (pipe) table.

    Args:
        table: An object exposing ``find_all`` the way a BeautifulSoup
            ``<table>`` tag does; its ``tr`` rows and their ``th``/``td``
            cells are read through ``find_all`` and ``.text``.

    Returns:
        A string with one ``|cell|cell|`` line per non-empty row, each
        terminated by a newline. A ``|---|`` delimiter line follows the
        first row so Markdown renderers recognise the output as a table.
        Returns an empty string when there are no rows with cells.
    """
    lines = []
    for row in table.find_all('tr'):
        cells = [
            # Collapse whitespace/newlines and escape pipes so that cell
            # text cannot split the row or start a new column.
            " ".join(cell.text.split()).replace("|", "\\|")
            for cell in row.find_all(['th', 'td'])
        ]
        if not cells:
            # A row without th/td cells would only produce a stray "|".
            continue
        lines.append("|" + "|".join(cells) + "|")
        if len(lines) == 1:
            # Pipe tables need a delimiter row right after the header row.
            lines.append("|" + "|".join(["---"] * len(cells)) + "|")
    return "".join(line + "\n" for line in lines)

# Function to scrape a single page
def scrape_page(url, timeout=10):
    """Download one listing page and return its title and listing text.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the run forever.

    Returns:
        ``{'title': <page title or "">, 'content': <stripped text of the
        ``listing-wrapper`` div>}`` on success. ``None`` when the request
        fails or returns an HTTP error status, or when the page lacks a
        ``.main-content`` element or a ``listing-wrapper`` div; a short
        message is printed in each of those cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of parsing the server's error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Given url page is not found {url}: {str(e)}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    if not soup.select_one(".main-content"):
        print(f"No .main-content element found on {url}")
        return None

    listing = soup.find('div', {'class': 'listing-wrapper'})
    if listing is None:
        print(f"No listing-wrapper element found on {url}")
        return None

    # Remove the elements carrying these pagination class strings so they
    # are not part of the extracted text.
    for css_class in ('page-numbers next last page-numbers',
                      'page-numbers prev first page-numbers'):
        for element in soup.find_all(class_=css_class):
            element.extract()

    # Convert tables BEFORE reading the text; otherwise the Markdown
    # rendering never makes it into the returned content.
    for table in listing.find_all('table'):
        table.replace_with(html_table_to_markdown(table))

    # .string is None when <title> has nested markup; fall back to "".
    title = str(soup.title.string or "") if soup.title else ""
    content = listing.text.strip()

    return {'title': title, 'content': content}


# Walk pages 1 through 5 of the listing, echo every result between two
# asterisk rulers, and keep only the pages that yielded data.
ruler = '*' * 50
results = []
for page_num in range(1, 6):
    page_data = scrape_page(f'{base_url}/page/{page_num}')
    print(ruler, page_data, ruler, sep='\n')
    if not page_data:
        continue
    results.append(page_data)

# Serialise everything that was collected into the JSON output file.
output_path = os.path.join(output_folder, output_file)
with open(output_path, 'w', encoding='utf-8') as out:
    out.write(json.dumps(results, ensure_ascii=False))

**************************************************
{'title': 'Articles', 'content': "Benefits of Studying Legal General Knowledgeby LB Desk\xa025 Sep 2023 10:51 AM GMT Arbitration: Introduction and Key Componentsby Dibakar Banerjee\xa022 Sep 2023 5:18 AM GMT Significance of Studying Constitutional Lawby LB Desk\xa015 Sep 2023 4:32 AM GMT The Path to Becoming a Corporate Lawyer in India: Skills and Insightsby Mayank Shekhar\xa014 Sep 2023 1:04 PM GMT Concept of Eminent Domainby Gitika Wadhwani\xa013 Sep 2023 10:39 AM GMT Supercharge Your Legal Journey with Legal Bites Subscription!by LB Desk\xa010 Sep 2023 12:53 PM GMT Protecting Your Rights: Navigating Spousal Support Claimsby Mayank Shekhar\xa07 Sep 2023 6:51 AM GMT The Benefits of Hiring a Local Car Accident Lawyerby Mayank Shekhar\xa026 Aug 2023 7:11 AM GMT How To Deal With A Car Accident At The Roadway Construction Zone?by Mayank Shekhar\xa026 Aug 2023 7:11 AM GMT Importance of Studying Landmark Judgments for Law Studentsby LB Desk