In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
# Spreadsheet that lists the articles to scrape (one URL_ID / URL pair per row).
input_file_path = r'C:\Users\abhis\Desktop\projects task\Input.xlsx'

# Read the sheet into a DataFrame for the cells that follow.
urls_df = pd.read_excel(io=input_file_path)

In [15]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a rich table instead of the plain-text dump print() gives.
urls_df.head()

       URL_ID                                                URL
0  bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...
1  bctech2012  https://insights.blackcoffer.com/streamlined-i...
2  bctech2013  https://insights.blackcoffer.com/efficient-dat...
3  bctech2014  https://insights.blackcoffer.com/effective-man...
4  bctech2015  https://insights.blackcoffer.com/streamlined-t...


In [25]:
# Show the column labels of the loaded sheet; later cells read 'URL_ID' and 'URL' by name.
urls_df.columns

Index(['URL_ID', 'URL'], dtype='object')

In [28]:
def extract_article(url, timeout=30):
    """
    Extract the title and body of an article from a given URL.

    The title is read from the first <title> element, falling back to the
    first <h1>. The body is read from the first <article> element, falling
    back to the first <div> with class "content". Text fragments coming from
    nested tags are joined with a single space so that words from adjacent
    elements (for example consecutive paragraphs) do not run together.

    Args:
        url (str): The URL of the article.
        timeout (float or tuple, optional): Seconds to wait for the server
            before giving up; passed unchanged to ``requests.get``.
            Defaults to 30.

    Returns:
        tuple: ``(title, body)`` on success. When no matching element is
        present, the corresponding item is the placeholder string
        'No Title Found' or 'No Content Found'. When the request fails or
        any other error occurs, the error is printed (not raised) and
        ``(None, None)`` is returned.
    """
    try:
        # Fetch the page content. Without a timeout a stalled server would
        # block the calling loop forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the article title
        title_tag = soup.find('title') or soup.find('h1')
        if title_tag:
            title = title_tag.get_text(separator=' ', strip=True)
        else:
            title = 'No Title Found'

        # Extract the article body; the separator keeps text from sibling
        # tags apart instead of gluing the last word of one to the first
        # word of the next.
        article_body = soup.find('article') or soup.find('div', {'class': 'content'})
        if article_body:
            body = article_body.get_text(separator=' ', strip=True)
        else:
            body = 'No Content Found'

        return title, body
    except requests.RequestException as e:
        # Network-level problems: DNS failure, timeout, HTTP error status, ...
        print(f"Request error for URL {url}: {e}")
    except Exception as e:
        # Anything else (e.g. a parsing problem) is reported the same way so
        # that one bad page does not abort a batch run.
        print(f"Error processing URL {url}: {e}")
    return None, None

In [29]:
# Folder that receives one text file per scraped article.
output_directory = r'C:\Users\abhis\Desktop\projects task\extracted_articles'

# Create the folder (and any missing parents); an existing folder is not an error.
os.makedirs(name=output_directory, exist_ok=True)

In [30]:
# Count the rows that carry both a URL and a URL_ID; only those can be processed.
valid_rows_count = len(urls_df.dropna(subset=['URL', 'URL_ID']))
print("Number of valid rows to process:", valid_rows_count)

Number of valid rows to process: 147


In [31]:
# Fetch every listed article and store it as <URL_ID>.txt in the output directory.
for index, row in urls_df.iterrows():
    url, url_id = row.get('URL'), row.get('URL_ID')

    # Both fields are required: the URL to fetch and the ID that names the file.
    if not (pd.notna(url) and pd.notna(url_id)):
        print("Skipping row {} due to missing URL or ID.".format(index))
        continue

    print("Processing URL_ID: {}".format(url_id))
    title, body = extract_article(url)

    # extract_article returns (None, None) on failure; empty text is skipped too.
    if not title or not body:
        continue

    file_path = os.path.join(output_directory, "{}.txt".format(url_id))
    print("Saving to file: {}".format(file_path))
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            # Header line, a blank line, then the article text.
            file.writelines(["Title: {}\n\n".format(title), body])

        print("Saved article {} to {}".format(url_id, file_path))
    except IOError as e:
        print("Error saving file {}: {}".format(file_path, e))

Processing URL_ID: bctech2011
Saving to file: C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2011.txt
Saved article bctech2011 to C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2011.txt
Processing URL_ID: bctech2012
Saving to file: C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2012.txt
Saved article bctech2012 to C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2012.txt
Processing URL_ID: bctech2013
Saving to file: C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2013.txt
Saved article bctech2013 to C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2013.txt
Processing URL_ID: bctech2014
Saving to file: C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2014.txt
Saved article bctech2014 to C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2014.txt
Processing URL_ID: bctech2015
Saving to file: C:\Users\abhis\Desktop\projects task\extracted_articles\bctech2015.txt
Saved article bctec