In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_data(url, timeout=10):
    """
    Scrapes links, titles, and descriptions from a webpage.

    Only anchors whose ``href`` starts with ``http`` are kept, so relative
    links are skipped. The three returned lists are index-aligned: entry
    ``i`` of each list comes from the same anchor. The HTTP status code is
    not checked, so an error page is parsed like any other response.

    Args:
        url (str): The URL of the webpage to scrape.
        timeout (float or tuple): Seconds to wait for the server, forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        tuple: A tuple containing lists of links, titles, and descriptions.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get``, e.g. when the request times out.
    """
    # Without an explicit timeout requests can wait forever on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    scraped_links = []
    scraped_titles = []
    scraped_descriptions = []

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or not href.startswith('http'):
            continue

        scraped_links.append(href)
        scraped_titles.append(link.get_text(strip=True))

        # The description is the next sibling text node after the anchor;
        # a missing or whitespace-only node is recorded as "" so the three
        # lists stay the same length.
        description = link.find_next_sibling(string=True)
        scraped_descriptions.append(description.strip() if description else "")

    return scraped_links, scraped_titles, scraped_descriptions

# Example usage: scrape the Dawn and BBC front pages
dawn_links, dawn_titles, dawn_descriptions = scrape_data('https://www.dawn.com/')
bbc_links, bbc_titles, bbc_descriptions = scrape_data('https://www.bbc.com/')

# Print the extracted data: each heading is followed by the list it labels
for heading, values in (
    ("Scraped Dawn Links:", dawn_links),
    ("\nScraped Dawn Titles:", dawn_titles),
    ("\nScraped Dawn Descriptions:", dawn_descriptions),
    ("\nScraped BBC Links:", bbc_links),
    ("\nScraped BBC Titles:", bbc_titles),
    ("\nScraped BBC Descriptions:", bbc_descriptions),
):
    print(heading)
    print(values)


In [None]:
import re

def clean_and_lowercase(text_data):
    """
    Strips everything except ASCII letters and whitespace from each string, then lowercases it.

    Args:
        text_data (list): A list of strings containing the data to clean and lowercase.

    Returns:
        list: A new list with one cleaned, lowercased string per input string, in the same order.
    """
    # Matches any single character that is neither a-z/A-Z nor whitespace
    unwanted_characters = re.compile(r'[^a-zA-Z\s]')
    # Substitute first, lowercase second, so non-ASCII characters are
    # dropped before lowercasing can map them onto ASCII letters.
    return [unwanted_characters.sub('', entry).lower() for entry in text_data]

# Preprocess the extracted data
dawn_titles_cleaned = clean_and_lowercase(dawn_titles)
dawn_descriptions_cleaned = clean_and_lowercase(dawn_descriptions)
bbc_titles_cleaned = clean_and_lowercase(bbc_titles)
bbc_descriptions_cleaned = clean_and_lowercase(bbc_descriptions)

# Print the preprocessed data
print("\nCleaned Dawn Titles:")
print(dawn_titles_cleaned)
print("\nCleaned Dawn Descriptions:")
print(dawn_descriptions_cleaned)

print("\nCleaned BBC Titles:")
print(bbc_titles_cleaned)
# The BBC descriptions are cleaned above like the other three lists, so they
# are printed as well instead of being computed and silently dropped.
print("\nCleaned BBC Descriptions:")
print(bbc_descriptions_cleaned)


In [None]:
import csv

# Preprocess titles and descriptions
def preprocess_and_collect_data_titles_and_descriptions(titles, descriptions):
    """
    Runs preprocess_text over paired titles and descriptions.

    Titles and descriptions are walked in lockstep with zip, so processing
    stops at the end of the shorter list.

    Args:
        titles (list): Title strings.
        descriptions (list): Description strings, paired with titles by position.

    Returns:
        tuple: Two lists (titles, descriptions); every element is a
        (stemmed, lemmatized) tuple as returned by preprocess_text.
    """
    def stemmed_and_lemmatized(text):
        # Unpack and repack so each stored element is always a 2-tuple
        stemmed, lemmatized = preprocess_text(text)
        return (stemmed, lemmatized)

    # Each pair is processed title first, then description
    processed_pairs = [
        (stemmed_and_lemmatized(raw_title), stemmed_and_lemmatized(raw_description))
        for raw_title, raw_description in zip(titles, descriptions)
    ]
    title_results = [pair[0] for pair in processed_pairs]
    description_results = [pair[1] for pair in processed_pairs]
    return title_results, description_results

# NOTE: preprocess_text is defined further down in this file, so it must
# already exist in the session when these two calls run.
(
    preprocessed_dawn_titles,
    preprocessed_dawn_descriptions,
) = preprocess_and_collect_data_titles_and_descriptions(dawn_titles, dawn_descriptions)
(
    preprocessed_bbc_titles,
    preprocessed_bbc_descriptions,
) = preprocess_and_collect_data_titles_and_descriptions(bbc_titles, bbc_descriptions)

# Write the preprocessed data into CSV
def write_preprocessed_data_to_csv(preprocessed_titles, preprocessed_descriptions, website_name,
                                   filename='dataExtracted.csv'):
    """
    Appends one website's preprocessed titles and descriptions to a CSV file.

    Each row has five columns: website, stemmed title, lemmatized title,
    stemmed description, lemmatized description. Token lists are joined
    with single spaces so the cells hold plain text rather than the Python
    repr of a tuple of lists. A header row is written only when the file is
    new or empty.

    Args:
        preprocessed_titles (list): (stemmed_tokens, lemmatized_tokens) pairs, one per title.
        preprocessed_descriptions (list): (stemmed_tokens, lemmatized_tokens) pairs,
            matched to the titles by position.
        website_name (str): Value written to the first column of every row.
        filename (str): Path of the CSV file to append to. Defaults to 'dataExtracted.csv'.
    """
    header = ['Website', 'Title', 'Lemmatized Title', 'Description', 'Lemmatized Description']
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Append mode starts at the end of the file, so offset 0 means the
        # file has no content yet and still needs its header.
        if csvfile.tell() == 0:
            writer.writerow(header)
        for title, description in zip(preprocessed_titles, preprocessed_descriptions):
            writer.writerow([
                website_name,
                ' '.join(title[0]),        # stemmed title tokens
                ' '.join(title[1]),        # lemmatized title tokens
                ' '.join(description[0]),  # stemmed description tokens
                ' '.join(description[1]),  # lemmatized description tokens
            ])

# Append the Dawn rows first, then the BBC rows
for site_name, site_titles, site_descriptions in (
    ('Dawn', preprocessed_dawn_titles, preprocessed_dawn_descriptions),
    ('BBC', preprocessed_bbc_titles, preprocessed_bbc_descriptions),
):
    write_preprocessed_data_to_csv(site_titles, site_descriptions, site_name)

print("Data has been saved to 'dataExtracted.csv'.")


In [None]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Function to scrape links, titles, and descriptions from a webpage
def scrape_webpage_data(url, timeout=10):
    """
    Scrapes links, titles, and descriptions from a webpage.

    Only anchors whose ``href`` starts with ``http`` are collected. The
    returned lists are index-aligned, one entry per collected anchor. The
    HTTP status code is not checked before parsing.

    Args:
        url (str): The URL of the webpage to scrape.
        timeout (float or tuple): Seconds to wait for the server, forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        tuple: (links, titles, descriptions) lists.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get``, e.g. when the request times out.
    """
    # A request without a timeout can block forever on a stalled server
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    titles = []
    descriptions = []

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or not href.startswith('http'):
            continue

        links.append(href)
        titles.append(link.get_text(strip=True))

        # Use the next sibling text node as the description; fall back to
        # "" when it is missing or blank so the lists stay aligned.
        description = link.find_next_sibling(string=True)
        descriptions.append(description.strip() if description else "")

    return links, titles, descriptions

# Function to preprocess text
def preprocess_text(text):
    """
    Normalizes a string and returns its stemmed and lemmatized tokens.

    Steps, in order: strip tag-like ``<...>`` spans, delete every character
    in ``string.punctuation``, lowercase, tokenize with ``word_tokenize``,
    drop English stopwords, then stem (Porter) and lemmatize (WordNet) the
    remaining tokens.

    Args:
        text (str): The raw text to preprocess.

    Returns:
        tuple: (stemmed_tokens, lemmatized_tokens), two lists built from the
        same filtered token sequence.
    """
    without_tags = re.sub('<[^<]+?>', '', text)
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = without_tags.translate(punctuation_table).lower()

    words = word_tokenize(normalized)
    english_stopwords = set(stopwords.words('english'))
    kept_words = [word for word in words if word not in english_stopwords]

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stems = [stemmer.stem(word) for word in kept_words]
    lemmas = [lemmatizer.lemmatize(word) for word in kept_words]
    return stems, lemmas

# Example usage: scrape both front pages
dawn_links, dawn_titles, dawn_descriptions = scrape_webpage_data('https://www.dawn.com/')
bbc_links, bbc_titles, bbc_descriptions = scrape_webpage_data('https://www.bbc.com/')

# Preprocess titles and descriptions.
# Each title/description pair is run through preprocess_text (title first),
# then split into two parallel lists of (stemmed, lemmatized) tuples.
dawn_processed_pairs = [
    (preprocess_text(raw_title), preprocess_text(raw_description))
    for raw_title, raw_description in zip(dawn_titles, dawn_descriptions)
]
preprocessed_dawn_titles = [(stems, lemmas) for (stems, lemmas), _ in dawn_processed_pairs]
preprocessed_dawn_descriptions = [(stems, lemmas) for _, (stems, lemmas) in dawn_processed_pairs]

bbc_processed_pairs = [
    (preprocess_text(raw_title), preprocess_text(raw_description))
    for raw_title, raw_description in zip(bbc_titles, bbc_descriptions)
]
preprocessed_bbc_titles = [(stems, lemmas) for (stems, lemmas), _ in bbc_processed_pairs]
preprocessed_bbc_descriptions = [(stems, lemmas) for _, (stems, lemmas) in bbc_processed_pairs]

# Write the preprocessed data into CSV (the file is overwritten on every run)
with open('dataExtracted.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Website', 'Title', 'Lemmatized Title', 'Description', 'Lemmatized Description']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Dawn rows are written first, then BBC rows; both use the same columns
    for website, site_titles, site_descriptions in (
        ('Dawn', preprocessed_dawn_titles, preprocessed_dawn_descriptions),
        ('BBC', preprocessed_bbc_titles, preprocessed_bbc_descriptions),
    ):
        for title, description in zip(site_titles, site_descriptions):
            # Index 0 holds the stemmed tokens, index 1 the lemmatized tokens
            row_values = (
                website,
                ' '.join(title[0]),
                ' '.join(title[1]),
                ' '.join(description[0]),
                ' '.join(description[1]),
            )
            writer.writerow(dict(zip(fieldnames, row_values)))

print("Data has been saved to 'dataExtracted.csv'.")
