In [1]:
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException

In [5]:
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO)  # Set the logging level to INFO
logger = logging.getLogger(__name__)  # Create a logger instance

# Function to save content to a text file
def save_content(content, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

def fetch_articles(url):
    options = Options()
    options.add_argument('--disable-gpu')
    options.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"  # Provide path to Firefox binary
    service = Service(executable_path=r"C:\Users\ABEL\Downloads\geckodriver.exe")  # Change path to geckodriver

    driver = webdriver.Firefox(service=service, options=options)
    driver.get(url)
    logger.info(f"Loading main page: {url}")

    try:
        articles = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//h3/a'))
        )
        urls = [article.get_attribute('href') for article in articles]
        logger.info(f"Found {len(urls)} articles on the main page.")
    finally:
        driver.quit()

    for href in urls:
        try:
            driver = webdriver.Firefox(service=service, options=options)
            driver.get(href)
            logger.info(f"Navigating to article page: {href}")

            # Wait for the content to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.post--content'))
            )
            content = driver.execute_script("return document.querySelector('div.post--content').innerText;")
            if content:
                logger.info(f"Content fetched: {content[:100]}...")
                
                # Save content to a file
                filename = f"{os.path.basename(href)}.txt"
                save_content(content, filename)
                logger.info(f"Content saved to {filename}")
            else:
                logger.error("No content found.")
        except WebDriverException as e:
            logger.error(f"Error while fetching content from {href}: {e}")
        finally:
            driver.quit()

if __name__ == "__main__":
    url = 'https://www.herald.co.zw/category/articles/sport/'
    fetch_articles(url)


INFO:__main__:Loading main page: https://www.herald.co.zw/category/articles/sport/
INFO:__main__:Found 22 articles on the main page.
INFO:__main__:Navigating to article page: https://www.herald.co.zw/zra-hands-over-classroom-block-clinic-2/
INFO:__main__:Content fetched: Walter Nyamukondiwa in HURUNGWE

The Zambezi River Authority (ZRA) has commissioned a fully furnishe...
INFO:__main__:Content saved to .txt
INFO:__main__:Navigating to article page: https://www.herald.co.zw/bindura-hospital-loses-pump-and-pipes-to-thieves/
INFO:__main__:Content fetched: Fungai Lupande

Mashonaland Central Bureau

A procurement officer and an administrator at Bindura Ho...
INFO:__main__:Content saved to .txt
INFO:__main__:Navigating to article page: https://www.herald.co.zw/warriors-to-play-at-orlando/
INFO:__main__:Content fetched: Sports Reporter

ZIMBABWE will play their next World Cup qualifier home game against Lesotho at Orla...
INFO:__main__:Content saved to .txt
INFO:__main__:Navigating to artic

In [6]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def fetch_articles(url):
    options = Options()
    options.add_argument('--disable-gpu')
    options.binary_location = "C:\\Program Files\\Mozilla Firefox\\firefox.exe"
    service = Service(executable_path=r'C:\Users\ABEL\Downloads\geckodriver.exe')
    
    # Initialize the Chrome driver
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    logger.info(f"Loading main page: {url}")

    # Prepare to save articles
    with open('sport_articles.txt', 'w', encoding='utf-8') as file:
        try:
            articles = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//h3/a'))
            )
            urls = [article.get_attribute('href') for article in articles]
            titles = [article.text for article in articles]
            logger.info(f"Found {len(urls)} articles on the main page.")

            for href, title in zip(urls, titles):
                try:
                    driver.get(href)
                    logger.info(f"Navigating to article page: {href}")

                    # Wait for the content to be loaded
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.post--content'))
                    )
                    content = driver.execute_script("return document.querySelector('div.post--content').innerText;")
                    if content:
                        # Write title, URL, and content to file
                        file.write(f"Title: {title}\nURL: {href}\nContent:\n{content}\n\n")
                        logger.info(f"Content fetched and saved: {title[:100]}...")
                    else:
                        logger.error("No content found.")
                except WebDriverException as e:
                    logger.error(f"Error while fetching content from {href}: {e}")
        finally:
            driver.quit()

if __name__ == "__main__":
    url = 'https://www.herald.co.zw/category/articles/sport/'
    fetch_articles(url)

INFO:__main__:Loading main page: https://www.herald.co.zw/category/articles/sport/
INFO:__main__:Found 22 articles on the main page.
INFO:__main__:Navigating to article page: https://www.herald.co.zw/zra-hands-over-classroom-block-clinic-2/
INFO:__main__:Content fetched and saved: ZRA hands over classroom block, clinic...
INFO:__main__:Navigating to article page: https://www.herald.co.zw/bindura-hospital-loses-pump-and-pipes-to-thieves/
INFO:__main__:Content fetched and saved: Bindura Hospital loses pump and pipes to thieves...
INFO:__main__:Navigating to article page: https://www.herald.co.zw/warriors-to-play-at-orlando/
INFO:__main__:Content fetched and saved: Warriors to play at Orlando...
INFO:__main__:Navigating to article page: https://www.herald.co.zw/zra-hands-over-classroom-block-clinic/
INFO:__main__:Content fetched and saved: ZRA hands over classroom block, clinic...
INFO:__main__:Navigating to article page: https://www.herald.co.zw/workers-efforts-hailed/
INFO:__main__:Cont

In [10]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def read_articles(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        articles = file.read().split('\n\n')
    contents = []
    for article in articles:
        # Extract content part after "Content:" label
        match = re.search(r"Content:\n(.*)", article, re.S)
        if match:
            contents.append(match.group(1).strip())
    return contents

def preprocess(contents):
    # Lowercasing and tokenization (you might want to refine this with more preprocessing)
    return [re.sub(r'\W+', ' ', content.lower()).split() for content in contents]

def perform_clustering(contents, n_clusters=5):
    # Vectorization
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(content) for content in contents])

    # Clustering
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(tfidf_matrix)

    # Cluster labels
    cluster_labels = kmeans.labels_
    return cluster_labels, tfidf_matrix

def main():
    filename = 'news_articles.txt'
    contents = read_articles(filename)
    processed_contents = preprocess(contents)
    cluster_labels, tfidf_matrix = perform_clustering(processed_contents)

    # Calculate the total number of articles and the number of articles per cluster
    total_articles = len(contents)
    print(f"Total number of articles: {total_articles}")

    cluster_counts = {i: list(cluster_labels).count(i) for i in range(max(cluster_labels) + 1)}
    for cluster_id, count in cluster_counts.items():
        print(f"Cluster {cluster_id} contains {count} articles")

    # Optional: Evaluate the clustering
    if len(contents) > 1:  # Silhouette score requires at least 2 points
        silhouette_avg = silhouette_score(tfidf_matrix, cluster_labels)
        print("Silhouette Score:", silhouette_avg)

if __name__ == "__main__":
    main()

Total number of articles: 22
Cluster 0 contains 1 articles
Cluster 1 contains 1 articles
Cluster 2 contains 9 articles
Cluster 3 contains 9 articles
Cluster 4 contains 2 articles
Silhouette Score: 0.15022703507231233


In [2]:
import pandas as pd

# For a tab-separated file
df = pd.read_csv('news_articles.txt', delimiter='\t', error_bad_lines=False)


TypeError: read_csv() got an unexpected keyword argument 'error_bad_lines'

In [34]:
df

Unnamed: 0,Text
0,Title: Workers’ efforts hailed
1,URL: https://www.herald.co.zw/workers-efforts-...
2,Content:
3,Wallace Ruzvidzo Herald Reporter
4,President Mnangagwa has hailed the country’s w...
...,...
501,Local restaurants and others will buy fresh fi...
502,For thousands of years fishers have dried thei...
503,These techniques are not difficult to implemen...
504,We would like to think that as commercial fish...


In [35]:
# Extract title and URL from the Text column
df[['Title', 'URL', 'Content']] = df['Text'].str.extract(r'Title: (.*)URL: (.*)Content:(.*)')
df.drop(columns=['Text'], inplace=True)
import numpy as np

# Preprocessing
df['Title'] = df['Title'].apply(lambda x: re.sub(r'\W+', ' ', str(x).lower()).strip())
df['URL'] = df['URL'].apply(lambda x: re.sub(r'\W+', ' ', str(x).lower()).strip())

# Save the preprocessed DataFrame to CSV
df.to_csv('preprocessed_articles.csv', index=False)

In [17]:
df



Unnamed: 0,Title,URL,Content
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
501,,,
502,,,
503,,,
504,,,


In [None]:
import pandas as pd

# Read the text file and create a DataFrame
df = pd.read_csv('news_articles.txt', sep='\n', delimiter='\n', header=None, names=['Text'])

# Extract title and URL from the Text column
df[['Title', 'URL', 'Content']] = df['Text'].str.extract(r'Title: (.*)URL: (.*)Content:(.*)')
df.drop(columns=['Text'], inplace=True)

# Preprocessing
df['Title'] = df['Title'].apply(lambda x: re.sub(r'\W+', ' ', x.lower()).strip())
df['URL'] = df['URL'].apply(lambda x: re.sub(r'\W+', ' ', x.lower()).strip())

# Save the preprocessed DataFrame to CSV
df.to_csv('preprocessed_articles.csv', index=False)

# Perform clustering based on Title and URL
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Title'] + ' ' + df['URL'])

# Clustering
kmeans = KMeans(n_clusters=5)  # Specify the number of clusters
kmeans.fit(tfidf_matrix)

# Add cluster labels to DataFrame
df['Cluster'] = kmeans.labels_

# Save the clustered DataFrame to CSV
df.to_csv('clustered_articles.csv', index=False)

# Output the number of articles per cluster
cluster_counts = df['Cluster'].value_counts()
print("Number of articles per cluster:")
print(cluster_counts)
