## News Sentiment Pipeline

In [1]:
# Data & numerical
import pandas as pd
import numpy as np
from datetime import datetime  # provides date and time manipulation functions

# Core / stdlib
# (The stray `os.listdir()` call that used to sit here was removed: its result
# was discarded, so it had no effect in the import cell.)
import os
import time

# Networking / HTTP
import requests   # fetch pages when Selenium fails

# HTML parsing & article extraction
from bs4 import BeautifulSoup  # parse page DOM
from newspaper import Article  # fallback article text extraction

# NLP / tokenization & models
from nltk import sent_tokenize  # sentence splitter 
from transformers import pipeline  # Hugging Face inference pipeline

# Browser automation
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager  # auto-download ChromeDriver

# Visualization / plotting
import matplotlib.pyplot as plt

In [2]:
# Configure headless Chrome for scraping (safer in CI)
options = webdriver.ChromeOptions()
options.add_argument('--headless')                 # run without opening a browser window
options.add_argument('--no-sandbox')               # required in some restricted environments
options.add_argument('--disable-dev-shm-usage')    # avoid /dev/shm issues in containers

# Start ChromeDriver (webdriver-manager resolves the driver binary path)
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

# Create the project folder and switch to it so all outputs are saved there.
# The guard makes the cell idempotent: re-running it while already inside
# PROJECT_DIR must not create and enter a nested PROJECT_DIR/PROJECT_DIR.
PROJECT_DIR = "News_Sentiment"
if os.path.basename(os.getcwd()) != PROJECT_DIR:
    os.makedirs(PROJECT_DIR, exist_ok=True)
    os.chdir(PROJECT_DIR)
print(os.getcwd())
RUN_DATE = datetime.now().strftime("%Y-%m-%d")  # stamp each manual run

c:\Users\Admin\Code_Louisville\Course_Work\News\News_Sentiment


In [None]:
# Write the project README.md into the current working directory.
# Content fixes relative to the previous text: clonable repository URL,
# matplotlib added to the manual install list, precise avg_score / sent_value
# descriptions, balanced parentheses in the provenance line, and a completed
# "Failure behavior" sentence.
readme_text = """# News Sentiment Pipeline

### This code collected seven days' worth of unstructured news-article data and separated it into daily output files for future analysis.

### ***Code Outline***
* Create News_Sentiment folder, prepare README.md, set working directory, record run date
* Configure headless ChromeDriver (webdriver-manager) and start Selenium
* Load BBC News, collect deduplicated /news/ links (limit applied)
* For each URL: fetch HTML via Selenium with requests fallback
* Extract article text (BeautifulSoup selectors, newspaper3k fallback), clean and filter short items
* Build DataFrame with title, url, text, run_date, and text length
* Load NLTK sentence tokenizer and HF sentiment pipeline (DistilBERT on CPU)
* Split articles into sentences, run batched sentence-level sentiment, convert to signed scores
* Expand sentence results to a long table and save sentences_all_articles.csv
* Save per-article CSVs and top-sentences text files under ./sentences/
* Aggregate per-article polarity counts and mean score, save articles_sentiment_summary.csv and articles_sentiment.csv
* Write human-readable articles_texts.txt archive
* Log fetch/parse/sentiment errors, suppress HF symlink warning if set, and quit Selenium driver
* File aggregation and histogram visualization


### ***Link***
[BBC News Articles](https://www.bbc.com/news) 


### ***Directory***
| Column Name         | Description                                                                          | Data Type |
|:--------------------|:-------------------------------------------------------------------------------------|:---------:|
| article_index       | Unique identifier for each article                                                   | Integer   |
| title               | Title of the article                                                                 | String    |
| url                 | URL link to the article                                                              | String    |
| avg_label           | Average sentiment label (e.g., POSITIVE, NEGATIVE)                                   | String    |
| avg_score           | Absolute value of the mean signed sentence score (0-1)                               | Float     |
| num_sentences       | Total number of sentences in the article                                             | Integer   |
| sentence            | A specific sentence extracted from the article                                       | String    |
| sent_label          | Sentiment label for the specific sentence (e.g., POSITIVE, NEGATIVE)                 | String    |
| sent_score          | Confidence score (0–1)                                                               | Float     |
| sent_value          | Signed sentence score: +sent_score for POSITIVE, -sent_score for NEGATIVE            | Float     |


### ***Project Setup & Run Instructions***
|Action	                                                |Command / Notes                                                                                           |
|:------------------------------------------------------|:---------------------------------------------------------------------------------------------------------|
|Clone or fork the repo                                 |git clone https://github.com/ragogzheyan/Course_Work.git (the project lives in the News/ folder)         |
|Create & activate virtualenv (Git Bash)                |python -m venv venv         source venv/Scripts/activate                                                  | 
|Upgrade pip                                            |python -m pip install --upgrade pip                                                                       |
|Install from requirements.txt	                        |pip install -r requirements.txt                                                                           |
|Install packages manually (if no requirements.txt)  	|pip install selenium webdriver-manager beautifulsoup4 newspaper3k nltk transformers pandas numpy requests matplotlib | 
|NLTK tokenizer setup (one-time)	                    |python -c "import nltk; nltk.download('punkt')"                                                           |
|Run the pipeline (Jupyter notebook)	                |jupyter lab or jupyter notebook — then open and run the project's notebook (e.g., run_pipeline.ipynb)     |
|Deactivate virtualenv (Git Bash)	                    |deactivate                                                                                                |


### ***AI tools & Provenance(APA)***
**Model**: GPT-5 (OpenAI; accessed 2025-11-07 to 2025-11-14) 
**Role**: Assisted with Selenium configuration, rule generation, and sentence-level sentiment processing. 
**Prompts & outputs**: Archived: not available — prompts and important AI outputs are summarized below. 
**Settings**: model=gpt-5 

#### ***Selenium Fetch & Extraction***  
**Selenium config**: headless ChromeDriver via webdriver-manager; short wait (time.sleep(1)) after driver.get() to allow JS rendering. 
**Fetch logic**: Try Selenium first; if page_source length < 500 or Selenium errors, fallback to requests with timeout=8. 
**Extraction logic**: BeautifulSoup selectors: article, [role='main'], .story-body, .ssrcss-uf6wea-RichTextComponentWrapper; accept only extracts >200 chars — otherwise fallback to newspaper3k parsing. 
**Failure behavior**: On fetch/parse exceptions return an empty string; errors are printed to the notebook output. 
"""
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_text)

In [4]:
# Tunables for link collection
BASE_URL = "https://www.bbc.com"
MAX_LINKS = 18                              # how many deduplicated links to keep
IMAGE_SUFFIXES = (".png", ".jpg", ".gif")   # hrefs with these endings are skipped

# Load BBC News homepage and wait briefly for dynamic content
url = BASE_URL + "/news"
driver.get(url)
time.sleep(2)

# Parse the rendered page with BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Collect links under /news/, ignore image links, deduplicate, and keep the first MAX_LINKS.
links = []
for a in soup.find_all("a", href=True):
    href = a['href']
    # Reduce absolute same-site URLs to site-relative form so a single filter
    # handles both spellings. Previously absolute links could never pass the
    # "/news/" prefix test, which made the absolute-URL branch unreachable.
    if href.startswith(BASE_URL + "/"):
        href = href[len(BASE_URL):]
    if href.startswith("/news/") and not href.lower().endswith(IMAGE_SUFFIXES):
        links.append(BASE_URL + href)
links = list(dict.fromkeys(links))[:MAX_LINKS]  # dedupe while preserving order, then limit

In [5]:
def fetch_article_html(url, wait_seconds=1, min_html_len=500, timeout=8):
    """Return the HTML for ``url``, trying Selenium first and ``requests`` second.

    Uses the module-level ``driver`` (Selenium WebDriver) created earlier in the
    notebook.

    Parameters
    ----------
    url : str
        Page to fetch.
    wait_seconds : float, default 1
        Pause after ``driver.get`` to give dynamic content time to render.
    min_html_len : int, default 500
        Selenium page sources shorter than this are treated as incomplete and
        trigger the ``requests`` fallback.
    timeout : float, default 8
        Timeout in seconds passed to ``requests.get`` in the fallback.

    Returns
    -------
    str
        The page HTML, or ``""`` when both strategies fail. Failures are
        printed rather than raised, so callers can simply skip empty results.
    """
    # Try Selenium first (handles JS-rendered pages)
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        html = driver.page_source
        if len(html) < min_html_len:      # heuristic: very short HTML likely incomplete
            raise ValueError("Short html from Selenium")
        return html
    except Exception as selenium_err:
        # Report why Selenium was abandoned instead of hiding it; the fallback
        # below still runs, so this stays best-effort.
        print("Selenium fetch failed, falling back to requests:", url, selenium_err)

    # Fallback to requests for static pages or if Selenium fails
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print("Fetch failed:", url, e)
        return ""                          # return empty string on complete failure

def extract_text_from_html(html, url="", min_chars=200):
    """Extract the main article text from ``html``.

    Strategy:
      1. Select common article containers with BeautifulSoup and join their text.
         Only the outermost matches are used: when one matched container sits
         inside another (for example an ``<article>`` inside ``[role='main']``),
         joining both would repeat the inner text.
      2. If that yields ``min_chars`` characters or fewer, fall back to
         newspaper's ``Article`` parser on the same HTML.

    Parameters
    ----------
    html : str
        Raw page HTML.
    url : str, default ""
        Source URL; passed to ``Article`` and used in the error message.
    min_chars : int, default 200
        Selector-based text must be longer than this to be accepted.

    Returns
    -------
    str
        Extracted text, or ``""`` if nothing was extracted or parsing raised
        (the error is printed, not re-raised).
    """
    try:
        s = BeautifulSoup(html, "html.parser")
        # Common containers: <article>, role=main, legacy story-body, BBC-specific class
        parts = s.select("article, [role='main'], .story-body, .ssrcss-uf6wea-RichTextComponentWrapper")
        # Drop any match that has another match among its ancestors
        matched_ids = {id(p) for p in parts}
        outermost = [
            p for p in parts
            if not any(id(ancestor) in matched_ids for ancestor in p.parents)
        ]
        if outermost:
            # Join visible text while preserving sentence spacing
            text = " ".join(p.get_text(separator=" ", strip=True) for p in outermost)
            if len(text) > min_chars:     # require a minimum length to accept extraction
                return text
        # Fallback: use newspaper3k to parse article heuristically
        a = Article(url)
        a.set_html(html)
        a.parse()
        return a.text or ""
    except Exception as e:
        print("Parse failed:", url, e)
        return ""

In [6]:
# Build dataset: iterate collected links, fetch & extract article text, and store rows
MIN_TEXT_LEN = 120                                # articles must be longer than this many characters
ARTICLE_COLUMNS = ["title", "url", "text", "run_date"]

rows = []
for u in links:
    try:
        html = fetch_article_html(u)              # fetch page (Selenium then requests)
        if not html:
            continue                              # skip if fetch failed
        text = extract_text_from_html(html, u)    # extract main article text
        s = BeautifulSoup(html, "html.parser")
        # Prefer <h1> for title, fallback to <title>; look each up only once
        title_tag = s.find("h1") or s.title
        title = title_tag.get_text(strip=True) if title_tag else ""
        text = text.replace("Advertisement", "").strip()  # remove common noise
        if len(text) <= MIN_TEXT_LEN:             # skip short/insufficient articles
            continue
        rows.append({"title": title, "url": u, "text": text, "run_date": RUN_DATE})
    except Exception as e:
        print("Article loop error:", u, e)        # log and continue on errors

# Convert to DataFrame. Passing the column list keeps the expected columns even
# when no article survived, so the lines below do not fail with a KeyError.
df = pd.DataFrame(rows, columns=ARTICLE_COLUMNS).reset_index(drop=True)
df['text_len'] = df['text'].str.len()             # helpful for filtering/analysis
# Quick preview; capped at the number of rows because sample(5) raises on smaller frames
df.sample(min(5, len(df)))

Unnamed: 0,title,url,text,run_date,text_len
1,NewsNews,https://www.bbc.com/news/war-in-ukraine,News News War in Ukraine Fog helps Russian for...,2025-11-13,5212
12,NewsNews,https://www.bbc.com/news/world/africa,News News Africa Nigeria cancels mother-tongue...,2025-11-13,6923
13,NewsNews,https://www.bbc.com/news/world/asia,News News Asia China India Indian police inves...,2025-11-13,4727
8,NewsNews,https://www.bbc.com/news/scotland,News News Scotland Scotland Politics Hitman pl...,2025-11-13,6868
10,NewsNews,https://www.bbc.com/news/wales,"News News Wales Wales Politics Man, 18, arrest...",2025-11-13,6907


In [7]:
# Sentence-level sentiment classifier: build the Hugging Face pipeline once and
# reuse it for every article. device=-1 keeps inference on the CPU.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sent = pipeline(
    "sentiment-analysis",
    model=model_name,
    device=-1,
)

def _empty_sentiment_result(num_sentences=0, sentence_scores=None):
    """Build the result dict returned when an article has no usable sentiment."""
    return {
        "avg_label": None, "avg_score": None,
        "num_sentences": num_sentences,
        "polarity_counts": {"positive": 0, "negative": 0},
        "top_positive": [], "top_negative": [],
        "sentence_scores": [] if sentence_scores is None else sentence_scores,
    }


def article_sentiment_analysis(text, max_sents=200, sent_trunc=590, batch_size=32):
    """
    Per-article sentence-level sentiment.

    Uses the module-level ``sent_tokenize`` and the ``sent`` pipeline.

    Parameters
    ----------
    text : str or None
        Article text; ``None`` or empty text yields the empty result.
    max_sents : int
        Only the first ``max_sents`` sentences are scored.
    sent_trunc : int
        Each sentence is cut to this many characters before inference.
    batch_size : int
        Number of sentences passed to the pipeline per call.

    Returns
    -------
    dict with keys
        avg_label        "POSITIVE" if the mean signed value is > 0, else "NEGATIVE"
        avg_score        absolute value of the mean signed value
        num_sentences    number of sentences considered (after the cap)
        polarity_counts  {"positive": n, "negative": m}
        top_positive     up to 3 sentences with the highest signed value
        top_negative     up to 3 sentences with the lowest signed value, lowest first
        sentence_scores  list of {"sentence", "label", "score", "value"} dicts,
                         where value = +score for POSITIVE and -score otherwise

    Any exception is printed and converted into the empty result, so one bad
    article does not stop a DataFrame ``apply``.
    """
    try:
        sents = sent_tokenize(text or "")[:max_sents]       # split, then cap sentence count
        if not sents:
            return _empty_sentiment_result()

        sentence_scores = []
        for i in range(0, len(sents), batch_size):
            batch = [s[:sent_trunc] for s in sents[i:i+batch_size]]
            # The character cut above bounds the text length but not the token
            # count; truncation=True additionally lets the tokenizer cut at the
            # model's maximum length so one unusual sentence cannot fail the
            # whole article.
            results = sent(batch, truncation=True)
            for txt, r in zip(batch, results):
                label = r.get("label")
                score = float(r.get("score", 0.0))
                val = score if label == "POSITIVE" else -score
                sentence_scores.append({
                    "sentence": txt,
                    "label": label,
                    "score": score,
                    "value": val
                })

        values = [s["value"] for s in sentence_scores]
        if not values:                                      # nothing was scored
            return _empty_sentiment_result(len(sents), sentence_scores)

        # Aggregate per-article polarity and top sentences
        avg_val = float(np.mean(values))
        avg_label = "POSITIVE" if avg_val > 0 else "NEGATIVE"
        avg_score = abs(avg_val)
        pos_count = sum(1 for v in values if v > 0)
        neg_count = sum(1 for v in values if v < 0)

        sorted_by_val = sorted(sentence_scores, key=lambda x: x["value"], reverse=True)
        top_pos = [s["sentence"] for s in sorted_by_val if s["value"] > 0][:3]
        # List is in descending order, so the last three negatives are the most
        # negative; reverse them to put the most negative first.
        top_neg = [s["sentence"] for s in sorted_by_val if s["value"] < 0][-3:][::-1]

        return {
            "avg_label": avg_label,
            "avg_score": avg_score,
            "num_sentences": len(sents),
            "polarity_counts": {"positive": pos_count, "negative": neg_count},
            "top_positive": top_pos,
            "top_negative": top_neg,
            "sentence_scores": sentence_scores
        }

    except Exception as e:
        print("Sentiment error:", e)
        return _empty_sentiment_result()                    # graceful fallback on error

# Apply the analysis to each article's text and expand the result dicts to columns.
# The explicit column list keeps the expected columns even when df is empty.
SENTIMENT_COLUMNS = [
    "avg_label", "avg_score", "num_sentences", "polarity_counts",
    "top_positive", "top_negative", "sentence_scores"
]
results = df['text'].apply(article_sentiment_analysis)
sent_df = pd.DataFrame(results.tolist(), columns=SENTIMENT_COLUMNS)
# Ensure run_date travels with the article-level output
output = pd.concat([df[['title','url','text','run_date']].reset_index(drop=True), sent_df], axis=1)

# Print key summary columns and example outputs for inspection
print(output[['title','url','avg_label','avg_score','num_sentences','polarity_counts','top_positive','top_negative']])
if not df.empty:
    print(df.loc[0,'text'][:500])                 # peek at raw text of first article
    # Reuse the result computed above instead of running the model again
    print(results.iloc[0])                        # full sentiment dict for first article

Device set to use cpu


       title                                                url avg_label  \
0   NewsNews       https://www.bbc.com/news/topics/c2vdnvdg6xxt  NEGATIVE   
1   NewsNews            https://www.bbc.com/news/war-in-ukraine  NEGATIVE   
2   NewsNews                 https://www.bbc.com/news/us-canada  NEGATIVE   
3   NewsNews                        https://www.bbc.com/news/uk  NEGATIVE   
4   NewsNews                  https://www.bbc.com/news/politics  NEGATIVE   
5   NewsNews                   https://www.bbc.com/news/england  NEGATIVE   
6   NewsNews          https://www.bbc.com/news/northern_ireland  NEGATIVE   
7   NewsNews  https://www.bbc.com/news/northern_ireland/nort...  NEGATIVE   
8   NewsNews                  https://www.bbc.com/news/scotland  NEGATIVE   
9   NewsNews  https://www.bbc.com/news/scotland/scotland_pol...  NEGATIVE   
10  NewsNews                     https://www.bbc.com/news/wales  NEGATIVE   
11  NewsNews      https://www.bbc.com/news/wales/wales_politics  NEGATIVE   

In [8]:
# Append this run's article-level rows to the master CSV
master_path = "articles_sentiment_master.csv"
master_cols = [
    "run_date", "title", "url", "avg_label", "avg_score",
    "num_sentences", "polarity_counts"
]
key_cols = ["run_date", "url"]                    # identity of a row in the master file
master_out = output[master_cols].copy()

# A zero-byte file counts as "no master yet": reading it would fail, and the
# header still has to be written.
master_exists = os.path.exists(master_path) and os.path.getsize(master_path) > 0

# De‑duplicate by run_date + url before writing
if master_exists:
    # drop_duplicates: repeated keys in the master would otherwise multiply the
    # rows produced by the left merge below.
    existing = pd.read_csv(master_path, usecols=key_cols).drop_duplicates()
    new_rows = master_out.merge(existing, on=key_cols, how="left", indicator=True)
    master_out = new_rows[new_rows["_merge"] == "left_only"].drop(columns=["_merge"])

# Append mode; write header only if there is no master content yet
if not master_out.empty:
    master_out.to_csv(master_path, mode="a", header=not master_exists, index=False)
    print(f"Appended {len(master_out)} new rows to {master_path}")
else:
    print("No new rows to append today (all duplicates).")

Appended 18 new rows to articles_sentiment_master.csv


In [9]:
def _write_top_sentences(path, title, url, top_pos, top_neg):
    """Write a short text report listing an article's top positive/negative sentences."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"Title: {title}\nURL: {url}\n\n")
        f.write("Top positive sentences:\n")
        for _, r in top_pos.iterrows():
            f.write(f"- ({r['sent_score']:.3f}) {r['sentence']}\n")
        f.write("\nTop negative sentences:\n")
        for _, r in top_neg.iterrows():
            f.write(f"- ({r['sent_score']:.3f}) {r['sentence']}\n")


# Create output folder for per-article sentence files
os.makedirs("sentences", exist_ok=True)

# Expand per-article sentence scores into a long table (one row per sentence)
SENT_LONG_COLUMNS = [
    "article_index", "title", "url", "avg_label", "avg_score", "num_sentences",
    "sentence", "sent_label", "sent_score", "sent_value"
]
sentence_rows = []
for idx, row in output.iterrows():
    meta = {
        "article_index": idx,
        "title": row.get('title', ''),
        "url": row.get('url', ''),
        "avg_label": row.get('avg_label'),
        "avg_score": row.get('avg_score'),
        "num_sentences": row.get('num_sentences')
    }
    for s in row.get('sentence_scores', []):            # flatten sentence-level dicts
        sentence_rows.append({
            **meta,
            "sentence": s.get('sentence'),
            "sent_label": s.get('label'),
            "sent_score": s.get('score'),
            "sent_value": s.get('value')               # signed polarity value
        })

# Explicit columns keep the table well-formed even when no sentence was scored
sent_long = pd.DataFrame(sentence_rows, columns=SENT_LONG_COLUMNS)

# Save the full sentence-level table twice: a date-stamped copy that preserves
# previous runs, and the classic filename named in the final print.
for sentences_all_path in (f"sentences_all_articles_{RUN_DATE}.csv", "sentences_all_articles.csv"):
    sent_long.to_csv(sentences_all_path, index=False)

# For each article: save per-article CSVs and top-sentence reports (dated + classic names)
for idx, grp in sent_long.groupby("article_index"):
    title_safe = f"article_{idx}"
    for stem in (f"{title_safe}_{RUN_DATE}", title_safe):
        grp.to_csv(os.path.join("sentences", f"{stem}.csv"), index=False)

    # Select top positive and most negative sentences for quick inspection
    top_pos = grp[grp['sent_value'] > 0].sort_values('sent_value', ascending=False).head(3)
    top_neg = grp[grp['sent_value'] < 0].sort_values('sent_value').head(3)

    for stem in (f"{title_safe}_{RUN_DATE}", title_safe):
        _write_top_sentences(
            os.path.join("sentences", f"{stem}_top_sentences.txt"),
            output.at[idx, 'title'], output.at[idx, 'url'], top_pos, top_neg
        )

# Aggregate per-article polarity counts and mean signed score
if sent_long.empty:
    # Nothing to aggregate: build an empty, correctly typed frame so the merge below still works
    polarity_summary = pd.DataFrame({
        "article_index": pd.Series(dtype="int64"),
        "positive_count": pd.Series(dtype="float64"),
        "negative_count": pd.Series(dtype="float64"),
        "neutral_count": pd.Series(dtype="float64"),
        "mean_signed": pd.Series(dtype="float64"),
    })
else:
    polarity_summary = sent_long.groupby('article_index')['sent_value'].apply(
        lambda vals: pd.Series({
            "positive_count": (vals > 0).sum(),
            "negative_count": (vals < 0).sum(),
            "neutral_count": (vals == 0).sum(),
            "mean_signed": vals.mean()
        })
    ).unstack().reset_index()

# Merge summary with article-level output and save final CSV (dated + classic filename)
summary_df = output.merge(polarity_summary, left_on=output.index, right_on='article_index', 
                        how='left').drop(columns=['key_0','article_index'], errors='ignore')
for summary_path in (f"articles_sentiment_summary_{RUN_DATE}.csv", "articles_sentiment_summary.csv"):
    summary_df.to_csv(summary_path, index=False)

print("Saved: sentences_all_articles.csv, per-article CSVs in ./sentences/, and articles_sentiment_summary.csv")

Saved: sentences_all_articles.csv, per-article CSVs in ./sentences/, and articles_sentiment_summary.csv


In [10]:
def save_outputs(output_df):
    """Write the article-level sentiment table to CSV.

    Two files are written in the current working directory: a date-stamped
    ``articles_sentiment_{RUN_DATE}.csv`` and the fixed-name
    ``articles_sentiment.csv`` (overwritten on each run).
    """
    output_df.to_csv(f"articles_sentiment_{RUN_DATE}.csv", index=False)
    output_df.to_csv("articles_sentiment.csv", index=False)


def _write_articles_text(path, articles_df):
    """Write a human-readable archive of all articles (title, URL, text) to ``path``."""
    with open(path, "w", encoding="utf-8") as f:
        for i, r in articles_df.iterrows():
            title = r.get('title', '').replace('\n', ' ')   # keep single-line titles
            url = r.get('url', '')
            text = r.get('text', '').replace('\r', '')      # normalize line endings
            f.write(f"## {i+1} {title}\n{url}\n{text}\n\n")


try:
    # Save the full article-level sentiment table. Previously save_outputs was
    # defined but never called, so these CSVs were never produced.
    save_outputs(output)

    # Text archive: date-stamped copy plus the classic filename
    for archive_path in (f"articles_texts_{RUN_DATE}.txt", "articles_texts.txt"):
        _write_articles_text(archive_path, output)
finally:
    # Cleanly close the Selenium browser and end the WebDriver session,
    # even if one of the writes above failed.
    driver.quit()

### ***File aggregation and histogram visualization***

In [11]:
# Sentence-level CSVs for the seven daily runs, 2025-11-07 through 2025-11-13
files = [
    f"../News_Sentiment/sentences_all_articles_2025-11-{day:02d}.csv"
    for day in range(7, 14)
]

In [12]:
# Function to generate histogram from 'sent_value'
def save_polarity_histogram_from_list(csv_files, out_png="histogram.png", bins=30):
    """Plot a histogram of ``sent_value`` across the given CSVs and save it as a PNG.

    Parameters
    ----------
    csv_files : iterable of str
        Paths of CSV files to read and concatenate; the combined data must
        contain a ``sent_value`` column.
    out_png : str, default "histogram.png"
        Output image path.
    bins : int, default 30
        Number of histogram bins.

    Returns
    -------
    None
        Failures (unreadable files, missing column, save errors) are printed,
        not raised.
    """
    try:
        # Read each CSV and combine them into one frame for aggregate plotting
        df_list = [pd.read_csv(f) for f in csv_files]
        combined_df = pd.concat(df_list, ignore_index=True)

        # Ensure required column exists before plotting
        if 'sent_value' not in combined_df.columns:
            raise ValueError("Column 'sent_value' not found in combined data.")

        # Create and save a histogram of sentiment values
        fig = plt.figure(figsize=(8, 6))
        try:
            combined_df['sent_value'].hist(bins=bins, color='salmon', edgecolor='black')
            plt.title("Sentiment Polarity Histogram")
            plt.xlabel("Sentiment Value")
            plt.ylabel("Frequency")
            plt.tight_layout()
            plt.savefig(out_png)
        finally:
            # Close this figure even when plotting or saving raised, so a
            # failed call does not leave an open figure behind.
            plt.close(fig)
        print(f"Histogram saved as {out_png}")
    except Exception as e:
        print("Failed to generate histogram:", e)

# Main block: read files, combine, assign article ids, save CSV, and plot
try:
    dfs = []
    missing = []
    for f in files:
        # Check file exists before attempting read to avoid crashes
        if os.path.exists(f):
            dfs.append(pd.read_csv(f))
        else:
            missing.append(f)

    # Report any paths that were not found
    if missing:
        print("Missing files (skipped):", missing)
    if not dfs:
        raise FileNotFoundError("No input CSVs found.")

    # Concatenate all loaded DataFrames, remembering which file each row came from
    combined = pd.concat(
        [day_df.assign(_file_no=file_no) for file_no, day_df in enumerate(dfs)],
        ignore_index=True,
    )
    if "article_index" in combined.columns:
        # Rows are sentences and each input file carries its own article_index
        # values, so build one id per (file, article) pair. The previous
        # per-row counter gave every sentence a different "article" id.
        combined["article_index"] = combined.groupby(
            ["_file_no", "article_index"], sort=False, dropna=False
        ).ngroup()
    else:
        # No article id in the inputs: fall back to a plain row counter
        combined["article_index"] = range(len(combined))
    combined = combined.drop(columns="_file_no")

    # Persist combined data for downstream use
    combined.to_csv("combined_last_7d.csv", index=False)
    print("Saved combined_last_7d.csv")

    # Generate histogram from the combined CSV file
    save_polarity_histogram_from_list(["combined_last_7d.csv"], out_png="polarity_last_7d.png")

except Exception as e:
    # Catch-all to avoid unhandled exceptions in scripting contexts
    print("Failed to build combined CSV or histogram:", e)

# Ensure all matplotlib figures are closed (release resources)
plt.close('all')

Saved combined_last_7d.csv
Histogram saved as polarity_last_7d.png
