# BBC News Classification with Logistic Regression

This notebook demonstrates how to train a Logistic Regression model to classify BBC news articles into different categories (business, entertainment, politics, sport, tech).

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

## 2. Load and Prepare Data

In [None]:
import os

texts = []
categories = []

base_path = "bbc"

for category in os.listdir(base_path):
    category_path = os.path.join(base_path, category)
    if not os.path.isdir(category_path):
        continue
    for filename in os.listdir(category_path):
        file_path = os.path.join(category_path, filename)
        with open(file_path, "r", encoding="latin-1") as f:
            texts.append(f.read())
            categories.append(category)

bbc_text = pd.DataFrame({
    "text": texts,
    "category": categories
})

print(f"✓ Loaded {len(bbc_text)} articles")
print(f"Categories distribution:\n{bbc_text['category'].value_counts()}")
print(f"\nDataset shape: {bbc_text['category'].shape}")
bbc_text.head()

In [None]:
# Save the DataFrame as CSV for future use
csv_path = "bbc-text.csv"
bbc_text.to_csv(csv_path, index=False)
print(f"✓ CSV file saved to: {csv_path}")
print(f"CSV file size: {os.path.getsize(csv_path) / 1024 / 1024:.2f} MB")

## Load CSV File

In [None]:
# Read our dataset using read_csv()
bbc_text = pd.read_csv('bbc-text.csv')

# Rename 'text' column to 'News_Headline' for clarity
bbc_text = bbc_text.rename(columns={'text': 'News_Headline'}, inplace=False)

print(f"✓ CSV file loaded successfully")
print(f"Dataset shape: {bbc_text.shape}")
print(f"Columns: {bbc_text.columns.tolist()}")
print(f"\nFirst few rows:")
bbc_text.head()

## Data Visualization - Articles per Category

In [5]:
plt.figure(figsize=(8, 5))
sns.countplot(
    data=bbc_text,
    y="category",
    hue="category",
    palette="Set1",
    legend=False
)
plt.title("Number of Articles per Category", fontsize=14, fontweight='bold')
plt.xlabel("Count")
plt.ylabel("Category")
plt.tight_layout()
plt.show()

# Print summary statistics
print("\nArticles per Category:")
print(bbc_text['category'].value_counts().sort_values(ascending=False))
print(f"\nTotal articles: {len(bbc_text)}")

NameError: name 'plt' is not defined

## Stop Words Analysis

In [None]:
import nltk
from nltk.corpus import stopwords
from collections import defaultdict

# Download stopwords corpus
nltk.download('stopwords', quiet=True)

def plot_stopwords(data):
    """Visualize the top 10 most common stop words in the dataset"""
    stop = set(stopwords.words('english'))
    data_split = data.str.split()
    data_list = data_split.values.tolist()
    corpus = [word for i in data_list for word in i]
    
    dictionary_stopwords = defaultdict(int)
    for word in corpus:
        if word in stop:
            dictionary_stopwords[word] += 1
    
    # Get top 10 stop words
    top = sorted(dictionary_stopwords.items(), key=lambda x: x[1], reverse=True)[:10]
    x, y = zip(*top)
    
    # Create visualization
    plt.figure(figsize=(10, 6))
    plt.bar(x, y, color='steelblue')
    plt.title('Top 10 Most Common Stop Words in BBC News Dataset', fontsize=14, fontweight='bold')
    plt.xlabel('Stop Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
    
    print(f"✓ Stop words analysis completed")
    print(f"\nTop 10 Stop Words:")
    for word, count in top:
        print(f"  {word}: {count}")

# Run the analysis
plot_stopwords(bbc_text['News_Headline'])

## Top Frequent Words Analysis

In [None]:
from collections import Counter

def top_frequent_words(data):
    """Visualize the top 20 most frequent non-stopwords using a rainbow palette"""
    stop = set(stopwords.words('english'))

    # Convert to lowercase and split into words
    data_split = data.str.lower().str.split()
    data_list = data_split.values.tolist()
    corpus = [word for i in data_list for word in i]

    # Count word frequencies
    counter = Counter(corpus)
    most_common = counter.most_common()

    # Extract top 20 non-stopwords (alphabetic only)
    words, counts = [], []
    for word, count in most_common:
        if word not in stop and word.isalpha():
            words.append(word)
            counts.append(count)
        if len(words) == 20:
            break

    # Create DataFrame for seaborn
    plot_df = pd.DataFrame({
        "word": words,
        "count": counts
    })

    # Create visualization with rainbow palette
    plt.figure(figsize=(10, 8))
    sns.barplot(
        data=plot_df,
        x="count",
        y="word",
        hue="word",
        palette="rainbow",
        legend=False
    )
    plt.title("Top 20 Most Frequent Non-Stopwords in BBC News", fontsize=14, fontweight='bold')
    plt.xlabel("Frequency")
    plt.ylabel("Word")
    plt.tight_layout()
    plt.show()
    
    print("✓ Top frequent words analysis completed")
    print(f"\nTop 20 Most Frequent Words:")
    for idx, (word, count) in enumerate(zip(words, counts), 1):
        print(f"  {idx:2d}. {word:15s}: {count:4d}")

# Run the analysis
top_frequent_words(bbc_text["News_Headline"])

## Word Cloud Visualization

In [None]:
import os
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import TreebankWordTokenizer

# Setup NLTK data directory
NLTK_DATA_DIR = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(NLTK_DATA_DIR, exist_ok=True)

if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DATA_DIR)

# Download required NLTK packages
for pkg in ["stopwords", "wordnet", "punkt", "punkt_tab", "omw-1.4"]:
    try:
        nltk.download(pkg, download_dir=NLTK_DATA_DIR, quiet=True)
    except Exception as e:
        print(f"NLTK download warning for {pkg}: {e}")

# Initialize tokenizer and lemmatizer
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

def plot_wordcloud(data):
    """Generate and display a word cloud from the dataset"""
    stop = set(stopwords.words("english"))
    
    corpus = []
    for text in data:
        # Tokenize and normalize
        tokens = tokenizer.tokenize(str(text).lower())
        # Filter: alphabetic, non-stopwords, length > 2, and lemmatize
        tokens = [
            lemmatizer.lemmatize(w)
            for w in tokens
            if w.isalpha() and w not in stop and len(w) > 2
        ]
        corpus.extend(tokens)

    # Generate word cloud
    wc = WordCloud(
        background_color="white",
        stopwords=STOPWORDS,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1,
        colormap='viridis'
    ).generate(" ".join(corpus))

    # Display
    plt.figure(figsize=(14, 10))
    plt.axis("off")
    plt.imshow(wc, interpolation='bilinear')
    plt.title("Word Cloud - BBC News Articles", fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()
    
    print("✓ Word cloud visualization completed!")
    print(f"Total unique words in cloud: {len(wc.words_)}")

# Generate the word cloud
plot_wordcloud(bbc_text["News_Headline"])

## Data Preprocessing and Cleaning

Before building our model, we need to clean and preprocess the dataset to remove stop words, punctuation, inconsistencies, and other noise that could cause the model to under/overfit. We'll apply several techniques: lowercasing, tokenization, lemmatization, and stop word removal.

### Step 1: Lowercasing

In [None]:
# Convert all text to lowercase for consistency
print("Converting text to lowercase...")
bbc_text_clean = bbc_text.copy()
bbc_text_clean['News_Headline'] = bbc_text_clean['News_Headline'].apply(lambda x: str(x).lower())

print("✓ Lowercasing completed")
print(f"\nSample before and after:")
print(f"Original: {bbc_text['News_Headline'].iloc[0][:100]}")
print(f"Lowercase: {bbc_text_clean['News_Headline'].iloc[0][:100]}")
print(f"\nDataset shape: {bbc_text_clean.shape}")

### Step 2: Removing Stop Words

In [None]:
# Remove stop words from the text
print("Removing stop words...")

# Get English stop words
stop_words = set(stopwords.words('english'))

# Tokenize and remove stop words
def remove_stopwords(text):
    """Tokenize text and remove stop words"""
    tokens = str(text).split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

# Apply to the dataset
bbc_text_clean['News_Headline'] = bbc_text_clean['News_Headline'].apply(remove_stopwords)

print("✓ Stop words removal completed")
print(f"\nNumber of stop words removed: {len(stop_words)}")
print(f"Sample stop words: {list(stop_words)[:15]}")

print(f"\nSample after stop word removal:")
print(f"Text: {bbc_text_clean['News_Headline'].iloc[0][:100]}")

print(f"\nDataset shape: {bbc_text_clean.shape}")
print(f"\nFirst 5 rows:")
print(bbc_text_clean.head())

### Step 3: Stemming and Lemmatization

**Stemming** removes the last few characters from a word to get its root form, but can produce non-words (e.g., "running" → "runn").

**Lemmatization** reduces words to their dictionary base form (lemma) using context, producing valid words (e.g., "running" → "run", "better" → "good").

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Download wordnet if not already available
nltk.download('wordnet', download_dir=NLTK_DATA_DIR, quiet=True)
nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR, quiet=True)

# Demonstration of Stemming vs Lemmatization
sample_words = ['running', 'runs', 'ran', 'runner', 'walked', 'walks', 'better', 'study', 'studying']
print("=" * 60)
print("STEMMING vs LEMMATIZATION COMPARISON")
print("=" * 60)
print(f"{'Word':<15} {'Stemmed':<15} {'Lemmatized':<15}")
print("-" * 60)

for word in sample_words:
    stemmed = stemmer.stem(word)
    lemmatized = lemmatizer.lemmatize(word, pos='v')  # pos='v' for verbs
    print(f"{word:<15} {stemmed:<15} {lemmatized:<15}")

print("\n" + "=" * 60)
print("Applying Lemmatization to dataset...")
print("=" * 60)

# Function to lemmatize text
def lemmatize_text(text):
    """Tokenize text and apply lemmatization"""
    tokens = str(text).split()
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to the dataset
bbc_text_clean['News_Headline'] = bbc_text_clean['News_Headline'].apply(lemmatize_text)

print("✓ Lemmatization completed")
print(f"\nSample after lemmatization:")
print(f"Text: {bbc_text_clean['News_Headline'].iloc[0][:100]}")
print(f"\nDataset shape: {bbc_text_clean.shape}")
print(f"\nFirst 3 rows:")
print(bbc_text_clean.head(3))

## 3. Split Data into Training and Testing Sets

In [5]:
from sklearn.model_selection import train_test_split

# Prepare feature and target
if 'text_clean' in bbc_text.columns:
    X = bbc_text['text_clean']
elif 'text_clean_str' in bbc_text.columns:
    X = bbc_text['text_clean_str']
else:
    # fallback to raw headlines
    X = bbc_text['News_Headline'].astype(str)

y = bbc_text['category']

# Split data with 60% train, 40% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, random_state=1, stratify=y if 'category' in bbc_text.columns else None
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size:  {len(X_test)}")
print(f"Class distribution (train):\n{y_train.value_counts()}\n")
print(f"Class distribution (test):\n{y_test.value_counts()}\n")


Training set size: 1335
Testing set size:  890
Class distribution (train):
category
sport            307
business         306
politics         250
tech             240
entertainment    232
Name: count, dtype: int64

Class distribution (test):
category
sport            204
business         204
politics         167
tech             161
entertainment    154
Name: count, dtype: int64



## 4. Feature Transformation (TF-IDF Vectorization)

In [4]:
# Robust TF-IDF encoding step: ensure data is available and use cleaned text if present
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Attempt to load CSV; if missing, rebuild from folder
try:
    if 'bbc_text' not in globals():
        bbc_text = pd.read_csv('/workspaces/BBCnews/bbc-text.csv')
        if 'News_Headline' not in bbc_text.columns and 'text' in bbc_text.columns:
            bbc_text = bbc_text.rename(columns={'text': 'News_Headline'}, inplace=False)
except Exception:
    print('bbc-text.csv not found; rebuilding dataset from folder...')
    texts = []
    categories = []
    base_path = '/workspaces/BBCnews/bbc'
    for category in os.listdir(base_path):
        category_path = os.path.join(base_path, category)
        if not os.path.isdir(category_path):
            continue
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    texts.append(f.read())
                    categories.append(category)
            except Exception as e:
                continue
    bbc_text = pd.DataFrame({'News_Headline': texts, 'category': categories})

# Prefer cleaned dataframe if available
if 'bbc_text_clean' in globals():
    source_series = bbc_text_clean['News_Headline'].astype(str)
else:
    # fall back to original
    source_series = bbc_text['News_Headline'].astype(str)

# Create a cleaned string column if not present
if 'text_clean_str' not in bbc_text.columns:
    # If token lists exist in 'text_clean', join them; otherwise use source_series lowercase
    if 'text_clean' in bbc_text.columns:
        bbc_text['text_clean_str'] = bbc_text['text_clean'].apply(lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else str(x))
    else:
        bbc_text['text_clean_str'] = source_series.str.lower()

# TF-IDF vectorization
print('Fitting TF-IDF (may take a moment)...')
vectorizer = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1,1)
)

tf_idf1 = vectorizer.fit_transform(bbc_text['text_clean_str'])
print(f'✓ TF-IDF completed — shape: {tf_idf1.shape} (documents, features)')

# Keep vectorizer for reuse
tfidf_vectorizer = vectorizer


Fitting TF-IDF (may take a moment)...
✓ TF-IDF completed — shape: (2225, 1000) (documents, features)


## 5. Train Logistic Regression Model

In [None]:
# Instantiate and train the Logistic Regression model
logistic_reg = LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial')

print("Training Logistic Regression model...")
logistic_reg.fit(X_train_transformed, y_train)

print("✓ Model training completed!")
print(f"Model: {logistic_reg}")
print(f"Number of classes: {len(logistic_reg.classes_)}")
print(f"Classes: {logistic_reg.classes_}")

## 6. Make Predictions

In [None]:
# Generate predictions on training and test sets
y_pred_train = logistic_reg.predict(X_train_transformed)
y_pred_test = logistic_reg.predict(X_test_transformed)

# Get prediction probabilities
y_pred_proba = logistic_reg.predict_proba(X_test_transformed)

print(f"✓ Predictions generated!")
print(f"Sample predictions (first 10): {y_pred_test[:10].tolist()}")
print(f"Confidence scores for first sample: {dict(zip(logistic_reg.classes_, y_pred_proba[0]))}")

## 7. Evaluate Model Performance

In [None]:
# Calculate performance metrics
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test, average='weighted')
recall = recall_score(y_test, y_pred_test, average='weighted')
f1 = f1_score(y_test, y_pred_test, average='weighted')

print("=" * 50)
print("MODEL PERFORMANCE METRICS")
print("=" * 50)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy:  {test_accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"F1-Score (weighted):  {f1:.4f}")
print("=" * 50)

In [None]:
# Generate classification report
print("\nDETAILED CLASSIFICATION REPORT:")
print("=" * 50)
print(classification_report(y_test, y_pred_test))

In [11]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test)

# Visualize confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=logistic_reg.classes_, 
            yticklabels=logistic_reg.classes_)
plt.title('Confusion Matrix - Logistic Regression', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("✓ Confusion matrix visualization completed!")

NameError: name 'confusion_matrix' is not defined

In [10]:
# In a notebook cell:
%pip install -q pandas scikit-learn nltk wordcloud seaborn matplotlib


1266.66s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.
