In [None]:
import pandas as pd
import re

# Read the BBC news CSV, then pull out the non-missing descriptions as strings
df = pd.read_csv("bbc-news.csv")
texts = df.loc[df["description"].notna(), "description"].astype(str)

def clean_text(text):
    """Normalise a raw description into lowercase, space-separated ASCII words.

    Steps, in order: lowercase, strip URLs, delete apostrophes, turn every
    other punctuation mark into a word separator, delete the remaining
    non-letters (digits, non-ASCII letters), collapse whitespace.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example NaN or
            None coming from a missing cell) is treated as empty.

    Returns:
        The cleaned string, or ``""`` when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # Apostrophes are deleted rather than split on, so "world's" stays a
    # single token ("worlds") instead of leaving a stray "s".
    text = re.sub(r"['\u2018\u2019]", "", text)
    # Any other punctuation acts as a separator; deleting it outright would
    # fuse neighbouring words ("well-known" -> "wellknown").
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean the description of every row. `texts` has the rows with a missing
# description dropped, so assigning `texts.apply(clean_text)` back onto `df`
# would re-align on the index and leave NaN in those rows; the tokenizers
# applied to this column in later cells raise on a non-string value. Missing
# descriptions are therefore cleaned as empty strings instead.
df["cleaned_text"] = df["description"].fillna("").astype(str).apply(clean_text)

print(df["cleaned_text"].head())


0    the ukrainian president says the country will ...
1    jeremy bowen was on the frontline in irpin as ...
2    one of the worlds biggest fertiliser firms say...
3    the parents of the manchester arena bombings y...
4    consumers are feeling the impact of higher ene...
Name: cleaned_text, dtype: object


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Fetch the NLTK resources this cell relies on; each call is a no-op when the
# package is already up to date.
nltk.download("punkt")
nltk.download("stopwords")
# Newer NLTK releases have word_tokenize load the "punkt_tab" tables rather
# than the pickled "punkt" models. Without this download a fresh kernel fails
# with a LookupError at the first word_tokenize call.
nltk.download("punkt_tab")

# English stop words as a set for O(1) membership tests while filtering tokens
stop_words = set(stopwords.words("english"))

def nltk_tokenize(text):
    """Tokenize `text` with NLTK and keep only the informative words.

    A token is kept when it is longer than two characters and is not in the
    module-level `stop_words` set. Returns the kept tokens as a list, in
    their original order.
    """
    kept = []
    for word in word_tokenize(text):
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(word)
    return kept

# Tokenize each cleaned description and store the token list per row
df["tokens_nltk"] = df["cleaned_text"].map(nltk_tokenize)

# Flatten the per-row lists into one corpus-wide list, then count occurrences
all_tokens = [tok for row_tokens in df["tokens_nltk"] for tok in row_tokens]
freq_dist = Counter(all_tokens).most_common(10)

print("Top 10 frequent tokens (NLTK):")
print(freq_dist)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Top 10 frequent tokens (NLTK):
[('says', 4561), ('world', 2030), ('bbc', 2011), ('people', 1989), ('england', 1922), ('first', 1905), ('new', 1894), ('say', 1676), ('cup', 1486), ('two', 1392)]


In [None]:
import spacy
from nltk.stem import PorterStemmer

# Shared NLP objects for this cell: NLTK's Porter stemmer and the spaCy
# "en_core_web_sm" pipeline (the model must already be installed).
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

def spacy_process(text):
    """Run `text` through spaCy and return its lemmas and their Porter stems.

    Tokens are skipped when spaCy flags them as stop words, when they are not
    purely alphabetic, or when `len(token)` is 2 or less. Each kept token
    contributes its lowercased lemma; every lemma is then stemmed.

    Returns:
        A `(lemmas, stems)` tuple of two lists of equal length.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop or not tok.is_alpha or len(tok) <= 2:
            continue
        lemmas.append(tok.lemma_.lower())
    stems = list(map(stemmer.stem, lemmas))
    return lemmas, stems

# Process every cleaned description; zip(*) turns the sequence of
# (lemmas, stems) pairs into one tuple of lemma lists and one of stem lists.
df["lemmas"], df["stems"] = zip(*map(spacy_process, df["cleaned_text"]))

# Flatten the per-row lemma lists and count corpus-wide occurrences
all_lemmas = [lem for row_lemmas in df["lemmas"] for lem in row_lemmas]
lemma_freq = Counter(all_lemmas).most_common(10)

print("Top 10 frequent lemmas (spaCy):")
print(lemma_freq)


Top 10 frequent lemmas (spaCy):
[('say', 5463), ('england', 2313), ('year', 2250), ('world', 2170), ('bbc', 2026), ('people', 2021), ('win', 1918), ('new', 1898), ('cup', 1501), ('day', 1500)]
