<a href="https://colab.research.google.com/github/rahmamohax/Elevvo-Tasks/blob/master/Topic%20Modeling%20on%20News%20Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 5: Topic Modeling on News Articles

In [None]:
!pip install gensim pyLDAvis



### Load Dataset

In [None]:
import os

# Show where the dataset lives and which files it contains.
# NOTE(review): `path` is not assigned in any visible cell — presumably it comes
# from a dataset download step that was removed; confirm before Restart & Run All.
print("Dataset path:", path)
dataset_files = os.listdir(path)
print("Files in dataset folder:", dataset_files)

Dataset path: /root/.cache/kagglehub/datasets/gpreda/bbc-news/versions/1007
Files in dataset folder: ['bbc_news.csv']


### Import Libraries

In [None]:
import pandas as pd
import nltk
import re
import gensim
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
import warnings
warnings.filterwarnings("ignore")

# Read the news CSV from the dataset folder into a DataFrame and preview it.
csv_path = os.path.join(path, "bbc_news.csv")
texts = pd.read_csv(csv_path)
texts.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


### Download NLTK Resources

In [None]:
# word_tokenize needs the Punkt tokenizer data and the stop-word filter needs
# the stopwords corpus. Newer NLTK releases look the tokenizer up under the
# 'punkt_tab' resource instead of the pickled 'punkt', so both are requested to
# keep the notebook working across NLTK versions. nltk.download() reports an
# unknown resource without raising, so the extra call is harmless on old versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Preprocessing

In [None]:
# English stop words, stored as a set for fast membership tests in preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Convert one raw description into a list of cleaned tokens.

    The text is lowercased, digit runs are removed, every character that is
    neither a word character nor whitespace is removed, and the result is
    tokenized with NLTK. English stop words and tokens of two characters or
    fewer are then discarded.

    Args:
        text: Raw description string. Any non-string value (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        list[str]: The surviving tokens in their original order; an empty
        list when the input is not a string or nothing survives filtering.
    """
    # Guard against missing cells: calling .lower() on a float NaN would raise.
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # numbers
    # Punctuation; note that \w also matches '_', so underscores are kept.
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return tokens

# Tokenize every description and keep the result next to the raw text.
description_tokens = texts['description'].apply(preprocess)
texts['tokens'] = description_tokens
texts.head()

Unnamed: 0,title,pubDate,guid,link,description,tokens
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...,"[ukrainian, president, says, country, forgive,..."
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as...","[jeremy, bowen, frontline, irpin, residents, c..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...,"[one, worlds, biggest, fertiliser, firms, says..."
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...,"[parents, manchester, arena, bombings, younges..."
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...,"[consumers, feeling, impact, higher, energy, c..."


### Prepare Dictionary & Corpus for LDA

In [None]:
# Build the token <-> id mapping from the tokenized descriptions.
token_lists = texts['tokens']
dictionary = corpora.Dictionary(token_lists)

# Prune the vocabulary: drop tokens that appear in fewer than 5 documents
# or in more than 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Encode each document as a bag-of-words over the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, token_lists))


### LDA Model

In [None]:
# Training settings gathered in one place so they are easy to scan and tweak.
lda_params = dict(
    corpus=corpus,          # bag-of-words documents
    id2word=dictionary,     # id -> token mapping used when printing topics
    num_topics=5,
    random_state=42,        # fixed seed for reproducible topics
    passes=10,
    alpha='auto',
)
lda_model = gensim.models.LdaModel(**lda_params)

### Display Topics

In [None]:
# Fetch the summary of every topic (-1 means "all topics"), then print one per line.
topic_summaries = lda_model.print_topics(num_topics=-1)
for idx, topic in topic_summaries:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.023*"says" + 0.007*"pay" + 0.007*"media" + 0.007*"take" + 0.006*"make" + 0.006*"members" + 0.006*"trump" + 0.006*"face" + 0.006*"number" + 0.006*"set"
Topic 1: 0.022*"bbc" + 0.021*"people" + 0.016*"say" + 0.015*"two" + 0.014*"new" + 0.013*"years" + 0.011*"could" + 0.010*"year" + 0.007*"last" + 0.006*"since"
Topic 2: 0.013*"party" + 0.013*"president" + 0.012*"election" + 0.012*"minister" + 0.012*"government" + 0.008*"new" + 0.008*"leader" + 0.008*"prime" + 0.008*"labour" + 0.007*"court"
Topic 3: 0.024*"says" + 0.012*"former" + 0.012*"police" + 0.011*"said" + 0.010*"died" + 0.008*"days" + 0.008*"three" + 0.007*"man" + 0.007*"one" + 0.006*"found"
Topic 4: 0.021*"first" + 0.017*"england" + 0.016*"world" + 0.012*"league" + 0.012*"manchester" + 0.012*"city" + 0.011*"win" + 0.010*"cup" + 0.008*"time" + 0.008*"day"


### Interactive Visualization

In [None]:
# Render the interactive pyLDAvis view inline in the notebook.
# pyLDAvis.gensim_models is already imported in the "Import Libraries" cell,
# so the duplicate import that used to sit here has been dropped.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis