In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Download the NLTK resources this notebook relies on:
#   stopwords -> stopwords.words('english')
#   punkt     -> tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables that recent NLTK releases look up instead of
#                'punkt'; without it word_tokenize raises LookupError there
#   wordnet   -> WordNetLemmatizer
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
# Read the news articles from the CSV file into a DataFrame
csv_path = 'news_dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Keep just the 'text' column, drop rows where it is missing,
# and turn the remaining values into a plain Python list
text_column = data['text']
documents = text_column.dropna().tolist()

# Text Pre-Processing

In [None]:
# Shared resources for text pre-processing:
# a stemmer, a lemmatizer, and the set of English stopwords to discard
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)  # set for fast membership tests

def preprocess_text(text):
    """Tokenize and normalize one document into a list of tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, that consist only of digits, or that appear in
    ``stop_words`` are discarded. Every remaining token is lemmatized and
    then stemmed.

    Args:
        text: Raw document text as a string.

    Returns:
        list[str]: The normalized tokens, in their original order.
    """
    normalized = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation / mixed-symbol tokens and pure numbers
        if not token.isalnum() or token.isdigit():
            continue
        # Drop stopwords
        if token in stop_words:
            continue
        # Lemmatize BEFORE stemming: the WordNet lemmatizer works on real
        # word forms, so feeding it already-truncated stems (the previous
        # order) left most tokens unchanged and made the step ineffective.
        lemma = lemmatizer.lemmatize(token)
        normalized.append(stemmer.stem(lemma))
    return normalized

# Run the pre-processing pipeline over each document, keeping document order
preprocessed_documents = list(map(preprocess_text, documents))

# Document-Term Matrix

In [None]:
# Build the Gensim Dictionary from the tokenized documents
dictionary = corpora.Dictionary(preprocessed_documents)

# Turn every tokenized document into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

# LDA (Latent Dirichlet Allocation) Topic Model

In [None]:
# Model hyper-parameters, named so they are easy to find and tune
NUM_TOPICS = 4
NUM_PASSES = 15
RANDOM_STATE = 42  # fixed seed: LDA training is stochastic, so without it
                   # topics and labels differ on every run

# Train an LDA model on the bag-of-words corpus using Gensim's LdaModel class
lda_model = LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=RANDOM_STATE,
)

# Calculate the C_V coherence score of the trained topics
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=preprocessed_documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

# Results

In [None]:
# Dominant topic label for each document.
# The bag-of-words vectors already stored in `corpus` (one per entry of
# `preprocessed_documents`, same order) are reused instead of being rebuilt,
# and the unused loop index is gone.
article_labels = [
    # get_document_topics yields (topic_id, probability) pairs;
    # keep the id of the most probable topic
    max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in corpus
]

# Create DataFrame pairing every article with its dominant topic
df = pd.DataFrame({"The Article": documents, "The Topic": article_labels})

# Print the DataFrame
print("Table with the Articles and Topic:")
print(df)
print()

In [None]:
# Print the top terms for every topic
print("Top Terms for Each Topic:")
# Request structured (word, weight) pairs rather than parsing the formatted
# 'weight*"word" + ...' string, which breaks if that text layout ever changes.
# num_topics=-1 selects every topic; num_words=10 matches the previous default.
for idx, top_terms in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    print(f"Topic {idx}:")
    for word, weight in top_terms:
        # Same text as before: quoted word, weight with three decimals
        print(f'-"{word}" (weight: {weight:.3f})')
    print()

# Display the coherence score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')