In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [2]:
df = pd.read_csv('news_dataset.csv')
texts = df['text'].dropna().tolist()


In [3]:
# Stopwords
stop_words = set(stopwords.words('english'))

# Lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic characters, lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

# Apply preprocessing to the texts
processed_texts = [preprocess_text(text) for text in texts]


In [4]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(processed_texts)
# Filter out extremes to limit the number of features
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Create a corpus: Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in processed_texts]


In [5]:
# Set parameters for LDA
num_topics = 4  # Number of topics

# Build LDA model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)


In [6]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)



Coherence Score:  0.5945879818788151


In [8]:
# Print the topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)


(0, '0.031*"key" + 0.016*"encryption" + 0.016*"chip" + 0.014*"db" + 0.012*"system" + 0.012*"use" + 0.010*"information" + 0.009*"privacy" + 0.009*"clipper" + 0.008*"file"')
(1, '0.013*"armenian" + 0.007*"administration" + 0.006*"enforcement" + 0.006*"two" + 0.005*"use" + 0.005*"block" + 0.005*"system" + 0.005*"state" + 0.005*"one" + 0.005*"start"')
(2, '0.198*"q" + 0.087*"x" + 0.081*"n" + 0.069*"e" + 0.049*"k" + 0.037*"c" + 0.035*"p" + 0.026*"f" + 0.024*"r" + 0.023*"g"')
(3, '0.013*"would" + 0.012*"people" + 0.010*"one" + 0.007*"government" + 0.007*"think" + 0.007*"know" + 0.006*"right" + 0.006*"say" + 0.006*"u" + 0.005*"could"')


In [None]:
#Despite the initial topic having a lot of noise, the topics of the LDA model show important patterns across the dataset. 
#Topic 1 likely to focus on legal and political matters, as indicated by terms like "people," "government," and "law." 
#Topic 2 is less formal and more conversational, with words like "would," "like," and "know," which allude to general discussions or opinions.
#Topic 3 is about technology and encryption, as evidenced by terms like "chip," "key," and "encryption." 
#The first subject may indicate preprocessing issues or non-standard material because it appears to include random characters and symbols.
#A moderate level of interpretability and coherence across the topics is indicated by the coherence score of 0.5347, suggesting that while the model has identified some pertinent themes, there is still room for improvement, especially in terms of addressing data noise.