# Lab Assignment 3

1. Alyaa Natasha (SW01081000)
2. Putri Qistina (SW01081178)

#### Import the necessary libraries

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#Load the dataset
df = pd.read_csv('news_dataset.csv')
df['text'] = df['text'].astype(str).fillna('')

# Remove rows with null values in 'text' column
df = df.dropna(subset=['text'])

#Read the data (use only the ‘text’ column)
documents = df['text'].tolist()

#### Perform text pre-processing

In [3]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents]
print(preprocessed_documents[0])

['wonder', 'anyon', 'could', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'look', 'late', 'earli', '70', 'call', 'bricklin', 'door', 'realli', 'small', 'addit', 'front', 'bumper', 'separ', 'rest', 'bodi', 'know', 'anyon', 'tellm', 'model', 'name', 'engin', 'spec', 'year', 'product', 'car', 'made', 'histori', 'whatev', 'info', 'funki', 'look', 'car', 'plea']


#### Perform LDA using Gensim

In [4]:
#create a gensim dictionary object from the preprocessed doc
dictionary = corpora.Dictionary(preprocessed_documents)

#filter out tokens that appear in less than 15 doc or more than 50% of the doc
dictionary.filter_extremes(no_below=15, no_above=0.5)

#convert each preprocessed document into a BoW representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [5]:
#Run LDA
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [6]:
#empty list to store dominiant topic labels for each doc
article_labels = []

#iterate over each preprocessed document
for i, doc in enumerate(preprocessed_documents):
    #for each doc, convert to box representation
    bow = dictionary.doc2bow(doc)
    
    #get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    
    #determine the topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    
    #append to the list
    article_labels.append(dominant_topic)

#### Interpret the result 

In [7]:
#create dataframe
df_result = pd.DataFrame({"Article":documents, "Topic":article_labels})

#print dataframe
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      0
1      I recently posted an article asking what kind ...      0
2      \nIt depends on your priorities.  A lot of peo...      0
3      an excellent automatic can be found in the sub...      0
4      : Ford and his automobile.  I need information...      0
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      1
11310  Hi !\n\nI am interested in the source of FEAL ...      1
11311  The actual algorithm is classified, however, t...      1
11312  \n\tThis appears to be generic calling upon th...      0
11313  \nProbably keep quiet and take it, lest they g...      0

[11314 rows x 2 columns]



In [8]:
#print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()

Top terms for Topic #0:
['would', 'get', 'one', 'go', 'like', 'know', 'think', 'time', 'good', 'could']

Top terms for Topic #1:
['use', 'key', 'file', 'x', 'system', 'encrypt', 'program', 'chip', 'window', 'db']

Top terms for Topic #2:
['1', '0', 'q', 'max', 'x', '2', 'g', 'r', '7', 'p']

Top terms for Topic #3:
['peopl', 'would', 'one', 'govern', 'say', 'god', 'think', 'state', 'law', 'right']

Top terms for Topic #4:
['game', 'team', '1', '2', 'year', 'play', '3', 'new', '4', '10']



In [9]:
#print the top terms for each topic with weight
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "would" (weight: 0.012)
- "get" (weight: 0.012)
- "one" (weight: 0.012)
- "go" (weight: 0.011)
- "like" (weight: 0.010)
- "know" (weight: 0.009)
- "think" (weight: 0.008)
- "time" (weight: 0.007)
- "good" (weight: 0.006)
- "could" (weight: 0.006)

Topic 1:
- "use" (weight: 0.020)
- "key" (weight: 0.011)
- "file" (weight: 0.009)
- "x" (weight: 0.009)
- "system" (weight: 0.009)
- "encrypt" (weight: 0.007)
- "program" (weight: 0.007)
- "chip" (weight: 0.006)
- "window" (weight: 0.006)
- "db" (weight: 0.005)

Topic 2:
- "1" (weight: 0.066)
- "0" (weight: 0.060)
- "q" (weight: 0.059)
- "max" (weight: 0.057)
- "x" (weight: 0.052)
- "2" (weight: 0.041)
- "g" (weight: 0.036)
- "r" (weight: 0.036)
- "7" (weight: 0.033)
- "p" (weight: 0.030)

Topic 3:
- "peopl" (weight: 0.010)
- "would" (weight: 0.008)
- "one" (weight: 0.008)
- "govern" (weight: 0.006)
- "say" (weight: 0.006)
- "god" (weight: 0.005)
- "think" (weight: 0.005)
- "state" (weight: 0.005)
- "law" 

#### Evaluate the LDA model using Coherence score 

In [10]:
# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.6301
