# Using LDA to conduct topic modelling on Coursera data

<strong>The descriptions and summaries will be aggregated into a bag-of-words and the LDA model from Gensim will be used to generate n topics for each program (decided using grid search).</strong>

## (1) Import libraries and coursera data into notebook

In [1]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel

import numpy as np


In [9]:
# Step 1: Extract and preprocess documents
documents = []
for i in range(475):  # Your 475 files
    file_path = f"../corpus/program{i}.txt"
    with open(file_path, 'r') as file:
        content = file.read()
        # Extract content between DOC tags
        docs = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
        documents.extend(docs)

## (2) Cleaning and preprocessing text to create dictionary for LDA model

In [3]:
nlp = spacy.load("en_core_web_sm")

In [12]:
coursera_stopwords = { # extension of the stop words - for now this is all I have, but feel free to add on more 
    "\n", "course", "program", "learn", "learning", "outcome", "outcomes",
    "description", "skill", "skills", "module", "modules", "specialization",
    "specialisation", "certificate", "certificates", "certificate,", "certificates,",
    "professional", "career", "opportunity", "opportunities", "project", "projects", "work", "experience", "experiences"
}
STOP_WORDS = STOP_WORDS.union(coursera_stopwords)

def preprocess_text(text: str):
    '''
    preprocessing 
    1. lowercase conversion 
    2. removing nonalphanum
    3. stopwords, puncutation, short token and frequent short token removal
    '''
    doc = nlp(text)
    tokens = []
    for token in doc:
        if (token.lemma_ not in STOP_WORDS):
            tokens.append(token.lemma_)
    return tokens

In [15]:
# creating a dictionary for the LDA model
docs = [preprocess_text(doc) for doc in documents]
dictionary_prog = corpora.Dictionary(docs)
dictionary_prog.filter_extremes(no_below=5, no_above=0.5)

In [16]:
# creating BOW corpus
bow_corpus_prog = [dictionary_prog.doc2bow(doc) for doc in docs]
print(f"Number of unique tokens in dictionary: {len(dictionary_prog)}")
# print(bow_corpus_prog)
# print(bow_corpus_prog[0])
print("Example BOW for the first document:", bow_corpus_prog[0][:5])


Number of unique tokens in dictionary: 6048
Example BOW for the first document: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


## (3) Calculating coherence score + Best number of topics through grid-search 

In [18]:
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """
    Computes c_v coherence for various values of num_topics.
    
    Returns:
        model_list: List of trained LdaModel
        coherence_values: Coherence values corresponding to the models
    """
    coherence_values = []
    model_list = []
    
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=42,
            passes=10,       # tweak for more stable training
            alpha='auto',    # auto tuning of alpha by gensim
            per_word_topics=True
        )
        model_list.append(model)
        
        # calcuate coherence score
        coherencemodel = CoherenceModel(
            model=model, 
            texts=texts, 
            dictionary=dictionary,
            coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
    
    return model_list, coherence_values

start, limit, step = 5, 26, 5  
model_list, coherence_values = compute_coherence_values(
    dictionary_prog, 
    bow_corpus_prog, 
    docs, 
    start, 
    limit, 
    step
)

# identifying best coherence
best_index = np.argmax(coherence_values)
optimal_num_topics = range(start, limit, step)[best_index]
best_model = model_list[best_index]
best_coherence = coherence_values[best_index]
print("Coherence Values:", coherence_values)
print(f"Best number of topics: {optimal_num_topics} with Coherence = {best_coherence:.4f}")


Coherence Values: [0.3778542942059359, 0.45786182162013944, 0.48045017530713086, 0.48176933293730817, 0.45535870266646294]
Best number of topics: 20 with Coherence = 0.4818


## (4) Viewing top words for each topic

In [9]:
for idx in range(optimal_num_topics):
    terms = best_model.get_topic_terms(idx, topn=10)
    term_words = [dictionary_prog[term_id] for term_id, _ in terms]
    print(f"\nTopic {idx} top words: {term_words}")



Topic 0 top words: ['machine', 'programming', 'build', 'model', 'image', 'datum', 'application', 'java', 'language', 'tensorflow']

Topic 1 top words: ['application', 'web', 'design', 'build', 'create', 'technology', 'blockchain', 'learner', 'develop', 'game']

Topic 2 top words: ['datum', 'data', 'science', 'analysis', 'create', 'python', 'tool', 'database', 'analyze', 'sql']

Topic 3 top words: ['marketing', 'create', 'social', 'digital', 'business', 'product', 'strategy', 'music', 'brand', 'practice']

Topic 4 top words: ['design', 'business', 'job', 'new', 'create', 'management', 'analytic', 'customer', 'product', 'tool']

Topic 5 top words: ['cloud', 'google', 'network', 'cybersecurity', 'handson', 'security', 'lab', 'certification', 'new', 'engineer']

Topic 6 top words: ['health', 'design', 'healthcare', 'patient', 'learner', 'care', 'develop', 'human', 'new', 'field']

Topic 7 top words: ['datum', 'learner', 'business', 'model', 'analysis', 'complete', 'new', 'teach', 'techniq