#### LDA topic model for content categories in security policies

In [None]:
!pip install nltk==3.9.1

In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.gensim_models as gensimvisualize
from collections import defaultdict, Counter
import numpy as np
from sklearn.manifold import TSNE
import nltk
# Fetch every NLTK resource the preprocessing step relies on, not only the
# tokenizer models: word_tokenize needs 'punkt_tab', stopwords.words('english')
# needs 'stopwords', and WordNetLemmatizer needs 'wordnet'. Without the last
# two, preprocessing raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [3]:
def preprocess_text(text):
    """Turn a raw document into a list of cleaned, lemmatized tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, remove English stopwords, lemmatize with
    WordNet, and finally discard tokens of two characters or fewer.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas uses for an empty CSV cell) are treated as an empty
            document instead of raising on ``.lower()``.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation, digits and special characters
    tokens = word_tokenize(text)

    # Build the stopword set once per document. Calling stopwords.words()
    # inside the comprehension re-created the whole list for every token and
    # then searched it linearly.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Very short tokens carry little topical signal, so they are dropped last.
    return [word for word in tokens if len(word) > 2]

In [None]:
# Model selection: sweep the number of topics and compare the coherence and
# log-perplexity of the resulting LDA models.
data = pd.read_csv('dataset/Document_All_Category.csv')

# Clean, tokenize and lemmatize every document
data['processed_Document'] = data['Document'].apply(preprocess_text)
texts = data['processed_Document'].tolist()

# Vocabulary and bag-of-words corpus. no_below=2 keeps only tokens found in at
# least two documents; the remaining filter_extremes defaults still apply.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2)
corpus = list(map(dictionary.doc2bow, texts))

num_topics_list = range(2, 20)
coherence_scores = []
perplexity_scores = []

# One model per candidate topic count; both scores are recorded per model
for num_topics in num_topics_list:
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        chunksize=20,
        passes=100,
        iterations=200,
        eta=0.5,
        random_state=63,
    )

    # c_v coherence of the learned topics against the tokenized texts
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_scores.append(coherence_model.get_coherence())

    # Log perplexity, evaluated on the same corpus the model was trained on
    perplexity_scores.append(lda_model.log_perplexity(corpus))

# Draw one figure per metric: (values, legend label, colour, y-axis label, title)
plot_specs = (
    (coherence_scores, 'Coherence Score', 'b', 'Score',
     'Coherence Score vs. Number of Topics'),
    (perplexity_scores, 'Perplexity Score', 'r', 'Log Perplexity',
     'Perplexity Score vs. Number of Topics'),
)
for scores, series_label, colour, y_label, plot_title in plot_specs:
    plt.figure(figsize=(10, 6))
    plt.plot(num_topics_list, scores, marker='o', label=series_label, color=colour)
    plt.xlabel('# Topics')
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.xticks(num_topics_list)
    plt.grid()
    plt.legend()
    plt.show()


In [None]:
# Fit the final LDA model on the complete document set
data = pd.read_csv('dataset/Document_All_Category.csv')

data['processed_Document'] = data['Document'].apply(preprocess_text)
texts = data['processed_Document'].tolist()

# Build the vocabulary from the tokenized documents and prune it
# (no_below=2 keeps only tokens found in at least two documents)
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2)

# Report how much material goes into the model
total_words = sum(map(len, texts))
unique_words = len(dictionary)
print("Total words after preprocessing:", total_words)
print("Unique words after filtering:", unique_words)

# Bag-of-words representation of every document
corpus = list(map(dictionary.doc2bow, texts))

# Train the model with the chosen number of topics
num_topics = 6
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    chunksize=20,
    passes=100,
    iterations=200,
    eta=0.5,
    random_state=63,
)

# Show the 10 highest-weighted words of each topic
for topic in lda_model.print_topics(num_topics=num_topics, num_words=10):
    print(topic)

# c_v coherence of the fitted topics
coherence_model = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print('Coherence:', coherence_score)

# NOTE: log_perplexity returns a log-scale bound computed on the training
# corpus, so the number printed under 'Perplexity' is that log value.
perplexity_score = lda_model.log_perplexity(corpus)
print('Perplexity:', perplexity_score)

In [None]:
# Visualize documents in 2-D according to their topic mixture
# minimum_probability=0 asks for the weight of every topic, not just the
# dominant ones, so each document yields a full row.
topic_distributions = lda_model.get_document_topics(corpus, minimum_probability=0)

# Dense (documents x topics) matrix of topic weights
topic_matrix = np.zeros((len(topic_distributions), num_topics))
for doc_idx, doc_topics in enumerate(topic_distributions):
    for topic_id, weight in doc_topics:
        topic_matrix[doc_idx, topic_id] = weight

# Project the topic mixtures down to two dimensions
tsne_model = TSNE(n_components=2, random_state=42)
tsne_values = tsne_model.fit_transform(topic_matrix)

# The topic with the largest weight labels each document
dominant_topics = topic_matrix.argmax(axis=1)

# Coordinates plus dominant topic, one row per document
tsne_df = pd.DataFrame(tsne_values, columns=['x', 'y']).assign(topic=dominant_topics)

# Scatter plot coloured by dominant topic
plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    tsne_df['x'],
    tsne_df['y'],
    c=tsne_df['topic'],
    cmap='viridis',
    s=50,
    alpha=0.7,
)
plt.colorbar(label='Topic')
plt.title('t-SNE Clustering of LDA Topics')

plt.show()

# Legend in text form: top terms of each topic and the RGBA colour it is drawn in
for topic in range(num_topics):
    terms = [term for term, _ in lda_model.show_topic(topic, topn=10)]
    colour = scatter.cmap(scatter.norm(topic))
    print(f"Topic {topic}: {', '.join(terms)} (Color: {colour})")

In [None]:
# Ten highest-'Freq' terms of every category in the pyLDAvis topic table
dickens_visual = gensimvisualize.prepare(lda_model, corpus, dictionary, mds='mmds')
# To render the interactive view, run: pyLDAvis.display(dickens_visual)

# One group of rows per value of the 'Category' column
topic_info = dickens_visual.topic_info
topic_terms = topic_info.groupby('Category')

# Empty per-category containers
category_term_counts = {name: Counter() for name in topic_terms.groups}
category_term_details = {name: {} for name in topic_terms.groups}

top_terms_per_topic = {}

# Single pass: print the ten most frequent terms of each category and keep
# them as a set for later lookups.
for topic, group in topic_terms:
    sorted_words = group.sort_values(by='Freq', ascending=False)
    top_words = sorted_words[['Term', 'Freq']].head(10)

    print(f"Topic {topic}:")
    for word, freq in zip(top_words['Term'], top_words['Freq']):
        print(f"  {word} ({freq:.2f})")
    print("\n")

    top_terms_per_topic[topic] = set(top_words['Term'])

In [None]:
# Count how often each topic's top terms occur in the documents of every category
data = pd.read_csv('dataset/Document_4_Category.csv')
data['processed_Document'] = data['Document'].apply(preprocess_text)

# Ten highest-weighted terms of every topic of the fitted model: {topic_id: [term, ...]}
topic_terms = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)
top_terms_per_topic = {topic: [term for term, _ in words] for topic, words in topic_terms}

# Terms that appear in the top terms of more than one topic: {term: {topic_id, ...}}
term_topic_mapping = defaultdict(set)
for topic, terms in top_terms_per_topic.items():
    for term in terms:
        term_topic_mapping[term].add(topic)
multi_topic_terms = {term: topics for term, topics in term_topic_mapping.items() if len(topics) > 1}

print("Multi-topic terms:")
for term, topics in multi_topic_terms.items():
    print(f"{term}: Topics {sorted(topics)}")

# Tally the tokens of every document once, paired with the document's category.
# Whole tokens are counted on purpose: searching the stringified token list for
# a substring also counted a term inside longer tokens (e.g. "access" inside
# "accessible"). Tallying once also avoids re-scanning every document for every
# term.
document_token_counts = [
    (category, Counter(tokens))
    for category, tokens in zip(data['Category'], data['processed_Document'])
]

# Occurrences of multi-topic terms per category: {term: {category: count}}.
# Only non-zero counts are recorded, so a term that never occurs gets no row.
multi_topic_term_category_counts = defaultdict(lambda: defaultdict(int))
for term in multi_topic_terms:
    for category, token_counts in document_token_counts:
        count = token_counts[term]
        if count > 0:
            multi_topic_term_category_counts[term][category] += count

# Occurrences of the remaining (single-topic) terms:
#   category_term_counts[topic][category]        -> total over the topic's terms
#   category_term_details[topic][category][term] -> per-term breakdown
category_term_counts = defaultdict(Counter)
category_term_details = defaultdict(lambda: defaultdict(Counter))
for topic, top_terms in top_terms_per_topic.items():
    single_topic = [term for term in top_terms if term not in multi_topic_terms]
    for category, token_counts in document_token_counts:
        # Zero counts are kept so every term shows up in the breakdown
        term_counts = {term: token_counts[term] for term in single_topic}
        category_term_counts[topic][category] += sum(term_counts.values())
        category_term_details[topic][category].update(term_counts)

# Table of multi-topic terms: one row per term, one column per category, plus
# the list of topics the term belongs to
multi_topic_term_df = pd.DataFrame(multi_topic_term_category_counts).T.fillna(0)
multi_topic_term_df['Topics'] = multi_topic_term_df.index.map(lambda term: sorted(multi_topic_terms[term]))

# For each topic, the category in which its single-topic terms occur most often
topic_category_analysis = {}
for topic, category_counts in category_term_counts.items():
    if category_counts:
        top_category, occurrences = category_counts.most_common(1)[0]
        term_details = category_term_details[topic][top_category]
    else:
        top_category, occurrences, term_details = "None", 0, {}
    topic_category_analysis[topic] = {
        "Category": top_category,
        "Occurrences": occurrences,
        "Terms": term_details,
    }

print(multi_topic_terms)
display(multi_topic_term_df)
display(topic_category_analysis)