In [26]:
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample document
document = """
Topic modeling is an interesting field. Latent Dirichlet Allocation is used for topic modeling.
Anomaly detection is crucial for identifying unusual patterns. LDA and anomaly detection can be combined for specific use cases.
"""

# Split the document into segments (sentences) after each full stop
segments = document.split('.')

# Tokenize and preprocess each segment
tokenized_segments = [segment.strip() for segment in segments if segment.strip()]

# Create a dictionary representation of the document
dictionary = corpora.Dictionary([tokenized_segments])

# Convert the tokenized document into a document-term matrix
doc_term_matrix = [dictionary.doc2bow(segment.split()) for segment in tokenized_segments]

# Build LDA model
lda_model = LdaModel(doc_term_matrix, num_topics=6, id2word=dictionary)

# Function to get the topic distribution of a segment
def get_topic_distribution(segment):
    bow_vector = dictionary.doc2bow(segment.split())
    topics = lda_model.get_document_topics(bow_vector)
    topic_distribution = np.zeros(lda_model.num_topics)

    for topic, prob in topics:
        topic_distribution[topic] = prob

    return topic_distribution

# Get the overall topic distribution of the document
overall_topic_distribution = np.mean([get_topic_distribution(segment) for segment in tokenized_segments], axis=0)

# Function to detect anomalies in a segment
def detect_anomaly(segment, threshold):
    segment_topic_distribution = get_topic_distribution(segment)
    similarity_score = cosine_similarity([segment_topic_distribution], [overall_topic_distribution])[0, 0]

    if similarity_score < threshold:
        return True
    else:
        return False

# Example usage
threshold = 0.9

for i, segment in enumerate(tokenized_segments):
    if detect_anomaly(segment, threshold):
        print(f"Anomaly detected in segment {i + 1}: {segment}")
    else:
        print(f"No anomaly detected in segment {i + 1}: {segment}")


No anomaly detected in segment 1: Topic modeling is an interesting field
No anomaly detected in segment 2: Latent Dirichlet Allocation is used for topic modeling
No anomaly detected in segment 3: Anomaly detection is crucial for identifying unusual patterns
No anomaly detected in segment 4: LDA and anomaly detection can be combined for specific use cases


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Document 0 is an anomaly.
Document 1 is an anomaly.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shivam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shivam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
