In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import string

# Fetch the NLTK data used by this notebook: tokenizer models for word_tokenize
# and the stop-word lists read by stopwords.words('english').
# Newer NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'; both are requested so the notebook runs on a fresh environment with
# either an older or a newer NLTK install.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_document(document):
    """Normalize a raw text document into a space-joined string of stemmed tokens.

    The pipeline is: lowercase -> ``word_tokenize`` -> drop punctuation-only
    tokens -> drop English stop words -> Porter stemming -> join with spaces.

    Args:
        document: The raw document text (a ``str``).

    Returns:
        A single string containing the surviving, stemmed tokens separated by
        one space each. The string is empty when no token survives filtering.
    """
    # Tokenize the document (lowercased first so stop-word matching is
    # case-insensitive).
    tokens = word_tokenize(document.lower())

    # Remove punctuation. The check is per character rather than
    # `token in string.punctuation`: the latter is a substring test against one
    # long string, so multi-character punctuation tokens such as "...", "--",
    # or the "``" / "''" quote tokens the tokenizer can emit would slip through.
    punctuation_chars = set(string.punctuation)
    tokens = [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a preprocessed document
    preprocessed_document = ' '.join(tokens)

    return preprocessed_document


In [None]:

# Preprocess documents
# NOTE(review): raw_documents is not defined in this cell; it must already exist
# in the kernel (presumably an iterable of raw text strings) -- confirm that an
# earlier cell creates it so Restart & Run All works.
preprocessed_documents = [preprocess_document(doc) for doc in raw_documents]

# Train a Doc2Vec model
# Each document is tagged with its position, as a string; the same tags are
# reused below to fetch the trained vectors.
document_tags = [str(i) for i in range(len(preprocessed_documents))]
documents = [
    TaggedDocument(words=word_tokenize(doc), tags=[tag])
    for tag, doc in zip(document_tags, preprocessed_documents)
]
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=10)

# Get embeddings for preprocessed documents
# Look each vector up by the tag it was trained under. Indexing model.dv with a
# bare integer is a positional lookup, which only lines up with the string tags
# as long as the internal ordering happens to match the document order.
embeddings = [model.dv[tag] for tag in document_tags]

# Standardize the embeddings
scaler = StandardScaler()
embeddings_standardized = scaler.fit_transform(embeddings)

# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=2)
labels = dbscan.fit_predict(embeddings_standardized)

# Analyze clusters and anomalies
# The branch below reports documents labelled -1 as anomalies.
unique_labels = set(labels)
for i, label in enumerate(labels):
    if label == -1:
        print(f'Document {i} is an anomaly.')
    else:
        print(f'Document {i} belongs to cluster {label}.')
