In [11]:
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK data packages this notebook relies on: 'punkt' backs
# word_tokenize and 'stopwords' provides the English stop-word list.
# NOTE(review): newer NLTK releases reportedly load tokenizer data from a
# 'punkt_tab' package instead — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rahilshaik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rahilshaik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
def word_frequency_count(text, top_n=10):
    """Return the most frequent non-stop-word tokens of ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (punctuation, hyphenated forms, contractions)
    or that appear in NLTK's English stop-word list are discarded.

    Args:
        text: Raw input string.
        top_n: Maximum number of keywords to return. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        A list of at most ``top_n`` tokens ordered from most to least
        frequent. Ties keep first-seen order, as ``Counter.most_common`` does.
    """
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))

    # Count straight from a generator; no intermediate filtered list needed.
    word_freq = Counter(
        token
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

    return [word for word, _ in word_freq.most_common(top_n)]


In [13]:
def lda_topics(text, n_topics=5, n_top_words=10):
    """Collect the highest-weighted vocabulary terms of LDA topics fitted on ``text``.

    The text is split on ". " and each piece is treated as one document. A
    bag-of-words matrix is built with ``CountVectorizer`` and a
    ``LatentDirichletAllocation`` model with ``random_state=0`` is fitted to it.

    Args:
        text: Raw input string.
        n_topics: Number of LDA components to fit.
        n_top_words: Number of top-weighted terms taken from each topic.
            ``0`` yields no terms.

    Returns:
        A sorted list of the distinct terms gathered across all topics.
        Sorting makes the result independent of string hash randomization.

    NOTE(review): with ``min_df=2`` / ``max_df=0.95`` a very short text
    presumably leaves the vectorizer without a usable vocabulary and makes
    ``fit_transform`` raise — confirm and decide how callers should handle it.
    """
    sentences = text.split(". ")

    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    dtm = vectorizer.fit_transform(sentences)

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(dtm)

    # Look the vocabulary up once instead of once per selected index.
    feature_names = vectorizer.get_feature_names_out()

    lda_keywords = set()
    for topic in lda.components_:
        # Reverse-then-slice selects the same indices as [-n_top_words:] for
        # positive n, but correctly selects nothing when n_top_words == 0
        # (where [-0:] would have returned the whole vocabulary).
        top_indices = topic.argsort()[::-1][:n_top_words]
        lda_keywords.update(feature_names[i] for i in top_indices)

    return sorted(lda_keywords)


In [14]:
def intersect_keywords_and_topics(keywords, lda_keywords):
    """Return the keywords that also occur among the LDA topic terms.

    Args:
        keywords: Iterable of keyword strings, typically ranked by frequency.
        lda_keywords: Iterable of terms extracted from the LDA topics.

    Returns:
        A list of the distinct shared terms, in the order they first appear
        in ``keywords``. The result is therefore deterministic and keeps the
        keyword ranking instead of an arbitrary set ordering.
    """
    topic_terms = set(lda_keywords)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(kw for kw in keywords if kw in topic_terms))


In [16]:

# Toy two-sentence, pre-tokenized corpus used to build the Word2Vec model.
# NOTE(review): this vocabulary is unrelated to whatever text is later passed
# to process_text, so most keyword lookups will presumably miss the model and
# fall through to the 0 branch of cosine_sim — consider training on the
# analysed text or loading pretrained vectors.
sentences = [["gorbachev", "this", "is", "a", "sample", "sentence"],
             ["another", "example", "sentence", "for", "word2vec"]]

# Module-level model read by cosine_sim. min_count=1 is needed so that words
# occurring only once in this tiny corpus are kept in the vocabulary.
# NOTE(review): multi-worker training (workers=4) is presumably not
# bit-for-bit reproducible between runs — confirm against the gensim docs.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def cosine_sim(word1, word2):
    """Cosine similarity between the embeddings of two words.

    Both words are looked up in the module-level ``word2vec_model``. If either
    one is missing from its vocabulary, ``0`` is returned instead of raising.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``
        for the two word vectors, or ``0`` when a word is unknown.
    """
    vectors = word2vec_model.wv

    # Guard clause: an out-of-vocabulary word short-circuits to 0.
    if word1 not in vectors or word2 not in vectors:
        return 0

    similarity_matrix = cosine_similarity([vectors[word1]], [vectors[word2]])
    return similarity_matrix[0][0]

def cosine_similarity_filter(keywords, lda_keywords, threshold=0.75):
    """Pair up keywords and LDA terms whose embeddings are similar enough.

    Every (keyword, LDA term) combination is scored with ``cosine_sim``;
    combinations scoring at or above ``threshold`` are kept.

    Args:
        keywords: Iterable of keyword strings.
        lda_keywords: Iterable of LDA topic terms.
        threshold: Minimum similarity (inclusive) for a pair to be kept.

    Returns:
        A list of ``(keyword, lda_term)`` tuples, ordered by keyword first and
        LDA term second, following the iteration order of the inputs.
    """
    return [
        (keyword, topic_term)
        for keyword in keywords
        for topic_term in lda_keywords
        if cosine_sim(keyword, topic_term) >= threshold
    ]


In [17]:
def process_text(text):
    """Run the keyword pipeline on ``text``.

    Frequency-based keywords and LDA topic terms are extracted from the same
    text, then combined in two ways: their exact overlap, and the pairs that
    pass the embedding-similarity filter at its default threshold.

    Args:
        text: Raw input string.

    Returns:
        A 2-tuple ``(intersection, similar_words)`` holding the result of
        ``intersect_keywords_and_topics`` and of ``cosine_similarity_filter``.
    """
    frequent_terms = word_frequency_count(text)
    topic_terms = lda_topics(text)

    return (
        intersect_keywords_and_topics(frequent_terms, topic_terms),
        cosine_similarity_filter(frequent_terms, topic_terms),
    )