In [1]:
from gensim import corpora, models
from gensim.models import LsiModel
from gensim.models import TfidfModel
from gensim.matutils import corpus2csc
from gensim import similarities
import numpy as np

# Toy corpus: nine short documents, one string per document
documents = [
    "human interface computer",
    "survey user computer system response time",
    "eps user interface system",
    "system human system eps",
    "user response time",
    "trees",
    "graph trees",
    "graph minors trees",
    "graph minors survey",
]

# Lower-case every document and split it on whitespace to obtain its tokens
texts = [document.lower().split() for document in documents]

# Build the token <-> integer id mapping from the tokenized documents
dictionary = corpora.Dictionary(texts)

# Encode each tokenized document as a bag-of-words vector
corpus = list(map(dictionary.doc2bow, texts))

# Fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Fit a two-topic LSI model on the TF-IDF-weighted corpus
lsi_model = LsiModel(corpus=corpus_tfidf, num_topics=2, id2word=dictionary)

# Print every topic returned by the model, one per line
topics = lsi_model.print_topics()
for topic in topics:
    print(topic)


(0, '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"')
(1, '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')


In [2]:
# New text to compare against the corpus
new_text = "user computer system"

# Tokenize the same way the corpus was tokenized (lower-case, whitespace split)
# and convert to a bag-of-words vector using the existing dictionary
new_text_bow = dictionary.doc2bow(new_text.lower().split())

# Apply the TF-IDF model that was fitted on the corpus
new_text_tfidf = tfidf[new_text_bow]

# Project the TF-IDF vector into the LSI topic space
new_text_lsi = lsi_model[new_text_tfidf]

# Dense numpy copy of the query's topic vector (one entry per LSI topic)
new_text_lsi_dense = corpus2csc([new_text_lsi], num_terms=lsi_model.num_topics).toarray().flatten()

# Build the cosine-similarity index over the corpus documents.
# The LSI model was trained on the TF-IDF-weighted corpus and the query above is
# TF-IDF weighted as well, so the indexed documents must go through the same
# bow -> tfidf -> lsi pipeline. Indexing lsi_model[corpus] (raw counts) would
# compare vectors that were weighted differently.
# num_features is given explicitly so the index width always equals the number
# of LSI topics.
similarities_matrix = similarities.MatrixSimilarity(
    lsi_model[corpus_tfidf],
    num_features=lsi_model.num_topics,
)

# One similarity score per corpus document, in corpus order
similarities_list = similarities_matrix[new_text_lsi]
most_similar_index = np.argmax(similarities_list)

print(f"The new text is most similar to document {most_similar_index} in the corpus.")


The new text is most similar to document 4 in the corpus.
