In [1]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus (subset='all'), asking scikit-learn to strip
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Unpack the parts of the returned bunch that the rest of the notebook uses:
# raw post texts, integer class ids, and the class-id -> name lookup.
documents, labels, label_names = (
    newsgroups.data,
    newsgroups.target,
    newsgroups.target_names,
)

In [2]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK resources needed by preprocess_text are available locally:
# the stopword lists and the WordNet data used by the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps, in order: lowercase, drop digit runs, drop every character that is
    neither a word character nor whitespace, split on whitespace, discard
    English stopwords, lemmatize what is left with WordNet.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving lemmatized tokens joined by single spaces. An empty or
        all-stopword document yields an empty string.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove numbers first, then punctuation. Underscores survive because
    # they count as word characters for \w.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = text.split()

    # Build the stopword lookup ONCE per call, as a set. Calling
    # stopwords.words('english') inside the filter rebuilt the whole list for
    # every token and then scanned it linearly.
    # NOTE(review): punctuation is stripped before this check, so contractions
    # such as "don't" arrive as "dont" and are not matched by the stock list.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Remove stopwords and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Run every raw document through preprocess_text, keeping the original order
# so that positions still line up with `labels`.
preprocessed_docs = list(map(preprocess_text, documents))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
from gensim import corpora
from gensim.models import LdaModel

# Tokenize each preprocessed document once; the same token lists feed both the
# dictionary and the bag-of-words corpus below.
tokenized_docs = [doc.split() for doc in preprocessed_docs]

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(tokenized_docs)

# Filter out words that occur in fewer than 5 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# Train the LDA model. LDA training is stochastic, so a fixed random_state is
# passed to make the discovered topics reproducible across
# Restart Kernel -> Run All.
lda_model = LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Display the topics discovered (top 10 weighted words per topic).
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.021*"x" + 0.016*"file" + 0.013*"window" + 0.012*"image" + 0.010*"program" + 0.007*"use" + 0.006*"version" + 0.005*"get" + 0.005*"available" + 0.005*"application"')
(1, '0.008*"system" + 0.007*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.007*"use" + 0.007*"drive" + 0.006*"would" + 0.006*"one" + 0.006*"key" + 0.005*"card" + 0.005*"computer" + 0.005*"data"')
(2, '0.009*"would" + 0.009*"one" + 0.009*"people" + 0.007*"god" + 0.006*"dont" + 0.006*"think" + 0.006*"say" + 0.005*"know" + 0.005*"like" + 0.004*"thing"')
(3, '0.010*"game" + 0.007*"year" + 0.006*"car" + 0.006*"get" + 0.006*"team" + 0.006*"one" + 0.005*"like" + 0.005*"would" + 0.005*"good" + 0.004*"dont"')
(4, '0.005*"armenian" + 0.005*"state" + 0.005*"year" + 0.004*"u" + 0.004*"q" + 0.004*"space" + 0.003*"said" + 0.003*"people" + 0.003*"one" + 0.003*"president"')


In [5]:
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to inline notebook rendering before building the view.
pyLDAvis.enable_notebook()

# Build the visualisation data from the trained model, the bag-of-words corpus
# it was trained on, and the matching dictionary.
lda_vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)

# Last expression of the cell: renders the interactive topic view.
pyLDAvis.display(lda_vis)

In [8]:
import spacy

from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained word embeddings model
nlp = spacy.load('en_core_web_md')

# Convert documents to vectors. nlp.pipe streams the texts through the
# pipeline in batches, which is spaCy's documented way to process many texts
# and avoids the per-call overhead of invoking nlp(doc) once per document.
doc_vectors = [doc.vector for doc in nlp.pipe(preprocessed_docs)]

# Measure similarity between documents: entry [i, j] is the cosine similarity
# between the vectors of documents i and j.
# NOTE(review): the result is a dense n_docs x n_docs matrix; confirm it fits
# in memory before running this on the full corpus.
similarity_matrix = cosine_similarity(doc_vectors)
print(similarity_matrix)

  and should_run_async(code)


[[0.99999964 0.59445596 0.7807321  ... 0.67396367 0.7025887  0.77021724]
 [0.59445596 0.99999976 0.6425032  ... 0.67173135 0.5765346  0.7189265 ]
 [0.7807321  0.6425032  1.0000004  ... 0.6511085  0.7892355  0.88690454]
 ...
 [0.67396367 0.67173135 0.6511085  ... 1.         0.7206993  0.75065863]
 [0.7025887  0.5765346  0.7892355  ... 0.7206993  0.99999994 0.878264  ]
 [0.77021724 0.7189265  0.88690454 ... 0.75065863 0.878264   0.9999998 ]]
