In [60]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

> Organize the text into two topics, then plot the cosine similarities between the vectorized keywords.

In [61]:
# Sample news headlines used as the input corpus: a group about Apple / the
# wider market followed by a group about Slack.
headlines = [
    "Stretched Growth Story Casts a Shadow on AAPL Stock",
    "Apple (NASDAQ:AAPL) Earning Somewhat Favorable Media Coverage, Analysis Shows",
    "Flat Affect: Markets Tread Water With Fed, Other Central Bank Meetings Ahead",
    "Yes, Apple Stock Can Make New Highs This Year",
    "U.S. Investment Analyst is Upbeat on Apple's upcoming Q3 Financials, citing strong service revenues & Growth in China",
    "Apple (AAPL) Outpaces Stock Market Gains: What You Should Know",
    "In Case The Bears Are Right About Apple",
    "Microsoft says its Teams app is bigger than Slack and growing faster",
    "The Slackification of the American Home",
    "Here are the biggest analyst calls of the day: Slack, Zoom, Morgan Stanley & more",
    "If Slack is so good, why are so many companies trying to fix it?",
]

# One headline per row, default integer index.
text = pd.Series(headlines)

In [62]:
def download_packages():
    """Download the NLTK ``wordnet`` package, bypassing SSL certificate checks.

    Some local Python installs cannot validate the download server's
    certificate, so the download is performed with an unverified HTTPS
    context. The override is applied only while ``nltk.download`` runs and the
    previous context factory is put back afterwards (also when the download
    raises), so certificate verification is not left disabled for the rest of
    the kernel session.

    Returns:
        None
    """
    import nltk
    import ssl

    _absent = object()
    # Remember whatever factory is currently installed so it can be restored.
    previous_factory = getattr(ssl, "_create_default_https_context", _absent)
    # Older Python builds may not expose the unverified factory; in that case
    # nothing is patched and the download uses the default behaviour.
    unverified_factory = getattr(ssl, "_create_unverified_context", None)

    if unverified_factory is not None:
        ssl._create_default_https_context = unverified_factory
    try:
        nltk.download("wordnet")
    finally:
        # Undo the override so later HTTPS requests verify certificates again.
        if unverified_factory is not None:
            if previous_factory is _absent:
                del ssl._create_default_https_context
            else:
                ssl._create_default_https_context = previous_factory

In [63]:
# Fetch the NLTK "wordnet" package up front; stem_func relies on
# WordNetLemmatizer, which needs that corpus to be installed locally.
download_packages()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pansproperties/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [64]:
# scikit-learn's English stop words extended with notebook-specific additions.
CUSTOM_STOPWORDS = ENGLISH_STOP_WORDS | frozenset({
    "like",  # add more stopwords here as needed
})

def stem_func(text):
    """Reduce a single word to its stem.

    The word is first lemmatized as a verb with ``WordNetLemmatizer`` and the
    resulting lemma is then stemmed with the English ``SnowballStemmer``.

    Args:
        text: The word to normalize.

    Returns:
        The stemmed form of the verb lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return SnowballStemmer('english').stem(verb_lemma)

def is_meaningful(token):
    """Tell whether ``token`` should be kept for topic modelling.

    A token is kept when it is not listed in ``CUSTOM_STOPWORDS`` and is
    longer than three characters.

    Args:
        token: A single token string.

    Returns:
        True if the token passes both checks, otherwise False.
    """
    return not (token in CUSTOM_STOPWORDS or len(token) <= 3)

def topic_prepare(text):
    """Tokenize a document and keep only the tokens worth modelling.

    The document is split with ``gensim.utils.simple_preprocess`` and each
    token is kept only if ``is_meaningful`` accepts it.

    Args:
        text: The raw document string.

    Returns:
        A list of the accepted tokens, in the order they were produced.
    """
    # NOTE(review): stem_func is defined alongside this function but is not
    # applied to the tokens here — confirm whether stemming was intended.
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if is_meaningful(word)
    ]

In [65]:
# Tokenize every headline. `text` is overwritten in place, so only entries that
# are still raw strings are processed: re-running this cell leaves existing
# token lists untouched instead of feeding them back into topic_prepare.
text = text.apply(
    lambda doc: topic_prepare(doc) if isinstance(doc, str) else doc
)

In [66]:
# Build the token <-> id vocabulary from the tokenized headlines.
dictionary = corpora.Dictionary(text)
# Prune the vocabulary: drop tokens found in more than half of the documents
# and keep at most 1000 tokens (no_below=1 imposes no lower bound in practice).
dictionary.filter_extremes(keep_n=1000, no_above=0.5, no_below=1)
# Despite its name, bow_dict is a list: one bag-of-words vector per document.
bow_dict = list(map(dictionary.doc2bow, text))

In [67]:
# Re-weight the bag-of-words vectors with TF-IDF, then fit LDA on the result.
tfidf = models.TfidfModel(bow_dict)
corpus_tfidf = tfidf[bow_dict]

# random_state seeds the model so the topics are repeatable across
# Restart & Run All; with multiple workers, process scheduling may still cause
# small run-to-run variation.
# NOTE(review): the notebook's stated goal mentions two topics, but the model
# is fitted with num_topics=5 — confirm which is intended.
lda_model = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [77]:
# List the top terms of topic 1 as (token_id, probability) pairs, one per line.
topic_terms = lda_model.get_topic_terms(1)
for term_id, probability in topic_terms:
    print((term_id, probability))

(59, 0.04039356)
(57, 0.04038282)
(55, 0.040380765)
(56, 0.04035519)
(58, 0.04034967)
(28, 0.032518283)
(50, 0.0279612)
(42, 0.013207874)
(8, 0.013198013)
(43, 0.013196349)
