# Topic Modeling with Crawled Reddit Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path


from tqdm import tqdm

tqdm.pandas()

import json

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import spacy

%load_ext autoreload
%autoreload 2

In [2]:
random_state = 6471

In [3]:
data_dir = Path.cwd() / 'data'
crawled_reddit_data_dir = data_dir / 'reddit' / 'crawled'
processed_reddit_data_dir = data_dir / 'reddit' / 'processed'
processed_submissions_file = processed_reddit_data_dir / 'df_submissions.csv'
processed_comments_file = processed_reddit_data_dir / 'df_comments.csv'
temp_dir = Path.cwd() / 'temp'

for d in [data_dir, crawled_reddit_data_dir, processed_reddit_data_dir, temp_dir]:
    if not d.exists():
        d.mkdir()

In [4]:
# load the data
all_submissions =[]
all_comments = []

crawled_data_files = list(crawled_reddit_data_dir.glob("*.json"))
for data_file in tqdm(crawled_data_files):
    data_file: Path
    data = json.load(data_file.open(mode='r', encoding='utf-8'))

    submission = data['submission']['data']
    comments = map(lambda c: c['data'], data['comments'])
    all_submissions.append(submission)
    all_comments.extend(comments)
pd.DataFrame(all_submissions).to_csv(processed_submissions_file)
pd.DataFrame(all_comments).to_csv(processed_comments_file)

100%|██████████| 16312/16312 [00:46<00:00, 348.63it/s] 


In [5]:
df_submissions = pd.read_csv(
    processed_submissions_file,
    usecols=['author', 'title', 'name', 'created_utc', 'num_comments', 'score', 'subreddit_name_prefixed', 'subreddit_subscribers', 'selftext'],
    parse_dates=['created_utc'],
    infer_datetime_format=True,
    keep_date_col=True
)
df_submissions['created_date_time'] = pd.to_datetime(df_submissions['created_utc'], unit='s')
df_submissions

Unnamed: 0,selftext,title,subreddit_name_prefixed,name,score,author,num_comments,subreddit_subscribers,created_utc,created_date_time
0,"This is the best tl;dr I could make, [original...",Why no one really quits Google or Facebook,r/autotldr,t3_anarph,1,autotldr,0,17846,1549342875.0,2019-02-05 05:01:15
1,,Privacy Policy - Router-LoginHelp | Online Sup...,r/technology,t3_anatdu,1,robbiewill4,0,11787248,1549343212.0,2019-02-05 05:06:52
2,,Privacy Policy - Shivam Autozone | Maruti Suzu...,u/eflintstop,t3_anbhip,1,eflintstop,0,0,1549348620.0,2019-02-05 06:37:00
3,,Privacy Policy - Shivam Nexa | Maruti Suzuki C...,u/eflintstop,t3_anbl9b,1,eflintstop,0,0,1549349533.0,2019-02-05 06:52:13
4,[removed],Top 10 Profitable Niche blog to Start in 2019,r/fantasywriters,t3_anc1ww,1,[deleted],1,413969,1549354085.0,2019-02-05 08:08:05
...,...,...,...,...,...,...,...,...,...,...
16307,I'm trying to calculate my benefits and it's v...,Does anyone know how I'm supposed to calculate...,r/IBEW,t3_rlpz79,5,ImLikeAnOuroboros,0,27624,1640125887.0,2021-12-21 22:31:27
16308,"First, I want to state for the record that I h...",VHF propagation from aircraft at 130 MHZ: An o...,r/rfelectronics,t3_rlqcm2,35,50_Year_Plan_Bro,8,22455,1640126973.0,2021-12-21 22:49:33
16309,[removed],[WTB] Few Hundred Peace/Morgans,r/Coins4Sale,t3_rlqlvg,5,LawStudentAndrew,1,23934,1640127741.0,2021-12-21 23:02:21
16310,I'm a former researcher who moved to a new job...,A few questions on the practicals of consulting,r/bioinformatics,t3_rlqshy,2,andrewrgross,1,76059,1640128282.0,2021-12-21 23:11:22


## Submissions with content

In [6]:
mask_contented_submissions = (df_submissions['selftext'] != '') & (df_submissions['selftext'] != '[deleted]') & (df_submissions['selftext'] != '[removed]') & (df_submissions['selftext'].notna())
df_contented_submissions = df_submissions[mask_contented_submissions]
df_contented_submissions

Unnamed: 0,selftext,title,subreddit_name_prefixed,name,score,author,num_comments,subreddit_subscribers,created_utc,created_date_time
0,"This is the best tl;dr I could make, [original...",Why no one really quits Google or Facebook,r/autotldr,t3_anarph,1,autotldr,0,17846,1549342875.0,2019-02-05 05:01:15
5,The OATH platform allows the contracting parti...,OATHPROTOCOL PLATFORM,r/IcoInvestor,t3_anc631,24,Aflozy25,0,8430,1549355304.0,2019-02-05 08:28:24
7,"So, I love the hotel I work at. It's part of a...",I just love when random people come to chill i...,r/TalesFromTheFrontDesk,t3_andebh,89,1992kisy,20,421940,1549367891.0,2019-02-05 11:58:11
13,"This is the best tl;dr I could make, [original...",Record profits put new bull’s-eye on tech giants,r/autotldr,t3_ang3wk,1,autotldr,0,17846,1549386040.0,2019-02-05 17:00:40
14,"(Note that I'm using the /r/reddit.com method,...",The canned text responses to reports are getti...,r/ModSupport,t3_angwd2,67,reseph,24,58482,1549390429.0,2019-02-05 18:13:49
...,...,...,...,...,...,...,...,...,...,...
16306,Same pay in terms of salary. Just worried abou...,need some advice here. Is it a bad move to go ...,r/ITCareerQuestions,t3_rlpz6g,3,texasgrabem,8,279847,1640125885.0,2021-12-21 22:31:25
16307,I'm trying to calculate my benefits and it's v...,Does anyone know how I'm supposed to calculate...,r/IBEW,t3_rlpz79,5,ImLikeAnOuroboros,0,27624,1640125887.0,2021-12-21 22:31:27
16308,"First, I want to state for the record that I h...",VHF propagation from aircraft at 130 MHZ: An o...,r/rfelectronics,t3_rlqcm2,35,50_Year_Plan_Bro,8,22455,1640126973.0,2021-12-21 22:49:33
16310,I'm a former researcher who moved to a new job...,A few questions on the practicals of consulting,r/bioinformatics,t3_rlqshy,2,andrewrgross,1,76059,1640128282.0,2021-12-21 23:11:22


In [7]:
df_contented_submissions['selftext']

0        This is the best tl;dr I could make, [original...
5        The OATH platform allows the contracting parti...
7        So, I love the hotel I work at. It's part of a...
13       This is the best tl;dr I could make, [original...
14       (Note that I'm using the /r/reddit.com method,...
                               ...                        
16306    Same pay in terms of salary. Just worried abou...
16307    I'm trying to calculate my benefits and it's v...
16308    First, I want to state for the record that I h...
16310    I'm a former researcher who moved to a new job...
16311    I'm going to post the short form TL;DR of the ...
Name: selftext, Length: 9946, dtype: object

### TF-IDF Features & Non-negative Matrix Factorization

In [8]:
tf_idf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english'
)
tf_idf_features = tf_idf_vectorizer.fit_transform(df_contented_submissions['selftext'])

In [9]:
nmf: NMF = NMF(
    n_components=20,
    random_state=random_state,
    alpha_W=0.1,
    alpha_H=0.1,
    l1_ratio=0.5,
)
nmf.fit(tf_idf_features)



NMF(alpha_H=0.1, alpha_W=0.1, l1_ratio=0.5, n_components=20, random_state=6471)

#### Also fit a model wit KL divergence

In [10]:
nmf_kl: NMF = NMF(
    n_components=20,
    random_state=random_state,
    alpha_W=0.1,
    alpha_H=0.1,
    l1_ratio=0.5,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=1000
)
nmf_kl.fit(tf_idf_features)



NMF(alpha_H=0.1, alpha_W=0.1, beta_loss='kullback-leibler', l1_ratio=0.5,
    max_iter=1000, n_components=20, random_state=6471, solver='mu')

### Raw Count with LDA

In [11]:
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english'
)
bow_count = count_vectorizer.fit_transform(df_contented_submissions['selftext'])

In [12]:
lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=20,
    learning_method='online',
    learning_offset=50.0,
    random_state=random_state
)
lda.fit(bow_count)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=20, n_components=20, random_state=6471)

## Plotting Topics

In [13]:
tf_idf_feature_names = tf_idf_vectorizer.get_feature_names_out()
count_feature_names = count_vectorizer.get_feature_names_out()

In [14]:
rankings_nmf_kl = nmf_kl.components_.argsort(axis=1)    # sort each row (word) by the least assigned word to the most assigned word
rankings_nmf_kl = rankings_nmf_kl[:, -20:]  # grab the most frequent words for each topic
for topic_index, word_ranking in enumerate(rankings_nmf_kl):
    top_words_str: str = ', '.join([tf_idf_feature_names[w] for w in reversed(word_ranking)])
    print(f'Topic {topic_index + 1} top words: {top_words_str}')

Topic 1 top words: https, com, privacy, www, policy, service, like, data, just, x200b, people, time, account, terms, market, information, use, new, reddit, don
Topic 2 top words: youtube, floor, friend, freedom, free, franchise, forward, format, form, forecast, force, food, following, follow, focus, fix, features, fit, fine, financial
Topic 3 top words: 2026, youtube, fully, friend, freedom, free, franchise, forward, format, form, forecast, force, food, following, follow, focus, floor, fix, fit, fine
Topic 4 top words: youtube, floor, friend, freedom, free, franchise, forward, format, form, forecast, force, food, following, follow, focus, fix, features, fit, fine, financial
Topic 5 top words: oddity, truth, letter, society, freedom, evidence, fact, wants, words, believe, mantra, claims, self, true, claim, say, human, people, political, consider
Topic 6 top words: sbcglobal, quickbooks, aol, 958, 877, att, toll, antivirus, mail, password, enterprisenical, reset, tap, specialists, number

## Spacy processing

In [15]:
nlp = spacy.load('en_core_web_sm')

In [16]:
spacy_preprocessed_selftext = df_contented_submissions['selftext'].progress_apply(nlp)
spacy_preprocessed_selftext

100%|██████████| 9946/9946 [20:02<00:00,  8.27it/s]  


0        (This, is, the, best, tl;dr, I, could, make, ,...
5        (The, OATH, platform, allows, the, contracting...
7        (So, ,, I, love, the, hotel, I, work, at, ., I...
13       (This, is, the, best, tl;dr, I, could, make, ,...
14       ((, Note, that, I, 'm, using, the, /r, /, redd...
                               ...                        
16306    (Same, pay, in, terms, of, salary, ., Just, wo...
16307    (I, 'm, trying, to, calculate, my, benefits, a...
16308    (First, ,, I, want, to, state, for, the, recor...
16310    (I, 'm, a, former, researcher, who, moved, to,...
16311    (I, 'm, going, to, post, the, short, form, TL;...
Name: selftext, Length: 9946, dtype: object

In [18]:
spacy_preprocessed_selftext: pd.Series
spacy_preprocessed_selftext.to_pickle(temp_dir / 'spacy_preprocessed_selftext.pkl')

In [29]:
df_contented_submissions['preprocessed_selftext'] = spacy_preprocessed_selftext.progress_apply(lambda t: t.text)
df_contented_submissions['preprocessed_selftext']

100%|██████████| 9946/9946 [00:04<00:00, 2420.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0        This is the best tl;dr I could make, [original...
5        The OATH platform allows the contracting parti...
7        So, I love the hotel I work at. It's part of a...
13       This is the best tl;dr I could make, [original...
14       (Note that I'm using the /r/reddit.com method,...
                               ...                        
16306    Same pay in terms of salary. Just worried abou...
16307    I'm trying to calculate my benefits and it's v...
16308    First, I want to state for the record that I h...
16310    I'm a former researcher who moved to a new job...
16311    I'm going to post the short form TL;DR of the ...
Name: preprocessed_selftext, Length: 9946, dtype: object

In [59]:
count_vectorizer_spacy_text = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    max_df=0.95,
    min_df=3,
    max_features=3000,
    # vocabulary=nlp.vocab.strings
)

bow_spacy_text = count_vectorizer_spacy_text.fit_transform(df_contented_submissions['preprocessed_selftext'])

In [60]:
lda_spacy = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    max_iter=20,
    verbose=1,
    n_jobs=-1,
    random_state=random_state
)
lda_spacy.fit(bow_spacy_text)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    1.7s remaining:   16.2s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    1.8s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 1 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 2 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 3 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 4 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 5 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 6 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 7 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 8 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 9 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 10 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 11 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 12 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 13 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 14 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 15 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 16 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 17 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 18 of max_iter: 20


[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_j

iteration: 19 of max_iter: 20


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

iteration: 20 of max_iter: 20


[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.2s remaining:    2.4s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.3s finished


LatentDirichletAllocation(learning_method='online', max_iter=20,
                          n_components=20, n_jobs=-1, random_state=6471,
                          verbose=2)

In [63]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [64]:
pyLDAvis.sklearn.prepare(
    lda_spacy,
    bow_spacy_text,
    count_vectorizer_spacy_text
)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    0.2s remaining:    2.4s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    0.3s finished


## Topic Modeling using BERTopic

In [33]:
from bertopic import BERTopic

In [54]:
topic_model = BERTopic(
    language='english',
    top_n_words=10,
    n_gram_range=(1, 1),
    # nr_topics='auto'
)
topic_model.fit(df_contented_submissions['preprocessed_selftext'].tolist())

<bertopic._bertopic.BERTopic at 0x24ee6aa7688>

In [55]:
topics, probs = topic_model.transform(df_contented_submissions['preprocessed_selftext'].tolist())

In [56]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,4003,-1_the_and_of_to
1,0,206,0_google_information_privacy_app
2,1,194,1_data_information_privacy_personal
3,2,165,2_oddity_odditys_that_its
4,3,101,3_post_posts_content_subreddit
...,...,...,...
195,194,10,194_ml_data_engineer_isye
196,195,10,195_chats_cliques_everdale_views4
197,196,10,196_her_ellie_she_magan
198,197,10,197_ps4_psn_manager_family


In [57]:
topic_model.visualize_topics()

In [58]:
topic_model.visualize_barchart()