## Following the workflow from https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#5.-Build-the-Topic-Model

In [1]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [4]:
#!pip install nltk



In [6]:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import STOPWORDS

# NLTK
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK corpora this notebook needs at runtime:
#   - 'stopwords' backs stopwords.words('english') below
#   - 'wordnet' backs WordNetLemmatizer, which the preprocessing step calls;
#     without it a fresh environment raises LookupError on the first lemmatize().
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
# Corpus-specific terms (subreddit name, URL fragments, HTML-entity residue)
stop_words.extend(['work', 'job', 'www', 'com', 'https', 'antiwork', 'amp'])

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Adding some custom stop words, since these words contributed to a topic that didn't make much sense.

In [7]:
# Gensim's built-in stop words extended with corpus-specific terms
my_stop_words = STOPWORDS.union({
    'work', 'job', 'www', 'com', 'https', 'antiwork', 'amp',
    'delete', 'post', 'working', 'worked', 'works', 'worker',
})

In [8]:
print(my_stop_words)

frozenset({'computer', 'seems', 'thereafter', 'than', 'thereupon', 'must', 'only', 'by', 'beforehand', 'themselves', 'were', 'anything', 'empty', 'seem', 'is', 'sixty', 'even', 'put', 'thus', 'although', 'side', 'found', 'herein', 'take', 'else', 'toward', 'bottom', 'either', 'not', 'serious', 'much', 'ourselves', 'cry', 'none', 'always', 'does', 'because', 'job', 'didn', 'upon', 'becomes', 'third', 'sometime', 'ie', 'four', 'seemed', 'except', 'via', 'through', 'see', 'formerly', 'somewhere', 'into', 'fifty', 'whoever', 'her', 'eg', 'make', 'should', 'few', 'do', 'un', 'ten', 'can', 'what', 'somehow', 'nowhere', 'yourself', 'full', 'something', 'to', 'hers', 'least', 'anyhow', 'myself', 'therein', 'too', 'well', 'don', 'their', 'may', 'himself', 'next', 'km', 'hasnt', 'but', 'towards', 'whereby', 'herself', 'them', 'sincere', 'whenever', 'these', 'works', 'several', 'whereafter', 'never', 'very', 'therefore', 'once', 'are', 'still', 'since', 'someone', 'had', 'throughout', 'say', 'ove

## import and clean data

## This section can be skipped as the cleaned df is exported to a csv below

In [9]:
# Load the raw posts CSV (path is relative to the notebook's working directory),
# then show its (rows, columns) shape and the first five rows as a sanity check.
df = pd.read_csv('./data/the-antiwork-subreddit-dataset-posts.csv')
print(df.shape)
df.head()

(256279, 12)


Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,svw6x3,2y77d,antiwork,False,1645228719,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,I was hired at the **Neon Museum** as a tour g...,Neon Museum Las Vegas took away our tips,15
1,post,svw6jv,2y77d,antiwork,False,1645228687,https://old.reddit.com/r/antiwork/comments/svw...,i.redd.it,https://i.redd.it/vuoctaq0koi81.png,,Working,1887
2,post,svw5e8,2y77d,antiwork,False,1645228588,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,"So, I'm quite new to the jobs front then most ...",Kind of feel like screaming into the cyberspace,4
3,post,svw498,2y77d,antiwork,False,1645228495,https://old.reddit.com/r/antiwork/comments/svw...,i.redd.it,https://i.redd.it/1w1unxjfjoi81.png,,"Democracy is a lie, especially in the modern w...",14060
4,post,svw3qt,2y77d,antiwork,False,1645228450,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,My boss asked me today what I plan on doing wh...,Master's Degree - No Pay Raise but OT,63


In [10]:
df.nunique()

type                   1
id                256279
subreddit.id           1
subreddit.name         1
subreddit.nsfw         1
created_utc       253907
permalink         256279
domain              5126
url                93951
selftext           83301
title             238727
score               7860
dtype: int64

Each of the columns below has the same value in every row, so I'm going to drop them.

In [11]:
# Drop the four columns that hold a single value across every row.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second execution is a no-op instead of a KeyError.
df = df.drop(
    columns=['type', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw'],
    errors='ignore',
)

In [12]:
df.isna().mean()

id             0.000000
created_utc    0.000000
permalink      0.000000
domain         0.000000
url            0.602429
selftext       0.397571
title          0.000000
score          0.000000
dtype: float64

Nearly 40% of the selftext values are null, so I will likely combine the title with the selftext to not lose any data

In [13]:
df.dtypes

id             object
created_utc     int64
permalink      object
domain         object
url            object
selftext       object
title          object
score           int64
dtype: object

I need to get the UTC column into readable date time formatting

In [14]:
from datetime import datetime, timezone

# Convert Unix epoch seconds into a readable UTC date-time string
def time_utc(ts):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    Parameters
    ----------
    ts : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    str
        The UTC date and time, e.g. '2022-02-18 23:58:39'.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware call below produces the same formatted string.
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [15]:
# Replace the integer epoch seconds with formatted UTC strings.
# NOTE(review): this overwrites the column in place, so the cell is not re-runnable;
# a second execution would pass the already-formatted strings to time_utc.
df['created_utc']=df['created_utc'].map(time_utc)

In [16]:
df.head(1)

Unnamed: 0,id,created_utc,permalink,domain,url,selftext,title,score
0,svw6x3,2022-02-18 23:58:39,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,I was hired at the **Neon Museum** as a tour g...,Neon Museum Las Vegas took away our tips,15


In [17]:
df.dtypes

id             object
created_utc    object
permalink      object
domain         object
url            object
selftext       object
title          object
score           int64
dtype: object

The UTC timestamp is now readable, but it is still a string (object dtype) rather than a pandas datetime.

In [18]:
df['created_utc']=pd.to_datetime(df['created_utc'])

In [19]:
df.dtypes

id                     object
created_utc    datetime64[ns]
permalink              object
domain                 object
url                    object
selftext               object
title                  object
score                   int64
dtype: object

In [20]:
# Posts without body text have a null selftext; treat those as empty strings and
# append them to the title so every row has text to model.
df['title_w_text'] = df['title'].add(' ').add(df['selftext'].fillna(''))

In [21]:
df.columns

Index(['id', 'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score', 'title_w_text'],
      dtype='object')

## Saving this cleaned df for later efficiency

In [22]:

# Persist the cleaned frame so the cleaning cells can be skipped on later runs.
# NOTE(review): the index is written too (no index=False), so reading this file back
# with pd.read_csv will add an unnamed index column, and created_utc will come back
# as plain strings unless parse_dates is used — confirm this is intended.
df.to_csv('./data/cleaned_df_posts.csv')

# Preprocessing

In [23]:
# df = pd.read_csv('./data/cleaned_df_posts.csv')
# df.head()

In [24]:
df.dtypes

id                      object
created_utc     datetime64[ns]
permalink               object
domain                  object
url                     object
selftext                object
title                   object
score                    int64
title_w_text            object
dtype: object

### Tokenizing

In [None]:
# def sent_to_words(sentences):
#     for sent in sentences:
#         #print(sent)
#         sent = re.sub('\s+', ' ', sent)  # remove newline chars
#         sent = re.sub("\'", "", sent)  # remove single quotes
#         tokenizer = RegexpTokenizer('\w+') #tokenize at word boundaries
#         sent = tokenizer.tokenize(sent.lower())
#         sent = gensim.utils.simple_preprocess(str(sent), deacc=True)
#         yield(sent)

# data = df['title_w_text'].values.tolist()

# #this is showing a single list with every document in it

# data;

# data_words = list(sent_to_words(data))

# data_words[:5]


### Stem and Lem

In [31]:
stemmer = SnowballStemmer('english')

In [33]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then return its Snowball stem."""
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
def preprocess(text):
    """Turn one raw document into a list of stemmed, stop-word-filtered tokens.

    Steps, in order:
      1. collapse every whitespace run (including newlines) to a single space;
      2. strip single quotes so contractions become one token;
      3. lowercase and tokenize with gensim's simple_preprocess (deacc=True);
      4. drop tokens found in my_stop_words or shorter than 3 characters;
      5. lemmatize + stem each remaining token via lemmatize_stemming.

    Note that the stop-word check runs on the raw token, before stemming.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list of str
        The processed tokens, in document order.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("'", "")
    text = text.lower()
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text, deacc=True)
        if token not in my_stop_words and len(token) >= 3
    ]

Test case to ensure preprocessing worked properly. I added a custom stop word to this test case to ensure my custom stop words work properly

In [34]:
test = "www.Democracy ''is a lie, especially in the modern workplace work"
test

"www.Democracy ''is a lie, especially in the modern workplace work"

In [35]:
# print('original document: ')
# print()
# words = []
# for word in test.split(' '):
#     words.append(word)
# print(words)
# print()
# print('preprocessed document: ')
print(preprocess(test))

['democraci', 'lie', 'especi', 'modern', 'workplac']


#### It works! Applying the preprocessing function to the df

In [None]:
preprocessed = df['title_w_text'].map(preprocess)
preprocessed

In [None]:
df['preprocessed'] = preprocessed
df.head()

# Running LDA on the tokenized, stemmed, and lemmatized words

In [None]:
# Vocabulary: assigns an integer id to every distinct preprocessed token
id2word = corpora.Dictionary(preprocessed)

# Bag-of-words corpus: one doc2bow() result per preprocessed document
corpus = list(map(id2word.doc2bow, preprocessed))

# Fit a 5-topic LDA model across 3 worker processes, seeded with random_state=42
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    alpha='symmetric',
    passes=10,
    iterations=100,
    chunksize=256,
    workers=3,
    random_state=42,
    per_word_topics=True,
)

# Show the top weighted terms for each topic
pprint(lda_model.print_topics())

In [38]:
# def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row_list in enumerate(ldamodel[corpus]):
#         row = row_list[0] if ldamodel.per_word_topics else row_list            
#         #print(row)
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             #print('for 2')
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
#             else:
#                 break
#     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

#     # Add original text to the end of the output
#     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
#     return(sent_topics_df)

In [None]:
#df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus[:10], texts=preprocessed)

In [None]:
#df_topic_sents_keywords.head()

In [None]:
# batch_size = 100
# row_count = df.shape[0]
# for epoch in range(100):
#     for batch_idx in range(0, row, batch_size):

In [49]:
# lst2 = []
# for i in range(0,3000, 1000):
#     test=format_topics_sentences(ldamodel=lda_model, corpus=corpus[i:i+1000], texts=preprocessed)
#     lst2.append(test)
#     print(i)
# test_df = pd.concat(lst2)

In [52]:
# test_df['Topic_Keywords'][1000]

1000    NaN
1000    NaN
1000    NaN
Name: Topic_Keywords, dtype: object

In [None]:
# test_df[1000]

In [None]:
!pip install pyLDAvis

In [33]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim_models

In [34]:
# Enable inline rendering, build the visualisation data from the fitted LDA model,
# its bag-of-words corpus and the dictionary, then display it in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)

pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [179]:
df['year'] = df['created_utc'].dt.year

In [181]:
df['month'] = df['created_utc'].dt.month

In [183]:
df

Unnamed: 0,id,created_utc,permalink,domain,url,selftext,title,score,title_w_text,year,month
0,svw6x3,2022-02-18 23:58:39,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,I was hired at the **Neon Museum** as a tour g...,Neon Museum Las Vegas took away our tips,15,Neon Museum Las Vegas took away our tips I was...,2022,2
1,svw6jv,2022-02-18 23:58:07,https://old.reddit.com/r/antiwork/comments/svw...,i.redd.it,https://i.redd.it/vuoctaq0koi81.png,,Working,1887,Working,2022,2
2,svw5e8,2022-02-18 23:56:28,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,"So, I'm quite new to the jobs front then most ...",Kind of feel like screaming into the cyberspace,4,Kind of feel like screaming into the cyberspac...,2022,2
3,svw498,2022-02-18 23:54:55,https://old.reddit.com/r/antiwork/comments/svw...,i.redd.it,https://i.redd.it/1w1unxjfjoi81.png,,"Democracy is a lie, especially in the modern w...",14060,"Democracy is a lie, especially in the modern w...",2022,2
4,svw3qt,2022-02-18 23:54:10,https://old.reddit.com/r/antiwork/comments/svw...,self.antiwork,,My boss asked me today what I plan on doing wh...,Master's Degree - No Pay Raise but OT,63,Master's Degree - No Pay Raise but OT My boss ...,2022,2
...,...,...,...,...,...,...,...,...,...,...,...
256274,1kchry,2013-08-14 13:34:59,https://old.reddit.com/r/antiwork/comments/1kc...,theanarchistlibrary.org,http://theanarchistlibrary.org/library/Bob_Bla...,,The Abolition of Work (Bob Black),16,The Abolition of Work (Bob Black),2013,8
256275,1kche2,2013-08-14 13:28:34,https://old.reddit.com/r/antiwork/comments/1kc...,4hourworkday.org,http://www.4hourworkday.org/,,Global Campaign for the 4 Hour Work-day,21,Global Campaign for the 4 Hour Work-day,2013,8
256276,1kch2k,2013-08-14 13:22:45,https://old.reddit.com/r/antiwork/comments/1kc...,jacobinmag.com,http://jacobinmag.com/2012/04/the-politics-of-...,,The Politics of Getting a Life,8,The Politics of Getting a Life,2013,8
256277,1kcgwh,2013-08-14 13:20:06,https://old.reddit.com/r/antiwork/comments/1kc...,jacobinmag.com,http://jacobinmag.com/2012/05/new-works-and-an...,,New Works and Anti-Works,5,New Works and Anti-Works,2013,8
