In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read the posts CSV file

In [2]:
# Name of the subreddit to analyse; it is spliced into the input CSV file
# names ("<subreddit>_posts.csv", "<subreddit>_comments.csv").
subreddit = "Music"

In [3]:
# Load the raw posts export for the selected subreddit and preview it.
posts_csv_path = f"../data/raw/{subreddit}_posts.csv"
df_posts = pd.read_csv(posts_csv_path)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0
1,s8m1uh,The Avalanches - Since I Left You [Plunderphon...,528,0.94,Music,https://youtu.be/wpqm-05R2Jk,65,,1642694000.0
2,s8dqe7,99 Luftballons or 99 Red Balloons?,1258,0.91,Music,https://www.reddit.com/r/Music/comments/s8dqe7...,649,I don’t speak German at all and still sing alo...,1642665000.0
3,s8jx84,Agents of Fortune by Blue Oyster Cult is quiet...,165,0.87,Music,https://www.reddit.com/r/Music/comments/s8jx84...,39,Blue Oyster Cult's fourth studio album is an a...,1642688000.0
4,s86gam,Bad Company's 1974 self titled debut album is ...,1505,0.92,Music,https://www.reddit.com/r/Music/comments/s86gam...,216,I'd like to point fingers and say that people ...,1642642000.0


# Preprocess

In [4]:
# Run the project's preprocessing over both text columns of the posts frame.
# Judging by the preview below, each call adds `<column>_word_token`,
# `<column>_filtered` and `<column>_stem` columns to `df_posts` in place.
for text_column in ('title', 'body'):
    PreProcess.preprocess(df_posts, text_column)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s..."
1,s8m1uh,The Avalanches - Since I Left You [Plunderphon...,528,0.94,Music,https://youtu.be/wpqm-05R2Jk,65,,1642694000.0,"[The, Avalanches, -, Since, I, Left, You, [, P...","[avalanches, since, left, plunderphonics]","[avalanch, sinc, left, plunderphon]",[],[],[]
2,s8dqe7,99 Luftballons or 99 Red Balloons?,1258,0.91,Music,https://www.reddit.com/r/Music/comments/s8dqe7...,649,I don’t speak German at all and still sing alo...,1642665000.0,"[99, Luftballons, or, 99, Red, Balloons, ?]","[99, luftballons, 99, red, balloons]","[99, luftballon, 99, red, balloon]","[I, don, ’, t, speak, German, at, all, and, st...","[speak, german, still, sing, along, 99, luftba...","[speak, german, still, sing, along, 99, luftba..."
3,s8jx84,Agents of Fortune by Blue Oyster Cult is quiet...,165,0.87,Music,https://www.reddit.com/r/Music/comments/s8jx84...,39,Blue Oyster Cult's fourth studio album is an a...,1642688000.0,"[Agents, of, Fortune, by, Blue, Oyster, Cult, ...","[agents, fortune, blue, oyster, cult, quietly,...","[agent, fortun, blue, oyster, cult, quietli, o...","[Blue, Oyster, Cult, 's, fourth, studio, album...","[blue, oyster, cult, fourth, studio, album, ad...","[blue, oyster, cult, fourth, studio, album, ad..."
4,s86gam,Bad Company's 1974 self titled debut album is ...,1505,0.92,Music,https://www.reddit.com/r/Music/comments/s86gam...,216,I'd like to point fingers and say that people ...,1642642000.0,"[Bad, Company, 's, 1974, self, titled, debut, ...","[bad, company, 1974, self, titled, debut, albu...","[bad, compani, 1974, self, titl, debut, album,...","[I, 'd, like, to, point, fingers, and, say, th...","[like, point, fingers, say, people, nt, talk, ...","[like, point, finger, say, peopl, nt, talk, ba..."


# Topic Modeling (LDA)

LDA Config Settings

In [5]:
# Number of topics each LDA model extracts (passed as `num_topics`)
num_topics = 5
# Number of training passes over the corpus (passed as `passes`)
num_passes = 100

# Number of top words reported per topic by `print_topics`
num_words = 5

LDA on titles of all posts

In [6]:
# Prepare the gensim inputs from the stemmed post titles: a dictionary built
# over all titles, then one bag-of-words vector per title (every row of
# `df_posts` is included; nothing is held out).
posts = df_posts['title_stem'].tolist()
dictionary = corpora.Dictionary(posts)
corpus = list(map(dictionary.doc2bow, posts))

In [7]:
# Fit LDA on the title corpus. LDA training is randomized, so the seed is
# pinned: without it the reported topics change on every Restart & Run All.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# (topic_id, "weight*word + ...") pairs limited to the top `num_words` words.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.042*"song" + 0.015*"rock" + 0.014*"music" + 0.007*"hiphop" + 0.007*"need"'),
 (1,
  '0.039*"music" + 0.023*"song" + 0.021*"new" + 0.013*"help" + 0.012*"album"'),
 (2,
  '0.020*"beat" + 0.019*"2022" + 0.016*"music" + 0.016*"instrument" + 0.014*"rap"'),
 (3,
  '0.029*"rock" + 0.013*"music" + 0.008*"indi" + 0.006*"genr" + 0.006*"would"'),
 (4,
  '0.016*"pop" + 0.015*"rock" + 0.013*"band" + 0.012*"name" + 0.008*"2022"')]

LDA on body of all posts

In [8]:
# Prepare the gensim inputs from the stemmed post bodies: a dictionary built
# over all bodies, then one bag-of-words vector per body (every row of
# `df_posts` is included; nothing is held out).
posts = df_posts['body_stem'].tolist()
dictionary = corpora.Dictionary(posts)
corpus = list(map(dictionary.doc2bow, posts))

In [9]:
# Fit LDA on the body corpus. LDA training is randomized, so the seed is
# pinned: without it the reported topics change on every Restart & Run All.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# (topic_id, "weight*word + ...") pairs limited to the top `num_words` words.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.032*"song" + 0.015*"hi" + 0.012*"music" + 0.011*"like" + 0.010*"know"'),
 (1,
  '0.021*"http" + 0.018*"music" + 0.015*"song" + 0.015*"like" + 0.013*"listen"'),
 (2,
  '0.016*"music" + 0.014*"like" + 0.014*"http" + 0.008*"artist" + 0.008*"current"'),
 (3,
  '0.012*"music" + 0.011*"look" + 0.011*"album" + 0.011*"releas" + 0.009*"x200b"'),
 (4,
  '0.041*"http" + 0.015*"music" + 0.011*"listen" + 0.011*"song" + 0.010*"one"')]

In [10]:
"""
Visualize the LDA topic modeling
"""

# The word-cloud plotting code below is disabled (fully commented out), so
# this cell currently only evaluates the string literal above.
# NOTE(review): if this is re-enabled, `plt.subplots(3, 2, ...)` yields six
# axes while the config cell sets num_topics = 5, so `topics[i]` looks like it
# would raise IndexError on the sixth axis -- confirm before uncommenting.

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

'\nVisualize the LDA topic modeling\n'

# Read and preprocess the comments CSV

In [11]:
# Load the raw comments export for the selected subreddit. Only '\n' is
# treated as a row terminator, as in the original read.
comments_csv_path = f"../data/raw/{subreddit}_comments.csv"
df_comments = pd.read_csv(comments_csv_path, lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,rhuzow,Lovely question . Natural woman by carol king ...
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...
2,rhuzow,Why did you pick to be a Sausage on the Masked...
3,rhuzow,"Hey Joss, have you ever considered going back ..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...


In [12]:
# Apply the project's preprocessing to the comment text. Per the preview
# below it adds `comment_word_token`, `comment_filtered` and `comment_stem`
# columns to `df_comments` (the call's return value is not used).
PreProcess.preprocess(df_comments, 'comment')
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,rhuzow,Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]"
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]"
2,rhuzow,Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]"
3,rhuzow,"Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu..."


# Merge the comments and posts DataFrames

In [13]:
# Left-join the comments onto the posts by `post_id`: one output row per
# (post, comment) pair, and posts without comments are kept with NaN in the
# comment columns.
df_merge = pd.merge(df_posts, df_comments, on='post_id', how='left')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,comment,comment_word_token,comment_filtered,comment_stem
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]"
1,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]"
2,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]"
3,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...","Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik..."
4,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu..."


In [14]:
# A post without comments survives the left merge as a single row whose
# comment columns are NaN, so counting NaN comments counts those posts
# (assuming `post_id` is unique in df_posts -- TODO confirm).
print(f"There are {df_merge['comment'].isna().sum()} posts with no comments")

# Every other row is one comment. An empty `comment_stem` list means the
# comment lost all of its tokens during preprocessing, so this figure counts
# comments rather than posts (the original message said "posts").
empty_after_filtering = df_merge['comment_stem'].str.len() == 0
print(f"There are {empty_after_filtering.sum()} comments with no content after filtering")

There are 243 posts with no comments
There are 42 posts with no content after filtering


# Generate an LDA model for each post from its comments

In [15]:
topics = []

# Keep only rows whose comment still has at least one stemmed token. Rows that
# come from posts without comments hold NaN in `comment_stem`; `.str.len()` is
# NaN there and `NaN > 0` is False, so those rows are excluded as well.
df_merge_dropped = df_merge[df_merge['comment_stem'].str.len() > 0]

# Fit one LDA model per post on that post's comments. `groupby` visits each
# post once (sort=False keeps first-appearance order, like `.unique()` did)
# instead of re-scanning the whole frame with a boolean mask for every post id.
for post_id, df_post_comments in df_merge_dropped.groupby('post_id', sort=False):
    posts = list(df_post_comments['comment_stem'])
    dictionary = corpora.Dictionary(posts)
    corpus = [dictionary.doc2bow(comment_tokens) for comment_tokens in posts]

    # LDA training is randomized; pin the seed so reruns give the same topics.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=num_passes,
        random_state=42,
    )
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id': post_id, 'topics': topic_results})

# Preview the first few posts only; `topics` holds one entry per post and
# printing all of them floods the notebook output.
topics[:3]

[{'post_id': 'rhuzow',
  'topics': [(0,
    '0.104*"joss" + 0.052*"hi" + 0.031*"hey" + 0.027*"http" + 0.024*"like"'),
   (1,
    '0.023*"http" + 0.012*"like" + 0.012*"music" + 0.009*"get" + 0.009*"back"'),
   (2,
    '0.037*"http" + 0.029*"song" + 0.027*"love" + 0.022*"question" + 0.021*"new"'),
   (3,
    '0.023*"like" + 0.017*"artist" + 0.013*"feel" + 0.013*"best" + 0.011*"hi"'),
   (4,
    '0.052*"music" + 0.030*"http" + 0.018*"like" + 0.015*"new" + 0.013*"song"')]},
 {'post_id': 's8m1uh',
  'topics': [(0,
    '0.047*"alway" + 0.047*"realli" + 0.047*"get" + 0.025*"nice" + 0.025*"use"'),
   (1,
    '0.118*"video" + 0.044*"album" + 0.044*"better" + 0.044*"wish" + 0.044*"newer"'),
   (2,
    '0.068*"song" + 0.047*"music" + 0.047*"love" + 0.047*"incred" + 0.047*"avalanch"'),
   (3,
    '0.078*"one" + 0.053*"favorit" + 0.053*"video" + 0.029*"line" + 0.029*"open"'),
   (4,
    '0.066*"glad" + 0.066*"psychiatrist" + 0.066*"frontier" + 0.011*"wonder" + 0.011*"nice"')]},
 {'post_id': 's8dqe7

# Generate a single LDA model for all comments

In [16]:
# Prepare the gensim inputs from every non-empty stemmed comment: a dictionary
# built over all of them, then one bag-of-words vector per comment (every row
# of `df_merge_dropped` is included; nothing is held out).
posts = df_merge_dropped['comment_stem'].tolist()
dictionary = corpora.Dictionary(posts)
corpus = list(map(dictionary.doc2bow, posts))

In [17]:
# Fit a single LDA model on the corpus of all comments. LDA training is
# randomized, so the seed is pinned: without it the reported topics change on
# every Restart & Run All.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=num_passes,
    random_state=42,
)

# (topic_id, "weight*word + ...") pairs limited to the top `num_words` words.
topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.035*"say" + 0.033*"artist" + 0.033*"come" + 0.031*"song" + 0.031*"wait"'),
 (1,
  '0.050*"market" + 0.032*"album" + 0.027*"whole" + 0.027*"singl" + 0.026*"rather"'),
 (2,
  '0.040*"nt" + 0.038*"great" + 0.035*"album" + 0.028*"mean" + 0.028*"start"'),
 (3,
  '0.095*"album" + 0.074*"listen" + 0.035*"love" + 0.034*"peopl" + 0.029*"one"'),
 (4,
  '0.041*"song" + 0.024*"think" + 0.021*"http" + 0.019*"hear" + 0.018*"without"')]