In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re
from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Read the posts csv file

In [2]:
subreddit = "Music"

In [3]:
df_posts = pd.read_csv("../data/raw/" + subreddit + "_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0
1,s8m1uh,The Avalanches - Since I Left You [Plunderphon...,528,0.94,Music,https://youtu.be/wpqm-05R2Jk,65,,1642694000.0
2,s8dqe7,99 Luftballons or 99 Red Balloons?,1258,0.91,Music,https://www.reddit.com/r/Music/comments/s8dqe7...,649,I don’t speak German at all and still sing alo...,1642665000.0
3,s8jx84,Agents of Fortune by Blue Oyster Cult is quiet...,165,0.87,Music,https://www.reddit.com/r/Music/comments/s8jx84...,39,Blue Oyster Cult's fourth studio album is an a...,1642688000.0
4,s86gam,Bad Company's 1974 self titled debut album is ...,1505,0.92,Music,https://www.reddit.com/r/Music/comments/s86gam...,216,I'd like to point fingers and say that people ...,1642642000.0


# Preprocess

In [4]:
PreProcess.preprocess(df_posts, 'title')
PreProcess.preprocess(df_posts, 'body')
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s..."
1,s8m1uh,The Avalanches - Since I Left You [Plunderphon...,528,0.94,Music,https://youtu.be/wpqm-05R2Jk,65,,1642694000.0,"[The, Avalanches, -, Since, I, Left, You, [, P...","[avalanches, since, left, plunderphonics]","[avalanch, sinc, left, plunderphon]",[],[],[]
2,s8dqe7,99 Luftballons or 99 Red Balloons?,1258,0.91,Music,https://www.reddit.com/r/Music/comments/s8dqe7...,649,I don’t speak German at all and still sing alo...,1642665000.0,"[99, Luftballons, or, 99, Red, Balloons, ?]","[99, luftballons, 99, red, balloons]","[99, luftballon, 99, red, balloon]","[I, don, ’, t, speak, German, at, all, and, st...","[speak, german, still, sing, along, 99, luftba...","[speak, german, still, sing, along, 99, luftba..."
3,s8jx84,Agents of Fortune by Blue Oyster Cult is quiet...,165,0.87,Music,https://www.reddit.com/r/Music/comments/s8jx84...,39,Blue Oyster Cult's fourth studio album is an a...,1642688000.0,"[Agents, of, Fortune, by, Blue, Oyster, Cult, ...","[agents, fortune, blue, oyster, cult, quietly,...","[agent, fortun, blue, oyster, cult, quietli, o...","[Blue, Oyster, Cult, 's, fourth, studio, album...","[blue, oyster, cult, fourth, studio, album, ad...","[blue, oyster, cult, fourth, studio, album, ad..."
4,s86gam,Bad Company's 1974 self titled debut album is ...,1505,0.92,Music,https://www.reddit.com/r/Music/comments/s86gam...,216,I'd like to point fingers and say that people ...,1642642000.0,"[Bad, Company, 's, 1974, self, titled, debut, ...","[bad, company, 1974, self, titled, debut, albu...","[bad, compani, 1974, self, titl, debut, album,...","[I, 'd, like, to, point, fingers, and, say, th...","[like, point, fingers, say, people, nt, talk, ...","[like, point, finger, say, peopl, nt, talk, ba..."


# Topic Modeling (LDA)

LDA Config Settings

In [5]:
# Number of topics to generate
num_topics = 5
# Number of passes the LDA should run
num_passes = 100

# Number of words to print out
num_words = 5

LDA on titles of all posts

In [6]:
posts = list(df_posts['title_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [7]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.034*"rock" + 0.029*"song" + 0.012*"help" + 0.010*"2022" + 0.007*"young"'),
 (1,
  '0.019*"song" + 0.011*"music" + 0.010*"pop" + 0.007*"punk" + 0.006*"genr"'),
 (2,
  '0.052*"music" + 0.016*"new" + 0.013*"pop" + 0.008*"album" + 0.007*"playlist"'),
 (3,
  '0.019*"rap" + 0.018*"beat" + 0.016*"music" + 0.012*"album" + 0.012*"instrument"'),
 (4,
  '0.009*"music" + 0.009*"song" + 0.008*"time" + 0.006*"sound" + 0.006*"hop"')]

LDA on body of all posts

In [8]:
posts = list(df_posts['body_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [9]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.016*"music" + 0.011*"know" + 0.010*"good" + 0.010*"one" + 0.010*"band"'),
 (1,
  '0.053*"http" + 0.015*"song" + 0.011*"music" + 0.011*"wwwyoutubecomwatch" + 0.008*"love"'),
 (2,
  '0.028*"http" + 0.015*"like" + 0.012*"music" + 0.008*"would" + 0.008*"youtub"'),
 (3,
  '0.033*"song" + 0.014*"album" + 0.013*"hello" + 0.011*"music" + 0.009*"like"'),
 (4,
  '0.023*"music" + 0.019*"hi" + 0.014*"like" + 0.010*"song" + 0.009*"know"')]

In [10]:
"""
Visualize the LDA topic modeling
"""

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(3, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()

'\nVisualize the LDA topic modeling\n'

# Read and preprocess the comments CSV

In [11]:
df_comments = pd.read_csv("../data/raw/" + subreddit + "_comments.csv", lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,rhuzow,Lovely question . Natural woman by carol king ...
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...
2,rhuzow,Why did you pick to be a Sausage on the Masked...
3,rhuzow,"Hey Joss, have you ever considered going back ..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...


In [12]:
PreProcess.preprocess(df_comments, 'comment')
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem
0,rhuzow,Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]"
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]"
2,rhuzow,Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]"
3,rhuzow,"Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu..."


# Merge the df comments and posts together

In [13]:
df_merge = df_posts.merge(df_comments, how='left', left_on='post_id', right_on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,comment,comment_word_token,comment_filtered,comment_stem
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]"
1,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]"
2,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]"
3,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...","Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik..."
4,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...",'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu..."


In [14]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))
print("There are {} posts with no content after filtering".format(len(df_merge[df_merge['comment_stem'].str.len() == 0])))

There are 243 posts with no comments
There are 42 posts with no content after filtering


# Generate LDA models for each post using comments

In [15]:
topics = []

# Drop the posts that do not have any comments
df_merge_dropped = df_merge[df_merge['comment_stem'].str.len() > 0]

# for post_id in ['n2n0ax', 'qb4bof']:
for post_id in df_merge_dropped['post_id'].unique():
    df_temp = df_merge_dropped[df_merge_dropped['post_id'] == post_id]
    posts = list(df_temp['comment_stem'])
    dictionary = corpora.Dictionary(posts)
    corpus = [dictionary.doc2bow(article) for article in 
              posts] # All except the last one
    
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id':post_id, 'topics':topic_results})
topics

[{'post_id': 'rhuzow',
  'topics': [(0,
    '0.055*"like" + 0.025*"would" + 0.024*"love" + 0.018*"music" + 0.016*"artist"'),
   (1,
    '0.024*"http" + 0.017*"tudor" + 0.013*"music" + 0.011*"joss" + 0.011*"alway"'),
   (2,
    '0.064*"joss" + 0.028*"hey" + 0.025*"question" + 0.019*"stone" + 0.013*"hope"'),
   (3,
    '0.047*"hi" + 0.036*"http" + 0.036*"joss" + 0.030*"song" + 0.025*"music"'),
   (4,
    '0.049*"http" + 0.017*"song" + 0.015*"joss" + 0.015*"like" + 0.014*"play"')]},
 {'post_id': 's8m1uh',
  'topics': [(0,
    '0.048*"album" + 0.048*"vibe" + 0.048*"still" + 0.048*"slap" + 0.048*"great"'),
   (1,
    '0.034*"plunderphon" + 0.034*"use" + 0.034*"alway" + 0.034*"bring" + 0.034*"genuin"'),
   (2,
    '0.093*"song" + 0.071*"video" + 0.049*"love" + 0.049*"realli" + 0.049*"better"'),
   (3,
    '0.066*"nice" + 0.036*"avalanch" + 0.036*"alway" + 0.036*"see" + 0.036*"respect"'),
   (4,
    '0.100*"video" + 0.052*"one" + 0.052*"favorit" + 0.028*"high" + 0.028*"back"')]},
 {'post_id':

# Generate a single LDA model for all comments

In [16]:
posts = list(df_merge_dropped['comment_stem'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [17]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.057*"album" + 0.053*"song" + 0.051*"market" + 0.038*"think" + 0.031*"listen"'),
 (1,
  '0.040*"nt" + 0.037*"great" + 0.032*"http" + 0.027*"start" + 0.026*"tool"'),
 (2,
  '0.107*"album" + 0.051*"listen" + 0.036*"love" + 0.029*"realli" + 0.029*"one"'),
 (3,
  '0.040*"song" + 0.033*"say" + 0.031*"come" + 0.027*"gener" + 0.026*"pay"'),
 (4,
  '0.025*"like" + 0.021*"artist" + 0.021*"music" + 0.019*"wait" + 0.017*"spotifi"')]

# Redo the above with Lemm instead of Stem

In [18]:
PreProcess.preprocess(df_posts, 'title', lemm=True)
PreProcess.preprocess(df_posts, 'body', lemm=True)
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,title_filtered,title_stem,body_word_token,body_filtered,body_stem,title_tag,title_lemm,body_tag,body_lemm
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]","[hi, joss, stone]","[hi, joss, stone]","[I, had, so, much, fun, writing, and, recordin...","[much, fun, writing, recording, new, album, da...","[much, fun, write, record, new, album, dave, s...","[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s..."
1,s8m1uh,The Avalanches - Since I Left You [Plunderphon...,528,0.94,Music,https://youtu.be/wpqm-05R2Jk,65,,1642694000.0,"[The, Avalanches, -, Since, I, Left, You, [, P...","[avalanches, since, left, plunderphonics]","[avalanch, sinc, left, plunderphon]",[],[],[],"[[(avalanches, NNS)], [(since, IN)], [(left, N...","[avalanche, since, left, plunderphonics]",[],[]
2,s8dqe7,99 Luftballons or 99 Red Balloons?,1258,0.91,Music,https://www.reddit.com/r/Music/comments/s8dqe7...,649,I don’t speak German at all and still sing alo...,1642665000.0,"[99, Luftballons, or, 99, Red, Balloons, ?]","[99, luftballons, 99, red, balloons]","[99, luftballon, 99, red, balloon]","[I, don, ’, t, speak, German, at, all, and, st...","[speak, german, still, sing, along, 99, luftba...","[speak, german, still, sing, along, 99, luftba...","[[(99, CD)], [(luftballons, NNS)], [(99, CD)],...","[99, luftballons, 99, red, balloon]","[[(speak, NN)], [(german, JJ)], [(still, RB)],...","[speak, german, still, sing, along, 99, luftba..."
3,s8jx84,Agents of Fortune by Blue Oyster Cult is quiet...,165,0.87,Music,https://www.reddit.com/r/Music/comments/s8jx84...,39,Blue Oyster Cult's fourth studio album is an a...,1642688000.0,"[Agents, of, Fortune, by, Blue, Oyster, Cult, ...","[agents, fortune, blue, oyster, cult, quietly,...","[agent, fortun, blue, oyster, cult, quietli, o...","[Blue, Oyster, Cult, 's, fourth, studio, album...","[blue, oyster, cult, fourth, studio, album, ad...","[blue, oyster, cult, fourth, studio, album, ad...","[[(agents, NNS)], [(fortune, NN)], [(blue, NN)...","[agent, fortune, blue, oyster, cult, quietly, ...","[[(blue, NN)], [(oyster, NN)], [(cult, NN)], [...","[blue, oyster, cult, fourth, studio, album, ad..."
4,s86gam,Bad Company's 1974 self titled debut album is ...,1505,0.92,Music,https://www.reddit.com/r/Music/comments/s86gam...,216,I'd like to point fingers and say that people ...,1642642000.0,"[Bad, Company, 's, 1974, self, titled, debut, ...","[bad, company, 1974, self, titled, debut, albu...","[bad, compani, 1974, self, titl, debut, album,...","[I, 'd, like, to, point, fingers, and, say, th...","[like, point, fingers, say, people, nt, talk, ...","[like, point, finger, say, peopl, nt, talk, ba...","[[(bad, JJ)], [(company, NN)], [(1974, CD)], [...","[bad, company, 1974, self, title, debut, album...","[[(like, IN)], [(point, NN)], [(fingers, NNS)]...","[like, point, finger, say, people, nt, talk, b..."


##### LDA on title

In [19]:
posts = list(df_posts['title_lemm'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [20]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.026*"music" + 0.017*"indie" + 0.017*"rock" + 0.014*"pop" + 0.009*"song"'),
 (1,
  '0.026*"song" + 0.021*"new" + 0.016*"help" + 0.013*"music" + 0.011*"2022"'),
 (2,
  '0.029*"song" + 0.018*"beat" + 0.011*"2022" + 0.008*"rap" + 0.008*"blue"'),
 (3,
  '0.029*"music" + 0.013*"rock" + 0.008*"genre" + 0.007*"rap" + 0.007*"jazz"'),
 (4,
  '0.023*"rock" + 0.012*"music" + 0.012*"album" + 0.010*"pop" + 0.009*"alternative"')]

##### LDA on body

In [21]:
posts = list(df_posts['body_lemm'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [22]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.030*"song" + 0.013*"music" + 0.013*"http" + 0.011*"like" + 0.009*"story"'),
 (1,
  '0.019*"know" + 0.017*"song" + 0.016*"hi" + 0.013*"music" + 0.010*"listen"'),
 (2,
  '0.019*"music" + 0.015*"album" + 0.011*"rock" + 0.009*"hello" + 0.008*"time"'),
 (3,
  '0.040*"http" + 0.013*"song" + 0.013*"wwwyoutubecomwatch" + 0.008*"play" + 0.008*"wwwyoutubecomplaylist"'),
 (4,
  '0.021*"music" + 0.020*"http" + 0.017*"song" + 0.016*"like" + 0.012*"listen"')]

##### LDA on comments

In [23]:
df_comments = pd.read_csv("../data/raw/" + subreddit + "_comments.csv", lineterminator='\n')
df_comments.head()

Unnamed: 0,post_id,comment
0,rhuzow,Lovely question . Natural woman by carol king ...
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...
2,rhuzow,Why did you pick to be a Sausage on the Masked...
3,rhuzow,"Hey Joss, have you ever considered going back ..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...


In [24]:
PreProcess.preprocess(df_comments, 'comment', lemm=True)
df_comments.head()

Unnamed: 0,post_id,comment,comment_word_token,comment_filtered,comment_stem,comment_tag,comment_lemm
0,rhuzow,Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]","[[(lovely, RB)], [(question, NN)]]","[lovely, question]"
1,rhuzow,Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]","[[(hi, NN)], [(joss, NN)]]","[hi, joss]"
2,rhuzow,Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]","[[(pick, NN)], [(sausage, NN)], [(masked, NNS)...","[pick, sausage, masked, singer]"
3,rhuzow,"Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik...","[[(hey, NN)], [(joss, NN)], [(ever, RB)], [(co...","[hey, joss, ever, consider, go, back, album, l..."
4,rhuzow,'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu...","[[(fell, VBD)], [(love, NN)], [(boy, NN)], [(o...","[fell, love, boy, one, long, stand, turnitupan..."


In [25]:
df_merge = df_posts.merge(df_comments, how='left', left_on='post_id', right_on='post_id')
df_merge.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created,title_word_token,...,title_tag,title_lemm,body_tag,body_lemm,comment,comment_word_token,comment_filtered,comment_stem,comment_tag,comment_lemm
0,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]",...,"[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s...",Lovely question . Natural woman by carol king ...,"[Lovely, question, .]","[lovely, question]","[love, question]","[[(lovely, RB)], [(question, NN)]]","[lovely, question]"
1,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]",...,"[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s...",Hi Joss. Paul McCartney said he wished he'd wr...,"[Hi, Joss, .]","[hi, joss]","[hi, joss]","[[(hi, NN)], [(joss, NN)]]","[hi, joss]"
2,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]",...,"[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s...",Why did you pick to be a Sausage on the Masked...,"[Why, did, you, pick, to, be, a, Sausage, on, ...","[pick, sausage, masked, singer]","[pick, sausag, mask, singer]","[[(pick, NN)], [(sausage, NN)], [(masked, NNS)...","[pick, sausage, masked, singer]"
3,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]",...,"[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s...","Hey Joss, have you ever considered going back ...","[Hey, Joss, ,, have, you, ever, considered, go...","[hey, joss, ever, considered, going, back, alb...","[hey, joss, ever, consid, go, back, album, lik...","[[(hey, NN)], [(joss, NN)], [(ever, RB)], [(co...","[hey, joss, ever, consider, go, back, album, l..."
4,rhuzow,"Hi, this is Joss Stone. I'm excited to answer ...",1404,0.91,Music,https://www.reddit.com/r/Music/comments/rhuzow...,455,I had so much fun writing and recording my new...,1639674000.0,"[Hi, ,, this, is, Joss, Stone, .]",...,"[[(hi, NN)], [(joss, NN)], [(stone, NN)]]","[hi, joss, stone]","[[(much, JJ)], [(fun, NN)], [(writing, VBG)], ...","[much, fun, write, record, new, album, dave, s...",'Fell in Love with a Boy' is one of my longest...,"['Fell, in, Love, with, a, Boy, ', is, one, of...","[fell, love, boy, one, longest, standing, turn...","[fell, love, boy, one, longest, stand, turnitu...","[[(fell, VBD)], [(love, NN)], [(boy, NN)], [(o...","[fell, love, boy, one, long, stand, turnitupan..."


In [26]:
# Check the number of posts with no comments
print("There are {} posts with no comments".format(df_merge.comment.isna().sum()))
print("There are {} posts with no content after filtering".format(len(df_merge[df_merge['comment_stem'].str.len() == 0])))

There are 243 posts with no comments
There are 42 posts with no content after filtering


##### LDA for comments in each post

In [27]:
topics = []

# Drop the posts that do not have any comments
df_merge_dropped = df_merge[df_merge['comment_lemm'].str.len() > 0]

for post_id in df_merge_dropped['post_id'].unique():
    df_temp = df_merge_dropped[df_merge_dropped['post_id'] == post_id]
    posts = list(df_temp['comment_lemm'])
    dictionary = corpora.Dictionary(posts)
    corpus = [dictionary.doc2bow(article) for article in 
              posts] # All except the last one
    
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)
    topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
    topics.append({'post_id':post_id, 'topics':topic_results})
topics

[{'post_id': 'rhuzow',
  'topics': [(0,
    '0.057*"joss" + 0.046*"hi" + 0.028*"http" + 0.023*"stone" + 0.018*"like"'),
   (1,
    '0.058*"joss" + 0.035*"hey" + 0.020*"question" + 0.015*"hi" + 0.015*"like"'),
   (2,
    '0.037*"love" + 0.028*"http" + 0.021*"music" + 0.011*"album" + 0.011*"get"'),
   (3,
    '0.029*"like" + 0.014*"artist" + 0.012*"work" + 0.012*"play" + 0.009*"would"'),
   (4,
    '0.055*"http" + 0.047*"song" + 0.024*"music" + 0.020*"new" + 0.017*"like"')]},
 {'post_id': 's8m1uh',
  'topics': [(0,
    '0.062*"video" + 0.043*"love" + 0.043*"one" + 0.043*"really" + 0.043*"favorite"'),
   (1,
    '0.072*"album" + 0.044*"video" + 0.039*"wish" + 0.039*"newer" + 0.039*"frontier"'),
   (2,
    '0.134*"song" + 0.097*"video" + 0.038*"incredible" + 0.038*"music" + 0.038*"well"'),
   (3,
    '0.035*"get" + 0.035*"avalanche" + 0.035*"high" + 0.035*"school" + 0.035*"brings"'),
   (4,
    '0.064*"use" + 0.064*"plunderphonics" + 0.035*"always" + 0.035*"genre" + 0.035*"genuine"')]},
 {

##### LDA for all comments

In [28]:
posts = list(df_merge_dropped['comment_lemm'])
dictionary = corpora.Dictionary(posts)
corpus = [dictionary.doc2bow(article) for article in 
          posts] # All except the last one

In [29]:
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, 
                                            id2word = dictionary, 
                                            passes=num_passes)

topic_results = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_results

[(0,
  '0.028*"nt" + 0.026*"great" + 0.018*"start" + 0.017*"tool" + 0.017*"bunch"'),
 (1,
  '0.036*"album" + 0.034*"http" + 0.031*"wait" + 0.031*"mean" + 0.028*"playlist"'),
 (2,
  '0.049*"listen" + 0.047*"album" + 0.042*"market" + 0.034*"love" + 0.031*"think"'),
 (3,
  '0.086*"album" + 0.048*"spotify" + 0.038*"people" + 0.032*"really" + 0.029*"artist"'),
 (4,
  '0.036*"say" + 0.034*"come" + 0.033*"song" + 0.027*"pay" + 0.026*"money"')]