# DMP - WallStreet Bets

In [None]:
import pandas as pd

In [8]:
## Read data
import pandas as pd
import re
from collections import Counter
import nltk
from nltk.tokenize import  word_tokenize
from pandarallel import pandarallel
import pickle

import os
import numpy as np


### Full data

In [10]:
%%time
post_data_full = pd.read_csv('wallstreetbets_submissions_zstextract.csv')
post_data_full.shape

CPU times: user 13.7 s, sys: 2.43 s, total: 16.1 s
Wall time: 16.1 s


(2218243, 14)

In [82]:
post_data_full.head(2)

Unnamed: 0.1,Unnamed: 0,type,id,title,body,author,name,created_utc,url,score,num_comments,upvotes,downvotes,permalink
0,0,Post,s4jw1,Earnings season is here. Place your bets.,"I know that /r/investing is a great place for congregating with fellow market gamblers - but it's not *exactly* the right place to be. So rather than flooding it with gambling posts and epic win/loss screen shots I figured I'd make another sub for this specifically for speculating and such. We'll see if it works. Subscribe if you're interested.\n\n\nAnyways, upcomming earnings (feel free to add):\n\n* AAPL - 4/24\n\n* AMZN - 4/25 (unconfirmed)\n\n* GOOG - 4/12 (TOMORROW! - after market cl...",[deleted],t3_s4jw1,1334162440,http://www.reddit.com/r/wallstreetbets/comments/s4jw1/earnings_season_is_here_place_your_bets/,13,22,16,3,/r/wallstreetbets/comments/s4jw1/earnings_season_is_here_place_your_bets/
1,1,Post,s6r57,"GOOG - beat estimates, price barely rises.",,[deleted],t3_s6r57,1334263051,http://www.bloomberg.com/news/2012-04-12/google-s-profit-tops-estimates-as-new-markets-help-fuel-growth.html,2,0,5,3,/r/wallstreetbets/comments/s6r57/goog_beat_estimates_price_barely_rises/


In [83]:
post_data_full['type'].value_counts()

Post    2218243
Name: type, dtype: int64

In [84]:
post_data_full.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2218243 entries, 0 to 2218242
Data columns (total 14 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   Unnamed: 0    2218243 non-null  int64 
 1   type          2218243 non-null  object
 2   id            2218243 non-null  object
 3   title         2218239 non-null  object
 4   body          1556533 non-null  object
 5   author        2218243 non-null  object
 6   name          1905290 non-null  object
 7   created_utc   2218243 non-null  int64 
 8   url           2185509 non-null  object
 9   score         2218243 non-null  int64 
 10  num_comments  2218243 non-null  int64 
 11  upvotes       2218243 non-null  int64 
 12  downvotes     2218243 non-null  int64 
 13  permalink     2218243 non-null  object
dtypes: int64(6), object(8)
memory usage: 236.9+ MB


In [11]:
import datetime

post_data_full['timestamp'] = post_data_full['created_utc'].apply(datetime.datetime.fromtimestamp)
post_data_full['timestamp'].min()

Timestamp('2012-04-11 16:40:40')

## Profiling

In [4]:
def clean_text(text):
    # Remove all text between square brackets
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove all text between parentheses
    text = re.sub(r'\(.*?\)', '', text)
    
    # Remove all text between curly braces
    text = re.sub(r'\{.*?\}', '', text)
    
    # Remove all text between angle brackets
    text = re.sub(r'<.*?>', '', text)
    
    # Remove all text between double quotes
    #text = re.sub(r'"(.*?)"', '', text)
    
    # Remove all text between single quotes
    #text = re.sub(r"'(.*?)'", '', text)
    
    # Remove everything between **____**
    text = re.sub(r'\*\*(.*?)\*\*','',text)
    
    # Remove words followed by ^
    text = re.sub(r'(\^\w*)','',text)
    
    # Remove mentions, hashtags, links
    text = re.sub(r'\||:|\-|\n|(@\w+.*?)|(http\w\S+.*?)|(#\w+)','',text)
    
    # Remove all ^Your daily posts
    text = re.sub(r'^Your daily trading discussion.*|^We are now using AutoModerator.*|^Every time a new submission.*|^Trading discussion only.*','',text)
    
    # Remove all newlines, tabs, and extra whitespace
    text = re.sub(r'[\n\t]+', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    
    
    return text

In [6]:
%%time

pandarallel.initialize()
post_data_full['clean_text'] = post_data_full['body'].parallel_apply(lambda x: clean_text(str(x)))
post_data_full = post_data_full[['id','title','clean_text','author','timestamp','upvotes','num_comments','score']]
post_data_full.groupby(['clean_text'])['author'].count().sort_values(ascending=False).head(5)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 2.53 s, sys: 926 ms, total: 3.45 s
Wall time: 10.1 s


clean_text
            1400263
nan          661710
&amp;;         1012
                648
Discuss.        509
Name: author, dtype: int64

In [12]:
## Creating a copy
df = post_data_full

In [88]:
%%time

pandarallel.initialize()

stop_words = set(nltk.corpus.stopwords.words('english'))
text = df['title'].str.lower().str.cat(sep=' ')

text_token = nltk.tokenize.word_tokenize(text)
text_token = [word for word in text_token if len(word) > 1]
text_token = [word for word in text_token if not word.isnumeric()]
text_token = [word for word in text_token if word not in stop_words]

word_cnt = Counter(text_token)
word_cnt.most_common(10)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 3min 4s, sys: 1.97 s, total: 3min 6s
Wall time: 3min 6s


[('gme', 195396),
 ('buy', 119774),
 ('amc', 104421),
 ("'s", 90274),
 ('stock', 87597),
 ('...', 75084),
 ('moon', 74195),
 ('hold', 70079),
 ('like', 61723),
 ('robinhood', 61143)]

In [92]:
word_cnt.most_common(20)

[('gme', 195396),
 ('buy', 119774),
 ('amc', 104421),
 ("'s", 90274),
 ('stock', 87597),
 ('...', 75084),
 ('moon', 74195),
 ('hold', 70079),
 ('like', 61723),
 ('robinhood', 61143),
 ('short', 60488),
 ('get', 58732),
 ('today', 57710),
 ('go', 54476),
 ('market', 54380),
 ('going', 53307),
 ("n't", 52333),
 ('new', 48931),
 ('time', 47296),
 ('let', 45540)]

In [93]:
post_data_full.groupby(['author'])['score'].mean().sort_values(ascending=False).head(10)

author
AdjustedClimatology    196211.0
1daBread               178528.0
CaptainOats8690        172631.0
string_beans           170922.0
mcuban                 157954.0
LilHMoney              123128.0
yellow_duck12          120463.0
tannerwastaken         116812.0
TheRavinRaven          116353.0
Lansman                109162.0
Name: score, dtype: float64

In [94]:
post_data_full.groupby(['author'])['upvotes'].sum().sort_values(ascending=False).head(10)

author
[deleted]             71501
haupt91               55636
realdotards           34431
ITradeBaconFutures    29342
GodEmperorMusk        27107
dajesus77             23470
SIThereAndThere       15967
Fiercehero            14811
AutoModerator         12011
Bombadilionare        11765
Name: upvotes, dtype: int64

In [95]:
post_data_full.groupby(['author'])['num_comments'].mean().sort_values(ascending=False).head(10)

author
wallstreetboyfriend    44567.750000
premier_               27140.714286
mcuban                 27014.000000
grebfar                26051.875000
Dan_inKuwait           23408.500000
MrBeast100kinvest      16845.000000
GoBeaversOSU           14607.000000
Stylux                 12379.714286
CallsOnAlcoholism      11618.333333
Shrubber               10912.000000
Name: num_comments, dtype: float64

## Topic Modelling

In [4]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# import pyLDAvis.gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [5]:
import warnings

warnings.simplefilter('ignore')

In [14]:
%%time

from multiprocessing import Pool

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in str(doc).lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

def parallel_clean(data):
    with Pool() as pool:
        results = pool.map(clean, data)
    return [result.split() for result in results]

# post_list = df['clean_text'].tolist()
# post_clean = parallel_clean(post_list)

title_list = df['title'].tolist()
title_clean = parallel_clean(title_list)

CPU times: user 6.17 s, sys: 2.1 s, total: 8.27 s
Wall time: 19.6 s


In [11]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(title_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.

%time doc_term_matrix = [dictionary.doc2bow(doc) for doc in title_clean]

CPU times: user 17 s, sys: 638 ms, total: 17.6 s
Wall time: 17.6 s


In [106]:
# Save the dictionary and corpus
with open('DMP/dictionary.pkl', 'wb') as f:
    pickle.dump(dictionary, f)
with open('DMP/corpus.pkl', 'wb') as f:
    pickle.dump(doc_term_matrix, f)

In [12]:
import multiprocessing

num_processors = multiprocessing.cpu_count()
num_processors

16

In [13]:
%%time

num_topics = 10
iterations = 100
passes = 20
workers = num_processors - 1
eval_every = None

ldamodel = LdaMulticore(corpus=doc_term_matrix,
                       id2word=dictionary,
                       eta='auto',
                       num_topics=num_topics,
                       iterations=iterations,
                       passes=passes,
                       eval_every=eval_every,
                       workers = workers)

CPU times: user 24min 1s, sys: 4min 13s, total: 28min 14s
Wall time: 27min 38s


In [16]:
print(*ldamodel.print_topics(num_topics=num_topics, num_words=15), sep='\n')

(0, '0.072*"robinhood" + 0.049*"stock" + 0.027*"trading" + 0.020*"dd" + 0.015*"trade" + 0.015*"market" + 0.013*"app" + 0.013*"rh" + 0.012*"move" + 0.010*"use" + 0.010*"order" + 0.010*"broker" + 0.009*"stonks" + 0.008*"thread" + 0.008*"new"')
(1, '0.028*"call" + 0.024*"option" + 0.023*"day" + 0.022*"put" + 0.019*"week" + 0.019*"market" + 0.017*"yolo" + 0.014*"today" + 0.013*"bought" + 0.011*"spy" + 0.010*"year" + 0.010*"first" + 0.010*"last" + 0.010*"tsla" + 0.009*"share"')
(2, '0.039*"like" + 0.037*"bb" + 0.027*"im" + 0.026*"guy" + 0.023*"i’m" + 0.022*"know" + 0.021*"anyone" + 0.014*"think" + 0.013*"look" + 0.012*"good" + 0.011*"wall" + 0.011*"someone" + 0.011*"stock" + 0.010*"street" + 0.010*"love"')
(3, '0.066*"short" + 0.029*"holding" + 0.023*"squeeze" + 0.023*"tendies" + 0.021*"price" + 0.013*"position" + 0.012*"long" + 0.012*"coming" + 0.010*"still" + 0.009*"share" + 0.008*"nio" + 0.008*"interest" + 0.008*"may" + 0.008*"war" + 0.007*"2020"')
(4, '0.021*"fund" + 0.018*"hedge" + 0.0

In [15]:
ldamodel.save('lda_post')

In [17]:
%%time

lda_display = gensimvis.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

CPU times: user 3min 48s, sys: 439 ms, total: 3min 49s
Wall time: 3min 53s


In [None]:
topic_distributions = [ldamodel.get_document_topics(doc) for doc in doc_term_matrix]

# Choose the topic with the highest probability for each document
labels = [max(td, key=lambda x: x[1])[0] for td in topic_distributions]

# Assign the labels to the corresponding rows in your dataset
df['topic_label'] = labels

In [None]:
import matplotlib.pyplot as plt


df['time_period'] = pd.PeriodIndex(df['timestamp'], freq='M')
count_df = df.groupby(['time_period', 'topic_label']).size().reset_index(name='count')

count_df['time_period'] = count_df['time_period'].dt.strftime('%Y-%m')

# Plot the topic timeline
fig, ax = plt.subplots()
for topic in count_df['topic_label'].unique():
    topic_data = count_df[count_df['topic_label'] == topic]
    ax.scatter(topic_data['time_period'], topic_data['count'], label='Topic {}'.format(topic))
ax.set_xlabel('Time Period')
ax.set_ylabel('Number of Documents')
ax.legend()
plt.xticks(rotation=90)
#ax.set_xticks(quarter_periods)
#ax.set_xticklabels(quarter_periods.strftime('%Y-Q%q'))
plt.show()

## Coherence

In [None]:
num_topics = 10
iterations = 100
passes = 20
workers = num_processors - 1
eval_every = None

ldamodel = LdaMulticore(corpus=doc_term_matrix,
                       id2word=dictionary,
                       eta='auto',
                       num_topics=num_topics,
                       iterations=iterations,
                       passes=passes,
                       eval_every=eval_every,
                       workers = workers)

In [16]:
## Selecting the best no of topics

def compute_coherence_values(dictionary, corpus, texts, 
                             cohere, limit, start=2, step=2):

    coherence_values = []
    
    workers=15

    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus,
                       id2word=dictionary,
                       eta='auto',
                       num_topics=num_topics,
                       iterations=100,
                       passes=20,
                       eval_every=None,
                       workers = workers)
        coherencemodel = CoherenceModel(model=model, 
                                        texts=texts, 
                                        dictionary=dictionary, 
                                        coherence=cohere)
        coherence_values.append(coherencemodel.get_coherence())

    return coherence_values

In [15]:
limit=12
start=4
step=2

In [9]:

with open('dictionary.pkl', 'rb') as f:
    dictionary = pickle.load(f)
    
with open('corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)


In [None]:
coherence_values = compute_coherence_values(dictionary=dictionary, 
                                            corpus=corpus, 
                                            texts=title_clean, 
                                            cohere='c_v', 
                                            start=start, 
                                            limit=limit, 
                                            step=step)

In [None]:
plt.figure(figsize=(8,5))

# Create a custom x-axis
x = range(start, limit, step)

# Build the line plot
ax = sns.lineplot(x=x, y=coherence_values, color='#238C8C')

# Set titles and labels
plt.title("Best Number of Topics for LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.xlim(start, limit)
plt.xticks(range(2, limit, step))

# Add a vertical line to show the optimum number of topics
plt.axvline(x[np.argmax(coherence_values)], 
            color='#F26457', linestyle='--')

# Draw a custom legend
legend_elements = [Line2D([0], [0], color='#238C8C', 
                          ls='-', label='Coherence Value (c_v)'),
                   Line2D([0], [1], color='#F26457', 
                          ls='--', label='Optimal Number of Topics')]

ax.legend(handles=legend_elements, loc='upper right')