In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import spacy
import matplotlib.pyplot as plt
import multiprocessing
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import freqdist
plt.style.use('ggplot')

In [None]:
# Load the posts CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/reddit-wallstreetsbets-posts/reddit_wsb.csv')

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows to see which columns are available.
df.head()

### Flag titles that mention one of the following stocks:

In [None]:
# Tickers to look for in post titles. They are plain alphanumerics, so they can
# be embedded in a regex without escaping.
stocks = ['GME', 'AMC', 'NOC', 'BB', 'TR', 'BBW', 'KOSS']

for stock in stocks:
    name = f'is_{stock.lower()}'
    # Match the ticker as a whole word only. A plain substring search makes
    # 'TR' fire on "street"/"trade" and 'BB' fire on every 'BBW' title.
    pattern = rf'\b{stock}\b'
    # na=False: a missing title is "no mention". Without it the NaN returned by
    # str.contains is truthy and would be counted as a mention.
    df[name] = (
        df['title']
        .str.contains(pattern, case=False, regex=True, na=False)
        .astype(int)
    )

In [None]:
# Parse the 'timestamp' column into pandas datetimes (needed for the .dt accessor below).
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Split the timestamp into a calendar date (datetime.date objects) and an hour
# of day, used as the group-by keys for the daily and hourly counts.
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour

In [None]:
# Per-day number of titles mentioning each ticker (sum of the 0/1 flag columns).
daily_flag_columns = ['is_gme', 'is_amc', 'is_noc', 'is_bb', 'is_koss', 'is_tr']
daily_agg_spec = {flag: 'sum' for flag in daily_flag_columns}
stocks_bydate = df.groupby('date').agg(daily_agg_spec).reset_index()

In [None]:
# Line chart of daily title mentions, one trace per ticker.
fig = go.Figure()

# (flag column, legend label) pairs; a loop replaces four copy-pasted traces.
daily_traces = [('is_gme', 'GME'), ('is_amc', 'AMC'), ('is_bb', 'BB'), ('is_tr', 'TR')]
for column, label in daily_traces:
    fig.add_trace(go.Scatter(x=stocks_bydate['date'].values,
                             y=stocks_bydate[column].values,
                             mode='lines+markers',
                             name=label))

fig.update_layout(
    title='Number of Mentions in Post Title per Day',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # `titlefont` is deprecated in plotly; the font belongs under `title`.
        title=dict(text='Number of Posts', font=dict(size=16)),
        tickfont_size=14,
    ))

fig.show()

### Hourly Mentions in title during Jan 29th - February 1st Peak

In [None]:
# Keep posts dated on or before 2021-02-01.
# df['date'] holds datetime.date objects, so the cutoff must be a date as well:
# ordering comparisons between date and Timestamp are deprecated or raise on
# newer pandas.
# NOTE(review): the section heading says "Jan 29th - February 1st" but only an
# upper bound is applied here — confirm whether a lower bound is intended.
peak_end = pd.to_datetime('2021-02-01').date()
at_peak = df[df['date'] <= peak_end].copy()
at_peak.shape

In [None]:
# Per-hour-of-day number of titles mentioning each ticker within the peak window.
hourly_flag_columns = ['is_gme', 'is_bb', 'is_amc', 'is_noc', 'is_tr']
hourly_agg_spec = {flag: 'sum' for flag in hourly_flag_columns}
atpeak_byhour = at_peak.groupby('hour').agg(hourly_agg_spec).reset_index()

In [None]:
# Line chart of title mentions by hour of day, one trace per ticker.
fig = go.Figure()

# (flag column, legend label) pairs; a loop replaces four copy-pasted traces.
hourly_traces = [('is_gme', 'GME'), ('is_amc', 'AMC'), ('is_bb', 'BB'), ('is_tr', 'TR')]
for column, label in hourly_traces:
    fig.add_trace(go.Scatter(x=atpeak_byhour['hour'].values,
                             y=atpeak_byhour[column].values,
                             mode='lines+markers',
                             name=label))

fig.update_layout(
    title='Number of Mentions in Post Title per Hour',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # `titlefont` is deprecated in plotly; the font belongs under `title`.
        title=dict(text='Number of Posts', font=dict(size=16)),
        tickfont_size=14,
    ))

fig.show()

# Analyze Body Data During Peak

In [None]:
# Keep only peak-window posts that have a non-null body, then show the
# resulting shape and the percentage of peak-window posts that have a body.
has_body = at_peak[at_peak['body'].notnull()].copy()
has_body.shape, np.round(has_body.shape[0]/at_peak.shape[0]*100, 2)

In [None]:
# Lower-case the body text in place (overwrites the 'body' column of has_body).
has_body['body'] = has_body['body'].str.lower()

# Random Post Sample at Peak

In [None]:
# Draw 3000 posts at random; the fixed random_state makes the draw repeatable.
sample = has_body.sample(3000, random_state=0)

In [None]:
# Load the small English pipeline by its package name. The 'en' shortcut link
# was removed in spaCy 3, while the explicit name works on both 2.x and 3.x.
nlp = spacy.load('en_core_web_sm')

In [None]:
def is_token(token):
    """
    Decide whether a spaCy token should be kept in the cleaned text.

    A token is kept only if it has non-whitespace text, is purely alphabetic,
    and is neither a stop word nor punctuation.

    Parameters
    ----------
    token : object exposing ``text``, ``is_stop``, ``is_alpha`` and
        ``is_punct`` (e.g. a spaCy ``Token``), or a falsy value.

    Returns
    -------
    bool
        True if the token should be kept, False otherwise.
    """
    # `.strip` must be *called*: the bare bound method is always truthy, so the
    # original whitespace check never fired. `.text` replaces `.string`, which
    # was removed in spaCy 3.
    if not token or not token.text.strip():
        return False
    return bool(token.is_alpha and not token.is_stop and not token.is_punct)

In [None]:
def standardize_text(df, column, nlp):
    """
    Lemmatise and filter one text column using a spaCy pipeline.

    The texts are streamed through ``nlp.pipe`` in batches, which is much
    faster than calling ``nlp`` per row via ``.apply``. See
    https://stackoverflow.com/questions/44395656/applying-spacy-parser-to-pandas-dataframe-w-multiprocessing

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of the column to clean.
    nlp : spaCy language pipeline
        Used to tokenise and lemmatise each text.

    Returns
    -------
    list of str
        One cleaned string per row, in row order: the stripped lemmas of the
        tokens accepted by ``is_token``, joined by single spaces.
    """
    # Values are cast to str, so missing entries are processed as the text 'nan'.
    texts = df[column].astype(str).values
    clean_docs = []
    # `n_threads` is dropped: it is deprecated in spaCy 2.1+ and no longer
    # accepted by `nlp.pipe` in spaCy 3.
    for doc in nlp.pipe(texts, batch_size=50):
        lemmas = [token.lemma_.strip() for token in doc if is_token(token)]
        clean_docs.append(" ".join(lemmas))
    return clean_docs

In [None]:
# Cleaned (lemmatised, filtered) version of each sampled post body.
sample['clean_body'] = standardize_text(sample, 'body', nlp)

In [None]:
# Cleaned (lemmatised, filtered) version of each sampled post title.
sample['clean_title'] = standardize_text(sample, 'title', nlp)

In [None]:
# Persist the cleaned sample to the working directory (the index is written as the first column).
sample.to_csv('sample_reddit_wallstreetsbets_posts.csv')

In [None]:
def show_freq_distribution(df, column, ngram=2, top=20):
    """
    Plot the frequency distribution of the most common n-grams in a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of the column with whitespace-separated text.
    ngram : int, default 2
        Size of the n-grams to count (2 = bigrams).
    top : int, default 20
        Number of most frequent n-grams to display.
    """
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram, ngram))
    matrix = vectorizer.fit_transform(df[column].values)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()
    # Pass the sparse matrix and feature names directly: the original built a
    # dense docs x n-grams DataFrame only to read its column labels.
    freqdist(list(features), matrix, orient='h', n=top)

### Top 20 Bigrams for Post Body

In [None]:
# Plot with the function defaults: bigrams (ngram=2), top 20.
show_freq_distribution(sample, 'clean_body')

In [None]:
# Document x bigram count matrix for the cleaned sample bodies, as a dense
# DataFrame with one column per bigram.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
matrix = vectorizer.fit_transform(sample['clean_body'].values)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# versions that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
ngrams = pd.DataFrame(matrix.toarray(), columns=list(feature_names))

In [None]:
# Attach the bigram count columns to the sampled posts. The sample index is
# reset so both frames are joined on matching 0..n-1 row positions.
check = sample.reset_index(drop=True).merge(ngrams, left_index=True, right_index=True, how='left')

In [None]:
# The 20 bigrams with the highest total count across the sample, most frequent first.
top_20 = ngrams.sum().to_frame().sort_values(0, ascending=False).head(20).index.tolist()
top_20

## How did posts evolve during the peak?

In [None]:

# Daily totals of the hand-picked bigram count columns.
selected_bigrams = [
    'hedge fund',
    'buy gme',
    'retail investor',
    'financial advice',
    'market manipulation',
    'wall street',
    'hold line',
    'short squeeze',
]
bigram_agg_spec = {bigram: 'sum' for bigram in selected_bigrams}
bigrams_bydate = check.groupby('date').agg(bigram_agg_spec).reset_index()

In [None]:
# Bar chart of daily totals for the selected bigrams, one trace per bigram.
fig = go.Figure()

# (count column, legend label, optional bar colour). None keeps plotly's
# default colour cycle, as in the original un-coloured traces.
bigram_traces = [
    ('hedge fund', 'Hedge Fund', 'rgb(55, 83, 109)'),
    ('buy gme', 'Buy GME', 'rgb(51, 185, 176)'),
    ('financial advice', 'Financial Advice', None),
    ('market manipulation', 'Market Manipulation', None),
    ('wall street', 'Wall Street', None),
    ('hold line', 'Hold Line', None),
]
for column, label, colour in bigram_traces:
    # Only pass marker_color when a colour was actually chosen.
    bar_kwargs = {} if colour is None else {'marker_color': colour}
    fig.add_trace(go.Bar(x=bigrams_bydate['date'].values,
                         y=bigrams_bydate[column].values,
                         name=label,
                         **bar_kwargs))

fig.update_layout(
    title='Posts for Selected Bigrams During Peak',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # `titlefont` is deprecated in plotly; the font belongs under `title`.
        title=dict(text='Number of Posts', font=dict(size=16)),
        tickfont_size=14,
    ))
fig.show()

### Using Word2Vec to Find Similarities in the Posts

The idea of using Word2Vec is to find which words are most likely to appear together. This could shed some light on the overall sentiment of the posts.

In [None]:
# Number of CPUs reported by the OS; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [None]:
# Clean every peak-window post body (not just the 3000-post sample) for Word2Vec.
has_body['clean_body'] = standardize_text(has_body, 'body', nlp)

In [None]:
# Tokenise each cleaned body on whitespace: one list of words per post.
sentences = list(map(str.split, has_body['clean_body']))

In [None]:
# Fit a gensim Phrases (collocation) model on the tokenised posts.
phrases = Phrases(sentences, min_count=30, progress_per=10000)

In [None]:
# Wrap the fitted Phrases model in a Phraser and apply it to the tokenised
# posts; model_sentences is the phrase-transformed corpus.
word_model = Phraser(phrases)
model_sentences = word_model[sentences]

In [None]:
# Configure (but do not yet train) the Word2Vec model.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before upgrading.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=400,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never drop to 0 workers on a
                     # single-core machine.
                     workers=max(1, cores - 1))


In [None]:
# Build the vocabulary from the phrase-transformed corpus.
w2v_model.build_vocab(model_sentences, progress_per=10000)

In [None]:
# Train on the same phrase-transformed corpus the vocabulary was built from.
# Training on the raw `sentences` would never present the phrase tokens that
# build_vocab registered, leaving their vectors untrained.
w2v_model.train(model_sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# NOTE(review): init_sims(replace=True) presumably L2-normalises the vectors in
# place (no further training afterwards) and is deprecated in newer gensim —
# confirm against the installed version.
w2v_model.init_sims(replace=True)

In [None]:
def visualize_top_positive(model, word, top):
    """
    Plot the words most similar to `word` as a horizontal bar chart.

    Parameters
    ----------
    model : trained model exposing ``wv.most_similar``
    word : list of str
        Word(s) passed as the ``positive`` argument of ``most_similar``.
    top : int
        Number of neighbours to display.

    Notes
    -----
    The plotted 'prob' column holds the similarity scores returned by
    ``most_similar``, not probabilities.
    """
    # Request exactly `top` neighbours. most_similar defaults to topn=10, so
    # slicing its result afterwards silently capped `top` at 10.
    neighbours = model.wv.most_similar(positive=word, topn=top)
    df = pd.DataFrame(neighbours, columns=['words', 'prob'])
    df.plot(x='words', y='prob', kind='barh', alpha=0.5, color='k')
    plt.legend(loc='upper right',  bbox_to_anchor=(1.02, 1.0))
    plt.show()

In [None]:
# Top 5 neighbours of the combined query 'buy' + 'gme'.
visualize_top_positive(w2v_model, ['buy', 'gme'], 5)

In [None]:
# Top 5 neighbours of 'hedgefund'.
# NOTE(review): assumes the single token 'hedgefund' is in the model
# vocabulary — verify.
visualize_top_positive(w2v_model, ['hedgefund'], 5)

It is interesting that this naive way of looking at the data is showing some relations that confirm what has been reported in the news:

For example, the words **market** and **manipulation** are likely to be related to words such as fraud, collusion, clearly, or blatant.

The same seems to be true for **hedgefund**, which is likely to be associated with words that express negative sentiment.

In [None]:
# Top 5 neighbours of the combined query 'market' + 'manipulation'.
visualize_top_positive(w2v_model, ['market', 'manipulation'], 5)

In [None]:
# Top 5 neighbours of the combined query 'hold' + 'line'.
visualize_top_positive(w2v_model, ['hold', 'line'], 5)