In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk import ngrams, word_tokenize
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height in inches) and colour palette for the seaborn plots in this notebook.
sns.set(rc={'figure.figsize': (11, 8)})
sns.set_palette('pastel')
# Tokenizer data for nltk's word_tokenize, which get_ngrams calls.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource instead - confirm against the installed version.
nltk.download('punkt')

In [None]:
# Load the full r/wallstreetbets posts export and preview the first rows
POSTS_CSV_PATH = '/kaggle/input/reddit-rwallstreetbets/r_wallstreetbets_posts.csv'
wallstreet_raw = pd.read_csv(POSTS_CSV_PATH)
wallstreet_raw.head()

The dataset will be limited to the columns (id, title, author, created_datetime) as the rest are of little interest.

In [None]:
# Keep only the columns used by the analysis.
# read_csv is called without parse_dates, so created_datetime is converted to a
# real datetime dtype here; every later cell relies on the `.dt` accessor.
# pd.to_datetime is a no-op if the column is already datetime.
wallstreet_filtered = (
    wallstreet_raw[['id', 'title', 'author', 'created_datetime']]
    .assign(created_datetime=lambda posts: pd.to_datetime(posts['created_datetime']))
    .copy()
)
wallstreet_filtered.head()

## The first question : How has the activity been on this subreddit?

In order to answer this, we will look at the number of posts it has received over the years.

In [None]:
# Count posts per calendar year.
# The column names are set explicitly: the names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', <series name>)
# and pandas 2.x (<series name>, 'count'), so relying on 'index' is fragile.
posts_per_year = (
    wallstreet_filtered['created_datetime'].dt.year
    .value_counts()
    .rename_axis('year')
    .reset_index(name='posts')
)
ax = sns.barplot(data=posts_per_year, x='year', y='posts')
ax.set(xlabel = 'Year', ylabel = 'Number of posts');

- The data roots back to 2012, which could be the year of inception of this subreddit.

- The first major bit of activity is in 2018, which I believe is around the time that Bitcoin was rallying.

- The amount of activity in 2021 has surpassed that of 2020 even though it is just the beginning of February.

- Hence, the activity certainly has been at its peak during the last couple of months.

A follow-up on this is to look at the activity coming from authors posting for the first time.

We will look at the number of authors who posted for the first time over the years.

If the activity is from new followers of this subreddit, we should expect a large number of first time posts being made in the recent years. 

In [None]:
# Earliest post timestamp per author, plus the calendar year of that first post
each_users_datetime_of_first_post = (
    wallstreet_filtered
    .groupby('author')['created_datetime']
    .min()
    .rename('datetime_of_first_post')
    .reset_index()
)
each_users_datetime_of_first_post['year'] = (
    each_users_datetime_of_first_post['datetime_of_first_post'].dt.year
)
each_users_datetime_of_first_post.head()

In [None]:
# Number of authors whose first post falls in each year.
# Column names are set explicitly because the names produced by
# value_counts().reset_index() changed in pandas 2.0 (no more 'index' column).
first_time_authors_per_year = (
    each_users_datetime_of_first_post['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='first_time_authors')
)
ax = sns.barplot(data=first_time_authors_per_year, x='year', y='first_time_authors')
ax.set(xlabel = 'Year', ylabel = 'Number of first time authors');

- The above plot shows that 2021 is the year which saw the most new authors. That means the activity is from new followers rather than the existing followers.

- This reddit saw an amount of activity in 2021 that is more than the combined activity of the previous years.

- Considering we are only one month into 2021, this is staggering.


#### Having noticed that the majority of the traffic took place in 2020 and 2021, let's dig into these to see at what point this subreddit found momentum.

In [None]:
# Posts created in 2020 or 2021, with month and year split out for grouping
created_in_2020_or_2021 = wallstreet_filtered['created_datetime'].dt.year.isin([2020, 2021])
wallstreet_filtered_2020_2021 = wallstreet_filtered.loc[created_in_2020_or_2021].assign(
    month=lambda posts: posts['created_datetime'].dt.month,
    year=lambda posts: posts['created_datetime'].dt.year,
)
wallstreet_filtered_2020_2021.head()

In [None]:
# Posts per month, one bar colour per year.
# Each row of the dataset is a post, so the y axis is labelled as posts
# (the earlier yearly plot uses the same wording).
posts_per_month = (
    wallstreet_filtered_2020_2021
    .groupby(['year', 'month'])['id']
    .count()
    .reset_index(name='posts')
)
ax = sns.barplot(data=posts_per_month, x='month', y='posts', hue='year')
ax.set(xlabel='Month', ylabel='Number of posts')
# Trailing semicolon keeps the Legend repr out of the cell output
ax.legend(loc='upper right');

It is clear as day that the traffic to this reddit only began at the start of 2021. Things were pretty normal before this.

Hence, for the rest of the analysis, we will focus on the activity in 2021 alone.

In [None]:
# Posts created in 2021; `day` is the day of the month taken from the creation timestamp
created_in_2021 = wallstreet_filtered['created_datetime'].dt.year == 2021
wallstreet_filtered_2021 = wallstreet_filtered.loc[created_in_2021].assign(
    day=lambda posts: posts['created_datetime'].dt.day
)
wallstreet_filtered_2021.head(10)

In [None]:
# Posts per day of January 2021.
# `day` is only a day-of-month number, so the frame is restricted to January first;
# otherwise posts from any later month of 2021 would be added to the January bar
# with the same day number, contradicting the axis label.
posts_january_2021 = wallstreet_filtered_2021[
    wallstreet_filtered_2021['created_datetime'].dt.month == 1
]
posts_per_day = posts_january_2021.groupby('day')['id'].count().reset_index(name='posts')
ax = sns.barplot(data=posts_per_day, x='day', y='posts')
# Rows are posts, not comments
ax.set(xlabel='Day of January', ylabel='Number of posts');

Looks like it was the 27th of January that kicked off activity on this Reddit.

## The next question of interest: Who were the most active authors?

These authors could be the most influential of the lot.

In [None]:
# Number of 2021 posts per author, most active first.
# Columns are named explicitly: with pandas >= 2.0 value_counts().reset_index()
# yields ('author', 'count'), so the old rename map {'index': ..., 'author': ...}
# would label the author column as 'Number of posts'.
(
    wallstreet_filtered_2021['author']
    .value_counts()
    .rename_axis('Author')
    .reset_index(name='Number of posts')
)


- Interesting. A large amount of comments were deleted by the moderator. Wonder why.

- This hinders my quest to find the most active authors.

- However, I will be ignoring the deleted comments and continue looking.


Let's visualize the activity pattern of the top 5 authors on this subreddit.

In [None]:
# Rank authors by number of 2021 posts and take positions 1-5.
# Position 0 (the single most frequent value) is skipped on purpose;
# presumably it is the placeholder for deleted authors noted above - verify against the table.
author_post_counts = wallstreet_filtered_2021['author'].value_counts()
top_author_names = author_post_counts.iloc[1:6].index
posted_by_top_author = wallstreet_filtered_2021['author'].isin(top_author_names)
top_5_active_authors = wallstreet_filtered_2021[posted_by_top_author]
top_5_active_authors.head(10)

In [None]:
# Posts per (day, author) for the most active authors, January only.
# `day` is a day-of-month number, so restricting to January keeps each value
# tied to one calendar date and matches the variable name.
january_posts_by_top_authors = top_5_active_authors[
    top_5_active_authors['created_datetime'].dt.month == 1
]
top_5_active_authors_posts_per_day_in_January = (
    january_posts_by_top_authors
    .groupby(['day', 'author'])['id']
    .count()
    .reset_index(name='count')
)
top_5_active_authors_posts_per_day_in_January

In [None]:
# Daily post counts, one bar colour per author
ax = sns.barplot(
    x='day',
    y='count',
    hue='author',
    data=top_5_active_authors_posts_per_day_in_January,
)
ax.legend(loc='upper left');

- From the above bar plot, the most consistent author is CappedCrib, who has been regular at posting.

- There isn't much here to help us answer if there were any influential authors/followers.

- Let's look at some of their posts.

In [None]:
# All posts by one of the top authors, ordered by day of month
written_by_dhiral = top_5_active_authors['author'] == 'dhiral1994'
top_5_active_authors.loc[written_by_dhiral].sort_values(by='day')

In [None]:
# CappedCrib's posts from day 26 of the month onwards, ordered by day
capped_crib_from_day_26 = (
    top_5_active_authors['author'].eq('CappedCrib')
    & top_5_active_authors['day'].ge(26)
)
top_5_active_authors.loc[capped_crib_from_day_26].sort_values(by='day')

## The final question: What were the most common phrases used by the authors?

I am sure it would be "TO THE MOON", but it would be interesting to see what the others are.

In [None]:
# Normalise each title: cast to str, lowercase, and keep only alphanumeric
# and whitespace characters (punctuation and symbols are dropped)
wallstreet_2021_comments_cleaned = wallstreet_filtered_2021['title'].map(
    lambda title: ''.join(
        ch for ch in str(title).lower() if ch.isalnum() or ch.isspace()
    )
)
wallstreet_2021_comments_cleaned.head()

In [None]:
def get_ngrams(text, n):
    '''
    Tokenize text into words and return every run of n consecutive tokens as a phrase.

    INPUT
    text - string, any text to create ngrams for
    n    - int, the n in the ngram i.e n = 2 will create all phrases of 2 consecutive words

    OUTPUT
    list of strings, each string being a phrase of n tokens joined by single spaces,
    in order of appearance (repeated phrases are kept)
    '''
    # The tokens are joined directly: the previous bare `except: continue` around
    # the join could only mask unrelated errors, since joining a tuple of string
    # tokens does not fail.
    return [' '.join(gram) for gram in ngrams(word_tokenize(text), n)]

In [None]:
def get_top_n_phrases(phrase_series, gram_n = 3, top_n = 10):
    '''
    Count every phrase of gram_n words across a series of texts and return the most frequent ones.

    INPUT
    phrase_series - pandas series, a series of text for which a count of all phrases of n words is required
    gram_n        - int, default 3, the n in the ngram i.e gram_n = 2 will count all phrases of 2 words
    top_n         - int, default 10, the number of top results to return

    OUTPUT
    a pandas dataframe with two columns (phrase, count), sorted by count in descending order
    and limited to top_n rows
    '''
    phrase_counts = Counter()
    # One list of phrases per text; Counter.update adds one per occurrence
    for phrases in phrase_series.apply(get_ngrams, args=(gram_n,)):
        phrase_counts.update(phrases)
    # most_common() sorts by count, descending; slicing afterwards keeps the
    # same [:top_n] semantics as before for any value of top_n
    return pd.DataFrame(phrase_counts.most_common()[:top_n], columns=['phrase', 'count'])

In [None]:
# The two-, three- and four-word tables are each plotted in the cells below, so all
# three must be computed here; with the three- and four-word lines commented out the
# notebook raised NameError on Restart & Run All.
two_word_phrases = get_top_n_phrases(wallstreet_2021_comments_cleaned, 2)
three_word_phrases = get_top_n_phrases(wallstreet_2021_comments_cleaned, 3)
four_word_phrases = get_top_n_phrases(wallstreet_2021_comments_cleaned, 4)
# Five-word phrases are not plotted anywhere, so this one stays disabled.
# five_word_phrases = get_top_n_phrases(wallstreet_2021_comments_cleaned, 5)

In [None]:
# Horizontal bars: most frequent two-word phrases and how often each occurs
sns.barplot(x='count', y='phrase', data=two_word_phrases);

In [None]:
# Horizontal bars: most frequent three-word phrases and how often each occurs
sns.barplot(x='count', y='phrase', data=three_word_phrases);

In [None]:
# Horizontal bars: most frequent four-word phrases and how often each occurs
sns.barplot(x='count', y='phrase', data=four_word_phrases);