# **This is an initial descriptive analysis of the Reddit wallstreetbets posts. It contains basic statistics on word counts, character counts, and word occurrence. At the bottom, you will find an analysis of the most commonly mentioned NYSE and other stock tickers. Enjoy!**

**Short summary:**
The average title length is 11 words.
The average body length is 120 words.
The most popular words are, unsurprisingly: gme, buy, robinhood, hold, amc.
The most popular tickers are: gme, know, one, hold, see, time, big, amc

# Import the libraries

In [None]:
import os
import re
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from datetime import date, datetime
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from scipy import stats
from nltk.corpus import stopwords
from collections import Counter

# Read the dataframe

In [None]:
# Load the posts CSV; the absolute path points into Kaggle's /kaggle/input directory.
df = pd.read_csv('/kaggle/input/reddit-wallstreetsbets-posts/reddit_wsb.csv')

# Check the head of the dataframe

In [None]:
df.head()

# Drop useless columns

In [None]:
# Drop columns that nothing below reads; 'timestamp' is kept because the
# date-related columns are derived from it later.
df = df.drop(columns=['id', 'url', 'created'])
df.head()

In [None]:
df.shape

# Add a few date-related columns for further analysis

In [None]:
# Parse the timestamp column once and derive the calendar fields through the
# vectorized ``.dt`` accessor instead of calling strptime row by row.
# A missing timestamp becomes NaT (and NaN in the derived columns); a value
# that does not match the format still raises.
parsed_timestamps = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
df['year'] = parsed_timestamps.dt.year
df['month'] = parsed_timestamps.dt.month
df['hour'] = parsed_timestamps.dt.hour
df['minute'] = parsed_timestamps.dt.minute

In [None]:
df.head()

# Normalize the text to be lowercase

In [None]:
# Lower-case both free-text columns so the word counts below are case-insensitive.
for text_col in ('title', 'body'):
    df[text_col] = df[text_col].str.lower()

In [None]:
# Keep only the first post for each distinct (already lower-cased) title.
# The frame is modified in place and the original index labels are kept,
# so the index has gaps after this step.
df.drop_duplicates(subset=['title'], keep='first', inplace=True)
df.shape

# Descriptive statistics - title

**Count the number of words and characters in a title**

In [None]:
# Words per title: split each title on whitespace and take the token-list length.
title_tokens = df['title'].str.split()
count = title_tokens.str.len()
# Relabel every entry as "<original index> words:" and order by that label;
# the totals and averages printed from `count` do not depend on this ordering.
count.index = count.index.astype(str) + ' words:'
count = count.sort_index()

print("Total number of words: ", count.sum(), "words")

In [None]:
# Mean / max / min of the per-title word counts held in `count`.
print("Average number of words per post: ", round(count.mean(),2), "words")
print("Max number of words per post: ", count.max(), "words")
print("Min number of words per post: ", count.min(), "words")

In [None]:
def word_count(df):
    """
    Add a ``title_word_count`` column with the number of words in each title.

    Words are the whitespace-separated tokens of the ``title`` column. The
    dataframe is modified in place and also returned, so both
    ``word_count(df)`` and ``df = word_count(df)`` work.

    :param df: The dataframe to be transformed; must contain a ``title`` column.
    :return: The same dataframe with the ``title_word_count`` column added.
        Rows whose title is missing get NaN instead of raising.
    """
    # Vectorized equivalent of splitting every title and taking len() of the
    # result; missing titles propagate as NaN rather than failing on .split().
    df['title_word_count'] = df['title'].str.split().str.len()
    return df

# word_count() adds the column to the frame it receives and returns that same
# frame, so this reassignment rebinds df to the same object.
df = word_count(df)

df.head()

In [None]:
# Character count of every title.
df['title_length'] = df['title'].str.len()

print("Total length of a dataset: ", df.title_length.sum(), "characters")
# The rows are Reddit post titles, not tweets, so label the statistic accordingly.
print("Average length of a title: ", round(df.title_length.mean(),0), "characters")
print(df.head())

In [None]:
# Correlate only the numeric/boolean columns: DataFrame.corr() raises on the
# text columns (title, body, timestamp) in pandas >= 2.0, where numeric_only
# defaults to False. select_dtypes works on older pandas versions as well.
# NOTE(review): 'year' is presumably excluded because it is constant in this
# dataset and would only produce NaN correlations -- confirm.
numeric_df = df.drop(columns=['year']).select_dtypes(include=['number', 'bool'])
plt.subplots(figsize=(10,8))
# The trailing semicolon suppresses the notebook's textual echo of the Axes object.
sns.heatmap(numeric_df.corr(), annot=True, linewidths=1.5, fmt=".2f");

# Most popular words used in title

In [None]:
def create_text_blob(df, text_column):
    """
    Flatten a text column into one list of lower-cased tokens.

    Each cell is split on whitespace and every token is lower-cased.

    :param df: The dataframe holding the text.
    :param text_column: Name of the column to tokenize.
    :return: A list with every token of every row, in row order. Missing
        values are skipped.
    """
    blob_text = []
    # dropna() skips missing cells, which would otherwise fail on .split();
    # str() keeps any non-string cell from raising.
    for content in df[text_column].dropna():
        blob_text.extend(token.lower() for token in str(content).split())
    return blob_text

# Flatten all titles into a single token list and preview the first 100 tokens.
blob_text = create_text_blob(df, 'title')
print(blob_text[0:100])

# Let's remove the stop words

In [None]:
# Fetch the NLTK stop-word corpus (a no-op when it is already present locally).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Keep every token that is not an English stop word, built in a single pass;
# stop_words is a set, so each membership test is O(1).
filtered_sentence = [w for w in blob_text if w not in stop_words]

print(filtered_sentence[0:100])

In [None]:
# Occurrence count of every title token that survived stop-word removal.
counts = Counter(filtered_sentence)

# Let's print the most popular words, used over 700 times

In [None]:
import plotly.express as px

# Keep only the title words that occur more than 700 times. Despite the
# variable name, the selection is threshold-based and not capped at 20 words.
top_20_words = {key: value for key, value in counts.items() if value > 700}

# Order the words by ascending count for the bar chart.
sorted_top_20_words = dict(sorted(top_20_words.items(), key=lambda item: item[1], reverse=False))

# Materialize the dict views as lists so plotly receives plain sequences.
word = list(sorted_top_20_words.keys())
count = list(sorted_top_20_words.values())

# Words on the y axis, counts on the x axis, with each count printed next to its bar.
fig = px.bar(y=word, x=count, text=count)
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [None]:
# Same selection as the previous chart (more than 700 occurrences), but only
# for words of at least 3 characters, which removes short tokens and symbols.
top_20_words_clean = {
    key: value
    for key, value in counts.items()
    if len(key) > 2 and value > 700
}

# Order the words by ascending count for the bar chart.
sorted_top_20_words_clean = dict(sorted(top_20_words_clean.items(), key=lambda item: item[1], reverse=False))

# Materialize the dict views as lists so plotly receives plain sequences.
word = list(sorted_top_20_words_clean.keys())
count = list(sorted_top_20_words_clean.values())

fig = px.bar(y=word, x=count, text=count)
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

# Descriptive statistics - body

In [None]:
# Words per body: split each body on whitespace and take the token-list length.
body_tokens = df['body'].str.split()
count = body_tokens.str.len()
# Relabel every entry as "<original index> words:" and order by that label;
# the totals and averages printed from `count` do not depend on this ordering.
count.index = count.index.astype(str) + ' words:'
count = count.sort_index()

print("Total number of words: ", count.sum(), "words")

In [None]:
# Mean / max / min of the per-body word counts held in `count`.
print("Average number of words per post: ", round(count.mean(),2), "words")
print("Max number of words per post: ", count.max(), "words")
print("Min number of words per post: ", count.min(), "words")

In [None]:
# Character count of every post body.
df['body_length'] = df['body'].str.len()

print("Total length of a dataset: ", df.body_length.sum(), "characters")
# The rows are Reddit post bodies, not tweets, so label the statistic accordingly.
print("Average length of a post body: ", round(df.body_length.mean(),0), "characters")
print(df.head())

In [None]:
def create_text_blob(df, text_column):
    """
    Flatten a text column into one list of lower-cased tokens.

    Each cell is converted to ``str``, split on whitespace, and every token
    is lower-cased.

    :param df: The dataframe holding the text.
    :param text_column: Name of the column to tokenize.
    :return: A list with every token of every row, in row order. Missing
        values are skipped rather than turned into a literal "nan" token.
    """
    blob_text = []
    # dropna() removes missing cells up front so that str() never turns a
    # NaN into the pseudo-word "nan" and pollutes the word counts.
    for content in df[text_column].dropna():
        blob_text.extend(token.lower() for token in str(content).split())
    return blob_text

# Flatten all post bodies into a single token list and preview the first 100 tokens.
blob_text = create_text_blob(df, 'body')
print(blob_text[0:100])

In [None]:
# Fetch the NLTK stop-word corpus (a no-op when it is already present locally).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Treat the token 'nan' as a stop word too, so stringified missing bodies
# never reach the word counts.
stop_words.add('nan')

# Keep every token that is not a stop word, built in a single pass;
# stop_words is a set, so each membership test is O(1).
filtered_sentence = [w for w in blob_text if w not in stop_words]

print(filtered_sentence[0:100])

In [None]:
# Occurrence count of every body token that survived stop-word removal.
counts_body = Counter(filtered_sentence)

In [None]:
import plotly.express as px

# Keep the body words that occur more than 3000 times. The 'nan' guard has to
# test the word (key): the count is an integer and can never equal the string
# "nan". Despite the variable name, the selection is not capped at 20 words.
top_20_words = {
    key: value
    for key, value in counts_body.items()
    if key != "nan" and value > 3000
}

# Order the words by ascending count for the bar chart.
sorted_top_20_words = dict(sorted(top_20_words.items(), key=lambda item: item[1], reverse=False))

# Materialize the dict views as lists so plotly receives plain sequences.
word = list(sorted_top_20_words.keys())
count = list(sorted_top_20_words.values())

fig = px.bar(y=word, x=count, text=count)
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

# **Let's see what are the most popular tickers mentioned in the body text**

In [None]:
# Load the two ticker reference tables. These paths are relative
# ("../input/..."), unlike the absolute "/kaggle/input/..." path used for the
# posts CSV, so they depend on the notebook's working directory.
nyse_tickers = pd.read_csv("../input/tickers/nyse-listed_csv.csv")
other_tickers = pd.read_csv("../input/tickers/other-listed_csv.csv")

In [None]:
nyse_tickers.head()

In [None]:
# Lower-case the ticker symbols so they can be matched against the
# lower-cased body tokens, then preview the first ten NYSE symbols.
nyse_symbols_lower = nyse_tickers['ACT Symbol'].str.lower()
other_symbols_lower = other_tickers['ACT Symbol'].str.lower()
nyse_tickers_list = nyse_symbols_lower.tolist()
other_tickers_list = other_symbols_lower.tolist()
nyse_tickers_list[0:10]

In [None]:
import plotly.express as px

# Build the lookup set once: testing membership against the list would rescan
# every ticker symbol for each distinct body word.
nyse_ticker_set = set(nyse_tickers_list)

# Keep the body words that match an NYSE symbol and occur more than 300 times.
# NOTE: matching is done on lower-cased tokens, so ordinary English words that
# happen to equal a ticker symbol are counted as ticker mentions too.
top_words = {
    key: value
    for key, value in counts_body.items()
    if key in nyse_ticker_set and value > 300
}

# Order the tickers by ascending count for the bar chart.
sorted_top_words = dict(sorted(top_words.items(), key=lambda item: item[1], reverse=False))

# Materialize the dict views as lists so plotly receives plain sequences.
word = list(sorted_top_words.keys())
count = list(sorted_top_words.values())

fig = px.bar(y=word, x=count, text=count, title='Nyse Tickers')
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()



In [None]:
import plotly.express as px

# Build the lookup set once: testing membership against the list would rescan
# every ticker symbol for each distinct body word.
other_ticker_set = set(other_tickers_list)

# Keep the body words that match a symbol from the "other" listing and occur
# more than 500 times.
# NOTE: matching is done on lower-cased tokens, so ordinary English words that
# happen to equal a ticker symbol are counted as ticker mentions too.
top_words = {
    key: value
    for key, value in counts_body.items()
    if key in other_ticker_set and value > 500
}

# Order the tickers by ascending count for the bar chart.
sorted_top_words = dict(sorted(top_words.items(), key=lambda item: item[1], reverse=False))

# Materialize the dict views as lists so plotly receives plain sequences.
word = list(sorted_top_words.keys())
count = list(sorted_top_words.values())

fig = px.bar(y=word, x=count, text=count, title='Other Tickers')
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()