# Reddit WallStreetBets Posts

**Importing the necessary Libraries**

In [None]:
# Silence all Python warnings so library notices do not clutter the cell outputs
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data handling and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn theme for every figure; `cmap` holds seaborn's `mako_r` colormap
sns.set_style('darkgrid')
cmap = sns.cm.mako_r

# Draw matplotlib figures directly in the notebook output
%matplotlib inline

# Regex, tokenisation, word-cloud and sentiment-analysis tools
import re
from nltk import word_tokenize, corpus
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer

# Vocabulary from the NLTK `words` corpus, stored as a set for fast membership
# tests; the word-cloud cells use it to keep only recognised English words
english_words = set(corpus.words.words())

# Inject a CSS rule targeting `.output_png` so image outputs are centred
from IPython.core.display import HTML
HTML("""<style> 
            .output_png { display: table-cell; text-align: center; vertical-align: middle; } 
     </style> """)

**Importing the data with pandas `read_csv()` and dropping the `id`, `url` and `created` columns, since they are not used in this analysis**

In [None]:
# Read the WallStreetBets posts and, in the same expression, discard the
# `id`, `url` and `created` columns that the rest of the notebook never uses.
reddit = (
    pd.read_csv('../input/reddit-wallstreetsbets-posts/reddit_wsb.csv')
    .drop(columns=['id', 'url', 'created'])
)

**Calling head() and info() in the DataFrame**

In [None]:
# Preview the first rows to check the remaining columns and what the values look like
reddit.head()

In [None]:
# Summarise column dtypes and non-null counts to spot missing values
reddit.info()

From the above information on the DataFrame we can see that only the `body` column contains NaN values, and that roughly half of its cells are NaN.

---
**Which day of the week has the most posts?**

1. We are converting the 'timestamp' to a datetime object
2. Using the weekday() from the datetime object we get the Day of the Week
3. We will plot a bar plot with Seaborn and order the days from Monday to Sunday

In [None]:
# Parse the timestamp column into pandas datetimes (re-running this cell is harmless)
reddit['timestamp'] = pd.to_datetime(reddit['timestamp'])

# pandas numbers weekdays 0 (Monday) .. 6 (Sunday); the dict order doubles as the plotting order
day_of_the_week = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
days_order = list(day_of_the_week.values())

# Vectorised lookup through the .dt accessor instead of a per-row lambda:
# a missing timestamp (NaT) becomes NaN here rather than raising KeyError
reddit['Weekday'] = reddit['timestamp'].dt.weekday.map(day_of_the_week)

# Count once and align to calendar order; a day without posts is reported as 0
weekday_counts = reddit['Weekday'].value_counts().reindex(days_order, fill_value=0)

fig, ax = plt.subplots(figsize=(14, 6))

sns.barplot(x=weekday_counts.index, y=weekday_counts.values, order=days_order, ax=ax)

ax.set_title("No. of Posts vs Day of the Week", fontsize=15)
ax.set_xlabel("Days", fontsize=15)
ax.set_ylabel("No. of Posts", fontsize=15)

plt.show()

From the above bar plot we can clearly see that there was a huge number of posts on Friday.

---
**Focusing more on the 'title' and 'body' column of the data**

1. We will preprocess the text data in the title and body using `clean_text_date()`.
2. The function removes @handles, URLs, special characters, single characters and extra spaces.

In [None]:
# Pull out the two free-text columns with their NaN rows removed
reddit_title, reddit_body = (reddit[column].dropna() for column in ('title', 'body'))


def clean_text_date(text):
    """Normalise raw post text into lowercase, space-separated word tokens.

    Steps, in order:
      1. lowercase the text;
      2. remove @handles and anything starting with "http" (URLs);
      3. keep only runs of word characters, which drops punctuation and symbols;
      4. drop every token that is a single ASCII letter, wherever it appears
         (start, end, or several in a row);
      5. join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Cleaned text without leading, trailing or repeated whitespace; the
        empty string if no token survives.
    """
    text = text.lower()

    # Raw-string patterns, so the regex escapes are not parsed as string escapes
    text = re.sub(r'@\S+', '', text)     # @handles
    text = re.sub(r'http\S+', '', text)  # URLs

    # Tokenising on word characters also discards every special character
    tokens = re.findall(r'\w+', text)

    # Filtering token by token removes *all* single letters; a whitespace-anchored
    # substitution cannot, because neighbouring matches compete for the same space
    # and nothing matches at the very start or end of the text
    kept = [token for token in tokens if not re.fullmatch(r'[a-zA-Z]', token)]

    # Joining with one space leaves no extra whitespace to clean up afterwards
    return ' '.join(kept)

    
# Run the cleaner over every title and body (the function is passed directly, no lambda wrapper)
reddit_title = reddit_title.apply(clean_text_date)
reddit_body = reddit_body.apply(clean_text_date)

---
**Plotting histograms to see the length (number of words) distribution of the title and body**

1. We use the word_tokenize from nltk on each Title and Body to get a list of Lengths.
2. Then using the Seaborn histplot we plot a histogram. 

In [None]:
# Token count per cleaned title / body
title_length = [len(tokens) for tokens in map(word_tokenize, reddit_title)]
body_length = [len(tokens) for tokens in map(word_tokenize, reddit_body)]

fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(16, 6))

# (axes, data, number of bins, x-axis label) for each of the two panels
panels = (
    (axis1, title_length, 50, "Length of Title"),
    (axis2, body_length, 40, "Length of Body"),
)
for axis, lengths, n_bins, label in panels:
    sns.histplot(lengths, bins=n_bins, kde=True, ax=axis)
    axis.set_xlabel(label)

plt.show()

---
**WordCloud for Title and Body of the Post**

In [None]:
# Keep only tokens found in the English vocabulary, feeding them to a single
# str.join instead of growing a string with += inside nested loops
word_cloud_string = " ".join(
    word
    for text in reddit_title
    for word in word_tokenize(text)
    if word.lower() in english_words
)

# Copy of wordcloud's default stop-word list (extend this set to hide more words)
description_stopwords = set(STOPWORDS)

my_word_cloud = WordCloud(background_color='white', stopwords=description_stopwords).generate(word_cloud_string)

fig, ax = plt.subplots(figsize=(10, 20))
ax.imshow(my_word_cloud, interpolation='bilinear')
ax.set_title("Word Cloud for Post Title", fontsize=20)
ax.axis('off')
plt.show()

In [None]:
# Keep only tokens found in the English vocabulary, feeding them to a single
# str.join instead of growing a string with += inside nested loops
word_cloud_string = " ".join(
    word
    for text in reddit_body
    for word in word_tokenize(text)
    if word.lower() in english_words
)

# Copy of wordcloud's default stop-word list (extend this set to hide more words)
description_stopwords = set(STOPWORDS)

my_word_cloud = WordCloud(background_color='white', stopwords=description_stopwords).generate(word_cloud_string)

fig, ax = plt.subplots(figsize=(10, 20))
ax.imshow(my_word_cloud, interpolation='bilinear')
ax.set_title("Word Cloud for Post Body", fontsize=20)
ax.axis('off')
plt.show()

---
**Sentiment Analysis Using SentimentIntensityAnalyzer from nltk.sentiment**

1. Calling get_sentiment() on each Title and Body
2. Creating a new Column called 'Sentiment' and storing the respective sentiment
3. Using the Sentiment column to plot various graphs.

In [None]:
def get_sentiment(sia, text):
    """Label ``text`` as "Positive", "Negative" or "Neutral" from its compound score.

    Parameters
    ----------
    sia : object
        Analyzer exposing ``polarity_scores(text)`` that returns a mapping with a
        ``"compound"`` entry (this notebook passes a SentimentIntensityAnalyzer).
    text : str
        Text to score.

    Returns
    -------
    str
        "Positive" when the compound score is above 0, "Negative" when it is
        below 0, and "Neutral" when it is exactly 0.
    """
    # Score the text once and reuse the value for both comparisons
    compound = sia.polarity_scores(text)["compound"]

    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

# A single analyzer instance scores every title and body
sia = SentimentIntensityAnalyzer()

# Wrap each cleaned Series in a one-column frame and attach its sentiment label
reddit_title_df = reddit_title.to_frame(name='Title').assign(
    Sentiment=lambda frame: frame['Title'].apply(lambda text: get_sentiment(sia, text))
)
reddit_body_df = reddit_body.to_frame(name='Body').assign(
    Sentiment=lambda frame: frame['Body'].apply(lambda text: get_sentiment(sia, text))
)

**Plotting a Bar Graph for Sentiment Counts**

In [None]:
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(12, 5))

order = ['Positive', 'Neutral', 'Negative']

# One panel per text field: tally the labels, then draw the bars in a fixed order
panels = (
    (axis1, reddit_title_df, "For Title"),
    (axis2, reddit_body_df, "For Body"),
)
for axis, frame, heading in panels:
    tally = frame['Sentiment'].value_counts()
    sns.barplot(x=tally.index, y=tally.values, order=order, ax=axis)
    axis.set_title(heading)

plt.show()

---
**WordCloud for Different Sentiment**

1. Title

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

sentiments = ['Positive', 'Neutral', 'Negative']

# The stop-word set is the same for every panel, so build it once outside the loop
description_stopwords = set(STOPWORDS)

for ax, sentiment in zip(axes, sentiments):
    # .loc with a boolean mask picks the rows and the column in one step (no chained indexing)
    titles = reddit_title_df.loc[reddit_title_df['Sentiment'] == sentiment, 'Title']

    # Keep only recognised English words; a single join replaces repeated string +=
    word_cloud_string = " ".join(
        word
        for text in titles
        for word in word_tokenize(text)
        if word.lower() in english_words
    )

    my_word_cloud = WordCloud(background_color='white', stopwords=description_stopwords).generate(word_cloud_string)
    ax.imshow(my_word_cloud, interpolation='bilinear')
    ax.set_title(f"Word Cloud for Post Title with {sentiment} Sentiment", fontsize=20)
    ax.axis('off')

plt.show()

---
2. Body

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(12, 18))

sentiments = ['Positive', 'Neutral', 'Negative']

# The stop-word set is the same for every panel, so build it once outside the loop
description_stopwords = set(STOPWORDS)

for ax, sentiment in zip(axes, sentiments):
    # .loc with a boolean mask picks the rows and the column in one step (no chained indexing)
    bodies = reddit_body_df.loc[reddit_body_df['Sentiment'] == sentiment, 'Body']

    # Keep only recognised English words; a single join replaces repeated string +=
    word_cloud_string = " ".join(
        word
        for text in bodies
        for word in word_tokenize(text)
        if word.lower() in english_words
    )

    my_word_cloud = WordCloud(background_color='white', stopwords=description_stopwords).generate(word_cloud_string)
    ax.imshow(my_word_cloud, interpolation='bilinear')
    ax.set_title(f"Word Cloud for Post Body with {sentiment} Sentiment", fontsize=20)
    ax.axis('off')

plt.show()

---

# Thank You