In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))


In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
# Wildcard import kept for compatibility; explicit imports follow it so they win on any name clash
from nltk.sentiment.util import *
from nltk.corpus import stopwords
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline


In [None]:
# Load the posts CSV into the working DataFrame
wsb_posts_csv = '../input/reddit-wallstreetsbets-posts/reddit_wsb.csv'
df = pd.read_csv(wsb_posts_csv)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Preview the first rows of the data
df.head()

In [None]:
# Measure the character length of every title and body.
# Missing bodies (NaN) are replaced by empty strings so they count as length 0
# instead of being turned into the literal text 'nan'.
# The whole column is converted in one assignment: writing through
# df['body'][i] is chained indexing, which pandas warns about and which does
# not modify df at all when copy-on-write is enabled.
df['length_title'] = df['title'].str.len()
df['body'] = df['body'].fillna('').astype(str)
df['length_body'] = df['body'].str.len()

In [None]:
# Summarise the title and body length columns.
# Series.mode() returns a Series (several values can tie), so its values are
# joined into plain text rather than formatting the Series repr into the sentence.
for label, column in (('title', 'length_title'), ('body', 'length_body')):
    lengths = df[column]
    mode_values = ', '.join(str(value) for value in lengths.mode())
    print("The max length of {} is {}".format(label, lengths.max()))
    print("The min length of {} is {}".format(label, lengths.min()))
    print("The mode length of {} is {}".format(label, mode_values))

In [None]:
# Count plot of title lengths, restricted to titles longer than 100 characters.
# The values are passed as x=: a positional Series is deprecated in older
# seaborn releases and is interpreted as `data` (not `x`) in newer ones.
long_title_lengths = df.loc[df['length_title'] > 100, 'length_title']
plt.figure(figsize=(15,10))
plt.title('Title Text Lengths > 100')
sns.countplot(x=long_title_lengths)
plt.show()

# Text Cleaning

In [None]:
def text_clean(mess):
    """Remove ASCII punctuation from text and return one clean string.

    Parameters
    ----------
    mess : str or iterable of str
        A single message, or a collection of messages such as a pandas
        Series of titles.

    Returns
    -------
    str
        For a single string, the same text without any character found in
        ``string.punctuation``. For a collection, every string element is
        cleaned the same way and the results are joined with single spaces
        so words from neighbouring messages do not run together. Elements
        that are not strings (for example NaN) are skipped.
    """
    # One translation table deletes every punctuation character in a single pass
    strip_punctuation = str.maketrans('', '', string.punctuation)

    if isinstance(mess, str):
        return mess.translate(strip_punctuation)

    # Iterating a collection yields whole messages, not characters, so each
    # message has to be cleaned on its own before joining.
    return ' '.join(
        item.translate(strip_punctuation) for item in mess if isinstance(item, str)
    )

In [None]:
# Run the whole title column through text_clean and keep the single string it returns
title_text = text_clean(df['title'])

In [None]:
# Run the whole body column through text_clean and keep the single string it returns
body_text = text_clean(df['body'])

In [None]:
# Number of characters in the combined title text
len(title_text)

In [None]:
# Number of characters in the combined body text
len(body_text)

# Generating Word Clouds

In [None]:
# Stop list for the word clouds: wordcloud's built-in STOPWORDS plus
# URL scheme fragments and whitespace escapes that should never be drawn
my_stopwords = set(STOPWORDS) | {'https', 'http', '\n', '\t'}

In [None]:
# Build the word cloud from the combined title text and draw it without axes
title_wc = WordCloud(
    stopwords=my_stopwords,
    background_color='white',
    collocations=False,
).generate(title_text)

plt.figure(figsize=(12, 10))
plt.imshow(title_wc, interpolation='bilinear')
plt.title('Most common words used in WSB Title', fontsize=20)
plt.axis('off')

In [None]:
# Build the word cloud from the combined body text and draw it without axes
body_wc = WordCloud(
    stopwords=my_stopwords,
    background_color='white',
    collocations=False,
).generate(body_text)

plt.figure(figsize=(12, 10))
plt.imshow(body_wc, interpolation='bilinear')
plt.title('Most common words used in WSB Body', fontsize=20)
plt.axis('off')

# Title Text Sentiment Tracker

In [None]:
# Fetch the NLTK resources named 'punkt' and 'vader_lexicon'.
# NOTE(review): only SentimentIntensityAnalyzer is used below, which presumably
# needs 'vader_lexicon'; 'punkt' looks unused in this notebook — confirm.
nltk.download('punkt')
nltk.download('vader_lexicon')

In [None]:
# Shared sentiment analyzer instance; sentiment_analyzer_scores reads this global
analyzer = SentimentIntensityAnalyzer()

In [None]:
# polarity_scores returns a dict shaped like:
# {'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}
def sentiment_analyzer_scores(sentence):
    """Label ``sentence`` as 'positive', 'negative' or 'neutral'.

    The label comes from the 'compound' value returned by the module-level
    ``analyzer.polarity_scores``: above 0.05 is positive, below -0.05 is
    negative, and everything in between (bounds included) is neutral.
    """
    compound = analyzer.polarity_scores(sentence)['compound']
    if compound > 0.05:
        return 'positive'
    if compound < -0.05:
        return 'negative'
    return 'neutral'

In [None]:
# Helper that turns a collection of messages into a list of sentiment labels

def sentiment_append(text):
    """Return the sentiment label of every message in ``text``, in order.

    Each element is passed to ``sentiment_analyzer_scores``; the result is a
    new list with one label per element.
    """
    return [sentiment_analyzer_scores(message) for message in text]
    

In [None]:
# Label every title, then store the labels as the `title_sentiment` column
title_moods = sentiment_append(df['title'])
df['title_sentiment'] = title_moods

In [None]:
# Count how many titles received each sentiment label

df['title_sentiment'].value_counts()

In [None]:
# Bar chart of how many titles fall under each sentiment label.
# The column is selected with x=/data= keywords: a positional Series is
# deprecated in older seaborn releases and is interpreted as `data` (not `x`)
# in newer ones.

plt.figure(figsize = (12,10))
sns.countplot(x='title_sentiment', data=df)
plt.title("Number of Each Sentiment in Title", fontsize = 20)
plt.show()

There are far more neutral titles than positive or negative titles.

# Body Text Sentiment Tracker

In [None]:
# Re-create the sentiment analyzer for the body section.
# NOTE(review): an `analyzer` instance is already created in the title section;
# this rebinding looks redundant — confirm before removing.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# polarity_scores returns a dict shaped like:
# {'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}
# NOTE(review): a function with this name is also defined in the title section.
def sentiment_analyzer_scores(sentence):
    """Map the compound polarity of ``sentence`` to a sentiment label.

    Returns 'positive' when the compound value is above 0.05, 'negative'
    when it is below -0.05, and 'neutral' otherwise.
    """
    compound = analyzer.polarity_scores(sentence)['compound']
    return (
        'positive' if compound > 0.05
        else 'negative' if compound < -0.05
        else 'neutral'
    )

In [None]:
# Helper that turns a collection of messages into a list of sentiment labels
# NOTE(review): a function with this name is also defined in the title section.

def sentiment_append(text):
    """Apply ``sentiment_analyzer_scores`` to each message and list the labels.

    The returned list has one label per element of ``text``, in the same order.
    """
    return list(map(sentiment_analyzer_scores, text))
    

In [None]:
# Label every body, then store the labels as the `body_sentiment` column
body_moods = sentiment_append(df['body'])
df['body_sentiment'] = body_moods

In [None]:
# Count how many bodies received each sentiment label
df['body_sentiment'].value_counts()

In [None]:
# Bar chart of how many bodies fall under each sentiment label.
# The column is selected with x=/data= keywords: a positional Series is
# deprecated in older seaborn releases and is interpreted as `data` (not `x`)
# in newer ones.

plt.figure(figsize = (12,10))
sns.countplot(x='body_sentiment', data=df)
plt.title("Number of Each Sentiment in Body", fontsize = 20)
plt.show()

There are far more neutral body texts than positive or negative ones, the same trend seen in the titles. Many of the neutral labels may come from NaN (missing) values in the data. A more reliable distribution of positive, neutral, and negative labels could be obtained by replacing the NaN values, or dropping them altogether, and comparing the results.

# Sentiment Correlation

In [None]:
# Correlation between the numeric columns, computed separately for each
# title sentiment label.
# Only numeric columns are kept: the frame also holds text columns (title,
# body, sentiment labels) and groupby(...).corr() raises on them in recent
# pandas versions instead of silently dropping them.
numeric_columns = df.select_dtypes(include='number')
title_corr = numeric_columns.groupby(df['title_sentiment']).corr()

plt.figure(figsize=(12,10))
sns.heatmap(title_corr, cmap = 'viridis')
plt.title("Title Correlation")
plt.show()

In [None]:
# Correlation between the numeric columns, computed separately for each
# body sentiment label.
# Only numeric columns are kept: the frame also holds text columns (title,
# body, sentiment labels) and groupby(...).corr() raises on them in recent
# pandas versions instead of silently dropping them.
numeric_columns = df.select_dtypes(include='number')
body_corr = numeric_columns.groupby(df['body_sentiment']).corr()

plt.figure(figsize=(12,10))
sns.heatmap(body_corr, cmap = 'viridis')
plt.title("Body Correlation")
plt.show()

# Positive Title Word Clouds 

In [None]:
# Empty frame whose single column will receive the positive titles
pos_title_columns = ['positive_sentiment_text']
pos_title_df = pd.DataFrame(columns=pos_title_columns)

In [None]:
# Collect the titles whose sentiment label is 'positive'.
# A boolean mask selects them in one step; unlike df[col][i] inside a Python
# loop it does not depend on the index being 0..n-1 and avoids per-row lookups.
pos_list = df.loc[df['title_sentiment'] == 'positive', 'title'].tolist()

In [None]:
# Store the collected positive titles in the `positive_sentiment_text` column
pos_title_df['positive_sentiment_text'] = pos_list

In [None]:
# Run the positive titles through text_clean and keep the single string it returns
pos_title_text = text_clean(pos_title_df['positive_sentiment_text'])

In [None]:
# Build the word cloud from the positive title text and draw it without axes
pos_title_wc = WordCloud(
    stopwords=my_stopwords,
    background_color='white',
    collocations=False,
).generate(pos_title_text)

plt.figure(figsize=(12, 10))
plt.imshow(pos_title_wc, interpolation='bilinear')
plt.title('Most common words used in Positive WSB Title', fontsize=20)
plt.axis('off')

# Negative Title Word Clouds

In [None]:
# Empty frame whose single column will receive the negative titles.
# The column is named 'negative_sentiment_text' because that is the name the
# following cells write to and read from; declaring a different name here
# would leave an unused, all-NaN column in the frame.
neg_title_df = pd.DataFrame(columns=['negative_sentiment_text'])

In [None]:
# Collect the titles whose sentiment label is 'negative'.
# A boolean mask selects them in one step; unlike df[col][i] inside a Python
# loop it does not depend on the index being 0..n-1 and avoids per-row lookups.
neg_list = df.loc[df['title_sentiment'] == 'negative', 'title'].tolist()

In [None]:
# Store the collected negative titles in the `negative_sentiment_text` column,
# which is the column the cleaning step reads
neg_title_df['negative_sentiment_text'] = neg_list

In [None]:
# Run the negative titles through text_clean and keep the single string it returns
neg_title_text = text_clean(neg_title_df['negative_sentiment_text'])

In [None]:
# Build the word cloud from the negative title text and draw it without axes
neg_title_wc = WordCloud(
    stopwords=my_stopwords,
    background_color='white',
    collocations=False,
).generate(neg_title_text)

plt.figure(figsize=(12, 10))
plt.imshow(neg_title_wc, interpolation='bilinear')
plt.title('Most common words used in Negative WSB Title', fontsize=20)
plt.axis('off')

# Neutral Title Word Clouds 

In [None]:
# Empty frame whose single column will receive the neutral titles
neutral_title_df = pd.DataFrame(
    data=None,
    columns=['neutral_sentiment_text'],
)

In [None]:
# Collect the titles whose sentiment label is 'neutral'.
# A boolean mask selects them in one step; unlike df[col][i] inside a Python
# loop it does not depend on the index being 0..n-1 and avoids per-row lookups.
neu_list = df.loc[df['title_sentiment'] == 'neutral', 'title'].tolist()

In [None]:
# Store the collected neutral titles in the `neutral_sentiment_text` column
neutral_title_df['neutral_sentiment_text'] = neu_list

In [None]:
# Run the neutral titles through text_clean and keep the single string it returns
neutral_title_text = text_clean(neutral_title_df['neutral_sentiment_text'])

In [None]:
# Build the word cloud from the neutral title text and draw it without axes
neutral_title_wc = WordCloud(
    stopwords=my_stopwords,
    background_color='white',
    collocations=False,
).generate(neutral_title_text)

plt.figure(figsize=(12, 10))
plt.imshow(neutral_title_wc, interpolation='bilinear')
plt.title('Most common words used in Neutral WSB Title', fontsize=20)
plt.axis('off')

GME, AMC, and Robinhood all seem to be extremely common words amongst WSB Titles no matter the sentiment.

# Let Me Know What You Think!

# Thank You!!!