# imports

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud
import math

# preprocessing data

In [None]:
# Location of the r/wallstreetbets posts dump inside the Kaggle input mount.
wsb_csv_path = "/kaggle/input/reddit-wallstreetsbets-posts/reddit_wsb.csv"
dataset = pd.read_csv(wsb_csv_path)

In [None]:
# Copy the two columns used by the rest of the notebook into plain Python
# lists: the raw title text and the score cast to a built-in int.
row_count = len(dataset)
title_raw = [dataset["title"][row] for row in range(row_count)]
score = [int(dataset["score"][row]) for row in range(row_count)]

In [None]:
# Fit a Keras tokenizer on the titles, then encode every title as a list of
# integer word ids (stored in `title`, parallel to `title_raw` and `score`).
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(title_raw)
title = tokenizer.texts_to_sequences(title_raw)

# Number of distinct words the tokenizer found.
vocabulary_size = len(tokenizer.word_index)
print(vocabulary_size)

# distribution of upvotes

In [None]:
# Bucket labels in display order, paired with the exclusive upper bound of
# every bucket except the last one, which is open-ended.
score_label = ["0", "1-10", "11-100", "101-1000", "1001-10000", "10001-100000", "100001->"]
bucket_limits = [1, 11, 101, 1001, 10001, 100001]

scores = {label: 0 for label in score_label}

for upvotes in score:
    # zip stops after the six bounded buckets; the first bound the score is
    # below decides the bucket.
    for label, limit in zip(score_label, bucket_limits):
        if upvotes < limit:
            scores[label] += 1
            break
    else:
        # Not below any bound, so the score is 100001 or more.
        scores["100001->"] += 1

# Bucket counts in the same order as the labels, ready for plotting.
value = [scores[label] for label in score_label]

fig = plt.figure(figsize=(9, 3))
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(score_label, value)
plt.show()

In [None]:
def plot_cloud(wordcloud):
    """Draw a word cloud image on a new, large matplotlib figure.

    Args:
        wordcloud: The object to display; it is passed straight to
            ``plt.imshow``.

    The axes are switched off so only the image is visible. Nothing is
    returned.
    """
    canvas_size = (40, 30)
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    plt.axis("off")

# most common words overall

In [None]:
# The bucket counts in `value` cover every post, so their sum is the number
# of posts in the dataset.
total_posts = sum(value)

# Inverse of tokenizer.word_index (id -> word), built once so that each token
# is resolved with a single dict lookup instead of scanning the vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

# Start every vocabulary word at zero, then count occurrences over all titles.
overallfreqs = dict.fromkeys(tokenizer.word_index, 0)

for encoded_title in title:
    for token_id in encoded_title:
        overallfreqs[index_to_word[token_id]] += 1

# Normalise the raw counts to "occurrences per post".
for word in overallfreqs:
    overallfreqs[word] /= total_posts

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2').generate_from_frequencies(overallfreqs)

plot_cloud(wordcloud)

# most relatively common words up to 10 upvotes

In [None]:
# Number of posts with at most 10 upvotes (the "0" and "1-10" buckets).
total_posts = scores["0"] + scores["1-10"]

# Inverse of tokenizer.word_index (id -> word), built once so that each token
# is resolved with a single dict lookup instead of scanning the vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

freqto10 = dict.fromkeys(tokenizer.word_index, 0)

# Count word occurrences in the titles of posts scoring 10 or less.
for encoded_title, upvotes in zip(title, score):
    if upvotes < 11:
        for token_id in encoded_title:
            freqto10[index_to_word[token_id]] += 1

for word in freqto10:
    # Occurrences per post within this score range ...
    freqto10[word] /= total_posts
    # ... then relative to the word's per-post frequency over all posts.
    if freqto10[word] != 0 and overallfreqs[word] != 0:
        freqto10[word] /= overallfreqs[word]

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2').generate_from_frequencies(freqto10)

plot_cloud(wordcloud)

# most relatively common words 11 to 1000 upvotes

In [None]:
# Number of posts with 11 to 1000 upvotes (the "11-100" and "101-1000" buckets).
total_posts = scores["11-100"] + scores["101-1000"]

# Inverse of tokenizer.word_index (id -> word), built once so that each token
# is resolved with a single dict lookup instead of scanning the vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

freq11to1000 = dict.fromkeys(tokenizer.word_index, 0)

# Count word occurrences in the titles of posts scoring 11..1000.
# Both bounds must hold: the previous `> 10 or < 1001` test was true for
# every score, so all posts were counted while only this range's post count
# was used as the divisor.
for encoded_title, upvotes in zip(title, score):
    if 10 < upvotes < 1001:
        for token_id in encoded_title:
            freq11to1000[index_to_word[token_id]] += 1

for word in freq11to1000:
    # Occurrences per post within this score range ...
    freq11to1000[word] /= total_posts
    # ... then relative to the word's per-post frequency over all posts.
    if freq11to1000[word] != 0 and overallfreqs[word] != 0:
        freq11to1000[word] /= overallfreqs[word]

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2').generate_from_frequencies(freq11to1000)

plot_cloud(wordcloud)

# most relatively common words 1001 to 100000 upvotes

In [None]:
# Number of posts with 1001 to 100000 upvotes (the "1001-10000" and
# "10001-100000" buckets).
total_posts = scores["1001-10000"] + scores["10001-100000"]

# Inverse of tokenizer.word_index (id -> word), built once so that each token
# is resolved with a single dict lookup instead of scanning the vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

freq1001to100000 = dict.fromkeys(tokenizer.word_index, 0)

# Count word occurrences in the titles of posts scoring 1001..100000.
# Both bounds must hold: the previous `> 1000 or < 100001` test was true for
# every score, so all posts were counted while only this range's post count
# was used as the divisor.
for encoded_title, upvotes in zip(title, score):
    if 1000 < upvotes < 100001:
        for token_id in encoded_title:
            freq1001to100000[index_to_word[token_id]] += 1

for word in freq1001to100000:
    # Occurrences per post within this score range ...
    freq1001to100000[word] /= total_posts
    # ... then relative to the word's per-post frequency over all posts.
    if freq1001to100000[word] != 0 and overallfreqs[word] != 0:
        freq1001to100000[word] /= overallfreqs[word]

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2').generate_from_frequencies(freq1001to100000)

plot_cloud(wordcloud)

# most relatively common words 100001 and up upvotes

In [None]:
# Number of posts with 100001 or more upvotes (the open-ended bucket).
total_posts = scores["100001->"]

# Inverse of tokenizer.word_index (id -> word), built once so that each token
# is resolved with a single dict lookup instead of scanning the vocabulary.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

freqover100001 = dict.fromkeys(tokenizer.word_index, 0)

# Count word occurrences in the titles of posts scoring 100001 or more.
# The "100001->" bucket includes a score of exactly 100001, so the filter is
# `> 100000`; the previous `> 100001` left such posts in the divisor but out
# of the word counts.
for encoded_title, upvotes in zip(title, score):
    if upvotes > 100000:
        for token_id in encoded_title:
            freqover100001[index_to_word[token_id]] += 1

for word in freqover100001:
    # Occurrences per post within this score range ...
    freqover100001[word] /= total_posts
    # ... then relative to the word's per-post frequency over all posts.
    if freqover100001[word] != 0 and overallfreqs[word] != 0:
        freqover100001[word] /= overallfreqs[word]

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2').generate_from_frequencies(freqover100001)

plot_cloud(wordcloud)

I think this shows that some words are very common in post titles at various levels of popularity. In addition, the overall word distribution is very different from that of the wider English language.

I will try to take on any suggestions... I hope you enjoyed it. Please consider upvoting this notebook while you enjoy those chicken tendies.