# Imports

In [1]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

pd.set_option('display.max_colwidth', 100)

# Read Data

In [2]:
def drop_removed_deleted(df_orig, field):
    removed_values=['[removed]','[deleted]']

    df = df_orig.copy()
    df = df[~df[field].isin(removed_values)]

    return df

In [3]:
def read_and_sample_files():
    BASE_PATH = './data/'
    os.chdir(BASE_PATH)

    folders = [f'{folder}/'  for folder in os.listdir() if folder not in ('final', '.ipynb_checkpoints',)]

    final_df = pd.DataFrame()

    for folder in folders:
        # folder_path = os.path.join(BASE_PATH, folder)
        print(f"Reading {folder}...")
        files = [i for i in os.listdir(folder) if i.endswith('csv')]

        # print((os.path.join(folder, files[0])))

        df = pd.read_csv(os.path.join(folder, files[0]))

        df = drop_removed_deleted(df, 'selftext')
        df = df.dropna(subset='selftext')
        df = df.sample(frac=1).reset_index(drop=True)
        
        df['file'] = folder
        
        if len(df) < 4000:
            print('Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.')
        else:
            df = df.sample(4000)

        final_df = pd.concat([final_df, df])
    
    final_df = final_df.sample(frac=1)
    
    final_df.to_csv('sampled_data.csv', index=False)

    return final_df



In [4]:
df = read_and_sample_files()

Reading wallstreetbets/...
Reading investing/...


  df = pd.read_csv(os.path.join(folder, files[0]))


Reading stockmarket/...
Reading stocks/...
Reading robinhood/...
Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.
Reading personalfinance/...
Reading finance/...
Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.
Reading securityanalysis/...
Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.
Reading gme/...
Reading robinhoodpennystocks/...
Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.
Reading forex/...
Reading financialindependence/...
Dataframe contains fewer than 4000 records ({len(df)}). Retreiving all records.
Reading options/...
Reading pennystocks/...


In [5]:
df.describe()

Unnamed: 0,pinned,archived,locked,removed,deleted,is_self,is_video,is_original_content,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts
count,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0
mean,0.0,0.0,0.001465,0.006071,0.0,1.0,0.0,0.018356,0.779192,85.232081,0.020506,0.774699,67.667966,0.062532
std,0.0,0.0,0.038244,0.077683,0.0,0.0,0.0,0.134236,0.198404,1009.267974,0.259671,11.867053,1080.324833,0.498515
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.67,1.0,0.0,0.0,3.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.82,5.0,0.0,0.0,8.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.95,18.0,0.0,0.0,21.0,0.0
max,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,98013.0,24.0,1481.0,75852.0,30.0


In [6]:
df.head(5)

Unnamed: 0,id,author,created,retrieved,edited,pinned,archived,locked,removed,deleted,...,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts,selftext,thumbnail,shortlink,file
4814,qjl9qp,aivapes,2021-10-31 08:29:23,2021-10-31 15:51:10,1970-01-01 00:00:00,0,0,0,0,0,...,0.23,0,0,0,11,0,"Can't tell you how I know, but I actually am 100% certain. Only posting this here because not ma...",self,https://redd.it/qjl9qp,options/
5715,liowew,tomalucian123,2021-02-13 00:27:22,2021-02-13 12:30:48,1970-01-01 00:00:00,0,0,0,0,0,...,0.9,8,0,0,2,0,"I see it everywhere , it seems like a good deal by what I have seen, they secured a lot of money...",self,https://redd.it/liowew,pennystocks/
14792,oc0rho,_Apache_Helicopter_,2021-07-02 02:20:36,2021-07-02 08:52:48,1970-01-01 00:00:00,0,0,0,0,0,...,0.83,365,1,17,160,2,"# China steel export tax\n\n* Within the next few days, China is expected to put in a place a st...",self,https://redd.it/oc0rho,wallstreetbets/
11656,qqrazf,Sowreen,2021-11-10 10:20:17,2021-11-10 17:49:21,1970-01-01 00:00:00,0,0,0,0,0,...,0.57,2,0,0,21,0,I have received the sum of 10k € from my parrents in order apply for a mortgage. The usual pric...,self,https://redd.it/qqrazf,stocks/
55851,m66dr2,eric66111,2021-03-16 10:55:10,2021-03-16 16:58:18,1970-01-01 00:00:00,0,0,0,0,0,...,0.85,17,0,0,6,0,"No, we are not ""here"". No, the graph isn't going to be identical as VW squeeze. No, the graph is...",self,https://redd.it/m66dr2,gme/


In [7]:
df['file'].value_counts()

file
options/                  4000
pennystocks/              4000
wallstreetbets/           4000
stocks/                   4000
gme/                      4000
investing/                4000
stockmarket/              4000
forex/                    4000
personalfinance/          4000
robinhoodpennystocks/     3911
financialindependence/    1508
robinhood/                 678
securityanalysis/          189
finance/                    44
Name: count, dtype: int64