# Imports

In [1]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Let pandas show up to 100 characters per cell before truncating with "...",
# so long free-text columns stay readable in displayed frames.
pd.set_option('display.max_colwidth', 100)

# Read Data

In [2]:
def drop_removed_deleted(df_orig, field):
    """Return a copy of ``df_orig`` without rows whose ``field`` is a placeholder.

    A row is discarded when its value in column ``field`` is exactly
    ``'[removed]'`` or ``'[deleted]'``. Every other row (including rows where
    the value is missing) is kept, and the original index labels are preserved.
    The input frame itself is never modified.
    """
    frame = df_orig.copy()
    is_placeholder = frame[field].isin(['[removed]', '[deleted]'])
    return frame.loc[~is_placeholder]

In [3]:
def read_and_sample_files(sample_size=4000, random_state=None):
    """Read one CSV per folder under ``./data/`` and return a shuffled, capped sample.

    For every sub-directory of ``./data/`` (except ``final`` and
    ``.ipynb_checkpoints``) the alphabetically first file whose name ends in
    ``csv`` is loaded. Rows whose ``selftext`` is ``[removed]``/``[deleted]``
    or missing are dropped, the folder name is stored in a new ``file``
    column, and at most ``sample_size`` rows are kept per folder. The
    per-folder samples are concatenated and shuffled once more.

    Parameters
    ----------
    sample_size : int, default 4000
        Maximum number of rows to keep from each folder. Folders with fewer
        usable rows contribute all of them.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for every shuffle/sample step. ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The combined, shuffled sample (empty if no folder yielded data).

    Notes
    -----
    Side effect: the process working directory is changed to ``./data/`` and
    is NOT restored, so relative paths used after this call resolve inside
    that directory. Calling the function twice in the same process therefore
    fails unless the working directory is reset in between.
    """
    BASE_PATH = './data/'
    os.chdir(BASE_PATH)

    excluded = ('final', '.ipynb_checkpoints')
    # Only real directories are data folders; stray files living next to them
    # (e.g. a previously exported CSV) must not be passed to os.listdir().
    folders = [
        f'{entry}/'
        for entry in os.listdir()
        if entry not in excluded and os.path.isdir(entry)
    ]

    # One shared generator so the whole pipeline is reproducible from a single
    # seed without every step replaying the same random stream.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    frames = []

    for folder in folders:
        print(f"Reading {folder}...")
        # Sorted so the chosen file does not depend on OS listing order.
        files = sorted(name for name in os.listdir(folder) if name.endswith('csv'))

        if not files:
            print(f"No CSV file found in {folder}; skipping.")
            continue

        df = pd.read_csv(os.path.join(folder, files[0]))

        df = drop_removed_deleted(df, 'selftext')
        df = df.dropna(subset=['selftext'])
        df = df.sample(frac=1, random_state=rng).reset_index(drop=True)

        df['file'] = folder

        if len(df) < sample_size:
            print(f'Dataframe contains fewer than {sample_size} records ({len(df)} records). Retreiving all records.')
        else:
            df = df.sample(sample_size, random_state=rng)

        frames.append(df)

    # Concatenate once instead of growing a frame inside the loop.
    final_df = pd.concat(frames) if frames else pd.DataFrame()

    return final_df.sample(frac=1, random_state=rng)



In [4]:
# Build the sampled dataset. Note: read_and_sample_files() also changes the
# working directory to ./data/, so relative paths used later resolve there.
df = read_and_sample_files()

Reading wallstreetbets/...
Reading investing/...


  df = pd.read_csv(os.path.join(folder, files[0]))


Reading stockmarket/...
Reading stocks/...
Reading robinhood/...
Dataframe contains fewer than 4000 records (678 records). Retreiving all records.
Reading personalfinance/...
Reading finance/...
Dataframe contains fewer than 4000 records (44 records). Retreiving all records.
Reading securityanalysis/...
Dataframe contains fewer than 4000 records (189 records). Retreiving all records.
Reading gme/...
Reading robinhoodpennystocks/...
Dataframe contains fewer than 4000 records (3911 records). Retreiving all records.
Reading forex/...
Reading financialindependence/...
Dataframe contains fewer than 4000 records (1508 records). Retreiving all records.
Reading options/...
Reading pennystocks/...


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,pinned,archived,locked,removed,deleted,is_self,is_video,is_original_content,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts
count,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0,42330.0
mean,0.0,0.0,0.001205,0.005882,0.0,1.0,0.0,0.018379,0.778739,86.775313,0.020884,0.764092,59.198228,0.063855
std,0.0,0.0,0.03469,0.076471,0.0,0.0,0.0,0.134321,0.198316,1343.127405,0.327914,17.679246,964.492078,0.553722
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.67,1.0,0.0,0.0,3.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.82,5.0,0.0,0.0,8.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.95,18.0,0.0,0.0,21.0,0.0
max,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,154745.0,32.0,2548.0,82334.0,57.0


In [6]:
# Peek at two rows to check the available columns and the text format.
df.head(2)

Unnamed: 0,id,author,created,retrieved,edited,pinned,archived,locked,removed,deleted,...,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts,selftext,thumbnail,shortlink,file
17274,l6b7qn,CaptHymanShocked,2021-01-27 18:31:06,2021-02-04 22:09:39,1970-01-01 00:00:00,0,0,0,0,0,...,0.92,11,0,0,4,0,he took his glasses off for this one -- love this guy:\n\n[https://youtu.be/uzojHqzm3TU?t=32](ht...,self,https://redd.it/l6b7qn,wallstreetbets/
1512,lfax7f,Mik-Hail-tal,2021-02-08 12:37:24,2021-02-09 00:47:19,1970-01-01 00:00:00,0,0,0,0,0,...,0.89,7,0,0,4,0,"I've been holding 9000 shares since 2018 @$0.0143\n\nRecently, they finally started rising to $0...",self,https://redd.it/lfax7f,pennystocks/


In [7]:
# Rows contributed by each source folder (the `file` column is set in
# read_and_sample_files).
df['file'].value_counts()

file
wallstreetbets/           4000
pennystocks/              4000
forex/                    4000
options/                  4000
stocks/                   4000
personalfinance/          4000
investing/                4000
gme/                      4000
stockmarket/              4000
robinhoodpennystocks/     3911
financialindependence/    1508
robinhood/                 678
securityanalysis/          189
finance/                    44
Name: count, dtype: int64

In [8]:
# Persist the sample without the row index. The relative path is resolved
# against the current working directory, which read_and_sample_files()
# switched to ./data/.
df.to_csv('sampled_data.csv', index=False)