# Financial data
https://www.kaggle.com/datasets/leukipp/reddit-finance-data/data


https://www.tensorflow.org/tutorials/structured_data/time_series#data_windowing
https://blog.tensorflow.org/2023/09/forecasting-with-tensorflow-decision-forests-and-temporian.html

https://www.tensorflow.org/decision_forests


In [1]:
import os
import emoji
from collections import Counter
import pandas as pd
import seaborn as sns
import math
import numpy as np
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split


2024-07-15 21:50:07.314373: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
pd.set_option('display.max_colwidth', 100) 

In [3]:
forex_link = '/data/forex/submissions_reddit.csv'
financial_independence_link = '/data/financialindependence/submissions_reddit.csv'
finance_link = '/data/finance/submissions_reddit.csv'

In [4]:
df_forex = pd.read_csv(os.getcwd() + forex_link)
df_financial_independence = pd.read_csv(os.getcwd() + financial_independence_link)
df_finance = pd.read_csv(os.getcwd() + finance_link)

In [5]:
print(df_forex.shape)
print(df_financial_independence.shape)
print(df_finance.shape)

(14643, 24)
(10338, 24)
(7130, 24)


In [6]:
def drop_removed(df_orig, field):
    removed_values=['[removed]','[deleted]']
    
    df = df_orig.copy()
    df = df[~df[field].isin(removed_values)]

    return df

## df_forex

In [7]:
df_forex.shape

(14643, 24)

In [8]:
df_forex['selftext'].value_counts().reset_index().iloc[:10]

Unnamed: 0,selftext,count
0,[removed],3123
1,[deleted],1077
2,.,3
3,[https://www.reddit.com/r/Forex/comments/mc48os/rejection\_trade\_euraud/](https://www.reddit.co...,3
4,[https://www.reddit.com/r/Forex/comments/mc6h58/long\_term\_breakout\_trade/](https://www.reddit...,2
5,"**Hello, I thought it would be a good idea to ask here. I recently got into crypto swing trading...",2
6,Just wondering,2
7,Title,2
8,Hi Guys im about to start backtesting my strategy for the first time. Im abit worried that emoti...,2
9,Hello out there\nI am one of forex miserable traders in this world but my story is something dif...,2


In [9]:
df_forex = drop_removed(df_forex, 'selftext')

In [10]:
df_forex['selftext'].value_counts().reset_index().iloc[:10]

Unnamed: 0,selftext,count
0,.,3
1,[https://www.reddit.com/r/Forex/comments/mc48os/rejection\_trade\_euraud/](https://www.reddit.co...,3
2,"**Hello, I thought it would be a good idea to ask here. I recently got into crypto swing trading...",2
3,Just wondering,2
4,Title,2
5,Hi Guys im about to start backtesting my strategy for the first time. Im abit worried that emoti...,2
6,Hello out there\nI am one of forex miserable traders in this world but my story is something dif...,2
7,Where do you recommend I watch the ECB press conference or any big central bank press conference...,2
8,"Hello everyone, I’m fairly new at trading been doing it for about a month and a half now. I’m in...",2
9,[https://www.reddit.com/r/Forex/comments/mc6h58/long\_term\_breakout\_trade/](https://www.reddit...,2


In [11]:
# nas in link_flair_text and selftext
df_forex.isna().sum()

id                          0
author                      0
created                     0
retrieved                   0
edited                      0
pinned                      0
archived                    0
locked                      0
removed                     0
deleted                     0
is_self                     0
is_video                    0
is_original_content         0
title                       0
link_flair_text            62
upvote_ratio                0
score                       0
gilded                      0
total_awards_received       0
num_comments                0
num_crossposts              0
selftext                 4870
thumbnail                   0
shortlink                   0
dtype: int64

In [12]:
df_forex_cleaned = df_forex.dropna()
df_forex_cleaned.shape

(5548, 24)

In [13]:
df_forex_cleaned.isna().sum()

id                       0
author                   0
created                  0
retrieved                0
edited                   0
pinned                   0
archived                 0
locked                   0
removed                  0
deleted                  0
is_self                  0
is_video                 0
is_original_content      0
title                    0
link_flair_text          0
upvote_ratio             0
score                    0
gilded                   0
total_awards_received    0
num_comments             0
num_crossposts           0
selftext                 0
thumbnail                0
shortlink                0
dtype: int64

In [14]:
df_forex_cleaned.sample(frac=1)[:4000].shape

(4000, 24)

In [15]:
df_forex_cleaned = df_forex_cleaned.sample(frac=1)[:4000]

In [16]:
df_forex_cleaned = df_forex_cleaned.reset_index(drop=True)

In [17]:
df_forex_cleaned.shape

(4000, 24)

In [18]:
df_forex_cleaned.iloc[:2]

Unnamed: 0,id,author,created,retrieved,edited,pinned,archived,locked,removed,deleted,...,link_flair_text,upvote_ratio,score,gilded,total_awards_received,num_comments,num_crossposts,selftext,thumbnail,shortlink
0,lkudh3,matiu2,2021-02-16 03:11:45,2021-02-16 16:18:24,2021-02-16 03:28:39,0,0,0,0,0,...,Questions,1.0,3,0,0,14,0,So I have a negative risk/reward ratio. Kind of. I'm making more winning trades than loosing tra...,self,https://redd.it/lkudh3
1,r7tphi,OverallStick4214,2021-12-03 07:40:01,2021-12-03 14:18:21,1970-01-01 00:00:00,0,0,0,0,0,...,OTHER/META,0.25,0,0,0,2,0,Learn some fucking sense trading thats all.,self,https://redd.it/r7tphi


## df_financial_independence

In [19]:
df_financial_independence.shape

(10338, 24)

In [20]:
df_financial_independence['selftext'].value_counts().reset_index()

Unnamed: 0,selftext,count
0,[removed],8253
1,[deleted],577
2,Please use this thread to have discussions which you don't feel warrant a new post to the sub....,353
3,\n\nNeed help applying broader FIRE principles to your own situation? We’re here for you!\n\nPo...,52
4,Self-promotion (ie posting about projects/businesses that you operate and can profit from) is...,51
...,...,...
954,TL;DR - philosophy majors and retired baristas sold soul to the man to get freedom to waste more...,1
955,"As the title states, I have acheived FI and so I RE. I have some legacy stocks in a taxable acco...",1
956,"Hello FIRE fam, I wanted to share a recent experience of mine that has totally changed how my SO...",1
957,"Hi, this came up in another thread and I built a quick model for a situation where you might hav...",1


In [21]:
df_financial_independence = drop_removed(df_financial_independence, 'selftext')

In [22]:
df_financial_independence['selftext'].value_counts().reset_index()

Unnamed: 0,selftext,count
0,Please use this thread to have discussions which you don't feel warrant a new post to the sub....,353
1,\n\nNeed help applying broader FIRE principles to your own situation? We’re here for you!\n\nPo...,52
2,Self-promotion (ie posting about projects/businesses that you operate and can profit from) is...,51
3,"Please use this thread to post your milestones, humblebrags and status updates which you don'...",50
4,\n\nPlease use this thread to discuss how amazingly cheap you are. How do you keep your costs...,50
...,...,...
952,"As the title states, I have acheived FI and so I RE. I have some legacy stocks in a taxable acco...",1
953,"Hello FIRE fam, I wanted to share a recent experience of mine that has totally changed how my SO...",1
954,"Hi, this came up in another thread and I built a quick model for a situation where you might hav...",1
955,**Hey everyone! This is my first post after being a lurker for a while now.** \n\nTo help hold m...,1


In [23]:
#na's in link_flair_text, one in title
df_financial_independence.isna().sum()

id                          0
author                      0
created                     0
retrieved                   0
edited                      0
pinned                      0
archived                    0
locked                      0
removed                     0
deleted                     0
is_self                     0
is_video                    0
is_original_content         0
title                       0
link_flair_text          1505
upvote_ratio                0
score                       0
gilded                      0
total_awards_received       0
num_comments                0
num_crossposts              0
selftext                    0
thumbnail                   0
shortlink                   0
dtype: int64

In [24]:
df_financial_independence.shape

(1508, 24)

In [25]:
df_financial_independence_cleaned = df_financial_independence.dropna(subset='title')

In [26]:
df_financial_independence_cleaned.shape

(1508, 24)

In [27]:
df_financial_independence_cleaned.isna().sum()

id                          0
author                      0
created                     0
retrieved                   0
edited                      0
pinned                      0
archived                    0
locked                      0
removed                     0
deleted                     0
is_self                     0
is_video                    0
is_original_content         0
title                       0
link_flair_text          1505
upvote_ratio                0
score                       0
gilded                      0
total_awards_received       0
num_comments                0
num_crossposts              0
selftext                    0
thumbnail                   0
shortlink                   0
dtype: int64

## df_finance

In [28]:
df_finance.shape

(7130, 24)

In [29]:
df_finance['selftext'].value_counts().reset_index()

Unnamed: 0,selftext,count
0,[deleted],552
1,"This is your safe place for questions on financial careers, homework problems and finance in gen...",44


In [30]:
df_finance.isna().sum()

id                          0
author                      0
created                     0
retrieved                   0
edited                      0
pinned                      0
archived                    0
locked                      0
removed                     0
deleted                     0
is_self                     0
is_video                    0
is_original_content         0
title                       0
link_flair_text          7130
upvote_ratio                0
score                       0
gilded                      0
total_awards_received       0
num_comments                0
num_crossposts              0
selftext                 6534
thumbnail                   0
shortlink                   0
dtype: int64

In [31]:
df_finance_cleaned = df_finance.dropna(subset='selftext')

In [32]:
df_finance_cleaned['selftext'].value_counts()

selftext
[deleted]                                                                                                                                                                                                                                                                                                                                                                                                                                                  552
This is your safe place for questions on financial careers, homework problems and finance in general. No question in the finance domain is unwelcome.\n\nReplies are expected to be constructive and civil.\n\nAny questions about your *personal* finances belong in [r/PersonalFinance](https://www.reddit.com/r/PersonalFinance/), and career-seekers are encouraged to also visit [r/FinancialCareers](https://www.reddit.com/r/FinancialCareers/).     44
Name: count, dtype: int64

In [33]:
df_finance_cleaned.shape

(596, 24)

## After dropping NA's

In [34]:
print(df_forex.shape)
print(df_financial_independence.shape)
print(df_finance.shape)

(10443, 24)
(1508, 24)
(7130, 24)


In [35]:
print(df_forex_cleaned.shape)
print(df_financial_independence_cleaned.shape)
print(df_finance_cleaned.shape)

(4000, 24)
(1508, 24)
(596, 24)


In [36]:
dataframes = [df_forex_cleaned, df_financial_independence_cleaned, df_finance_cleaned]
final_df = pd.concat(dataframes)

final_df.to_csv('./data/forex_finance_finindependence_forex.csv')

In [37]:
os.listdir('./data')

['wallstreetbets',
 'investing',
 'stockmarket',
 'stocks',
 'robinhood',
 'personalfinance',
 'finance',
 'securityanalysis',
 '.ipynb_checkpoints',
 'test.txt',
 'gme',
 'robinhoodpennystocks',
 'forex_finance_finindependence_forex.csv',
 'forex',
 'financialindependence',
 'options',
 'pennystocks']

In [38]:
final_df.shape

(6104, 24)

## < end of file >