# Imports

In [1]:
import os
import re

import emoji
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

pd.set_option('display.max_colwidth', 100) 

# Read Data

In [2]:
ch_sample_link = '/data/ch_sample_selections_data.csv'
cg_sample_link = '/data/sample_wsb_gme_personalfin_stock.csv'
rr_sample_link = '/data/forex_finance_finindependence_forex.csv'

file_links = [ch_sample_link, cg_sample_link, rr_sample_link]

In [3]:
ch_sample = pd.read_csv(os.getcwd() + ch_sample_link)
cg_sample = pd.read_csv(os.getcwd() + cg_sample_link)
rr_sample = pd.read_csv(os.getcwd() + rr_sample_link)

In [4]:
print(ch_sample.shape, cg_sample.shape, rr_sample.shape)

(15000, 25) (16000, 25) (6104, 25)


In [5]:
# datatypes match
pd.concat([ch_sample.dtypes,cg_sample.dtypes,rr_sample.dtypes],axis=1)

Unnamed: 0,0,1,2
Unnamed: 0,int64,int64,int64
id,object,object,object
author,object,object,object
created,object,object,object
retrieved,object,object,object
edited,object,object,object
pinned,int64,int64,int64
archived,int64,int64,int64
locked,int64,int64,int64
removed,int64,int64,int64


In [6]:
def read_files(file_links):
    df = pd.DataFrame()
    for link in file_links:
        df = pd.concat((df, pd.read_csv(os.getcwd() + link)), axis=0)
    return df

In [7]:
df_full_raw = read_files(file_links)

In [8]:
np.random.seed(42)
indices = np.arange(len(df_full_raw))
shuffled_indices = np.random.permutation(indices)
df_full = df_full_raw.iloc[shuffled_indices]

# Examine Data for Cleaning

In [9]:
df.shape

NameError: name 'df' is not defined

In [None]:
df.isna().sum()

### Dropping Null selftext values

In [None]:
print(df.shape)
df = df.dropna(subset='selftext')
print(df.shape)

### Filling NA's for link_flair_text

In [None]:
df[df['link_flair_text'].isna()].fillna({'link_flair_text':''}).iloc[:2]

In [None]:
df.describe()

Remove the "Unnamed: 0" field

In [None]:
df.iloc[:2]

In [None]:
plt.figure(figsize=(10, 10))
words = " ".join(df['selftext'])
wc1 = WordCloud(max_words=500, width=800, height=400).generate(words)
plt.imshow(wc1, interpolation='bilinear')
plt.title('Raw Word Cloud', fontsize=14)
plt.axis('off')  # Hide axes
plt.show()

### Reviewing unexpected terms, such as https, webp, pgn.

Where are these words coming from?

In [None]:
search_terms = [r'https', r'webp','png']

The search terms are all part of url's. They will be removed.

In [None]:
df[df['selftext'].apply(lambda x: any(pattern in x for pattern in search_terms))]['selftext'].iloc[0]

In [None]:
df[df['selftext'].apply(lambda x: any(pattern in x for pattern in search_terms))]['selftext'].apply(lambda x: re.sub(r'https?://\S+\b/?', '<url>', x)).iloc[0]

Remove URL's, and replace with "< url >"

In [None]:
plt.figure(figsize=(10, 10))
words = " ".join( df['selftext'].apply(lambda x: re.sub(r'https?://\S+\b/?', '<url>', x)) )
wc2 = WordCloud(max_words=500, width=800, height=400).generate(words)
plt.imshow(wc2, interpolation='bilinear')
plt.title("Word Cloud - URL's removed", fontsize=14)
plt.axis('off')  # Hide axes
plt.show()

In [None]:
# Identifying x200b, which is zero width space
search_terms = ['x200']
zero_width_indexes = df['selftext'].apply(lambda x: any(pattern in x for pattern in search_terms))
df[zero_width_indexes]['selftext'].iloc[0][:1000]

In [None]:
# Remove zero width spaces (x200B)
df[zero_width_indexes]['selftext'].apply(lambda x: re.sub(r'x200B', '', x)).iloc[0][:1000]

In [None]:
# Replace the HTTPS with <url> and remove zero width spaces (x200B) 
plt.figure(figsize=(10, 10))
words = " ".join( df['selftext'].apply(lambda x: re.sub(r'https?://\S+\b/?', '<url>', x)).apply(lambda x: re.sub(r'x200B', '', x)) )
wc2 = WordCloud(max_words=500, width=800, height=400).generate(words)
plt.imshow(wc2, interpolation='bilinear')
plt.title("Word Cloud - URL's and Zero Width Spaces Removed.", fontsize=14)
plt.axis('off')  # Hide axes
plt.show()

What are common selftext values, and how should they be handled?

In [None]:
df['selftext'].value_counts().iloc[:5]

This contains recurring discussion posts and self_text values of '[deleted]' which should be removed.

In [None]:
df.shape, df[df['selftext']!='[deleted]'].shape

### Removing Discussion Posts

In [None]:
text_frequency = df['selftext'].value_counts().reset_index()
text_frequency = text_frequency[text_frequency['count']>3]
discussion_posts = text_frequency['selftext'].tolist()
df[df['selftext'].isin(discussion_posts)].iloc[:2]

# Cleaning Data

### Drop NA's

In [None]:
df = df.dropna(subset='selftext')

### Fill NA's

In [None]:
df = df.fillna({'link_flair_text':''})

### Remove [deleted] selftext entries

In [None]:
df = df[df['selftext']!='[deleted]']

### Remove discussion posts

In [None]:
def remove_discussion_posts(df_orig):
    '''Discussion posts are defined as posts that occur more than 3 times.'''
    df = df_orig.copy()

    text_frequency = df['selftext'].value_counts().reset_index()
    text_frequency = text_frequency[text_frequency['count']>3]
    discussion_posts = text_frequency['selftext'].tolist()
    
    df = df[~df['selftext'].isin(discussion_posts)]


    return df

# df = remove_discussion_posts(df)

### Preprocess data and clean data
We must remove excess HTML and other words.

In [None]:
unnamed_columns = [i for i in df.columns if 'Unnamed' in i]
df = df.drop(columns=unnamed_columns)

We must remove excess HTML and other words.

In [None]:
def preprocessor(text):
    try:
        text = re.sub('<[^>]*>', '', text)
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                               text)
        text = (re.sub('[\W]+', ' ', text.lower()) +
                ' '.join(emoticons).replace('-', ''))
    except Exception as e:
        print(f'Exception when preprocessing.\n{e}')
    finally:
        return text

# from https://stackoverflow.com/questions/70304914/sentiment-analysis-python-tokenization
def cleaner(text):
    text = str(text).lower()
    text = re.sub(r'#(\w+)', r'<\1>', text)
    text = re.sub(r'\n', ' ', text) # Remove /n
    text = re.sub(r'@[A-Za-z0-9]+', '<user>', text) # Remove and replace @mention
    text = re.sub(r'RT\s+', '', text) # Remove RT
    text = re.sub(r'https?://\S+\b/?', '<url>', text) # Remove and replace links
    text = re.sub('x200B', '', text)
    return text

# There's a chance we WON'T need to convert emojis to text if the model we're using supports emojis, but this might be useful
def deemoji_text(text):
    return emoji_demojize(text)

def deemoji_text_unicode(text):
    return text.encode('unicode-escape').decode('utf-8')

def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

def drop_unnamed_columns(df_orig):
    df = df_orig.copy()
    unnamed_columns = [i for i in df.columns if 'Unnamed' in i]
    df = df_full.drop(columns=unnamed_columns)
    return df

In [None]:
df['selftext'] = df['selftext'].apply(cleaner).apply(preprocessor)

In [None]:
# Replace the HTTPS with <url> and remove zero width spaces (x200B) 
plt.figure(figsize=(10, 10))
words = " ".join( df['selftext'] )
wc2 = WordCloud(max_words=500, width=800, height=400).generate(words)
plt.imshow(wc2, interpolation='bilinear')
plt.title("Word Cloud - Cleaned Dataset", fontsize=14)
plt.axis('off')  # Hide axes
plt.show()

# Standardizing Numeric Fields

### Examine Numeric Fields

In [None]:
numeric_columns = df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index.values
df[numeric_columns]

### Scale Numeric Fields

In [None]:
scaling_fields=['upvote_ratio','score','gilded','total_awards_received','num_comments','num_crossposts']

non_scaling_fields = [i for i in df.columns if i not in scaling_fields]

In [None]:
scaler = StandardScaler()

df_scaled_values = scaler.fit_transform(df[scaling_fields].values)

df_scaled = pd.DataFrame(df_scaled_values, columns=scaling_fields)

In [None]:
df_std = pd.concat((df[non_scaling_fields].reset_index(drop=True), df_scaled), axis=1)

In [None]:
df_std.iloc[1:3]

# Full Dataset Cleaning and Training/Validation/Test Assignments

In [None]:
df_full = df_full.dropna(subset='selftext')

In [None]:
df_full = df_full.fillna({'link_flair_text':''})

In [None]:
df_full = df_full[df_full['selftext']!='[deleted]']

In [None]:
def remove_discussion_posts(df_orig):
    '''Discussion posts are defined as posts that occur more than 3 times.'''
    df = df_orig.copy()

    text_frequency = df['selftext'].value_counts().reset_index()
    text_frequency = text_frequency[text_frequency['count']>3]
    discussion_posts = text_frequency['selftext'].tolist()
    
    df = df[~df['selftext'].isin(discussion_posts)]


    return df

In [None]:
# df = remove_discussion_posts(df)

### Clean and Preprocess Data

In [None]:
df_full['selftext'] = df_full['selftext'].apply(cleaner).apply(preprocessor)

### Drop Columns containing "Unnamed"

In [None]:
df_full = drop_unnamed_columns(df_full)

### Scale Numeric Fields

In [None]:
def scale_numeric_fields(scaler, df, scaling_fields):
    non_scaling_fields = [i for i in df.columns if i not in scaling_fields]

    df_scaled_values = scaler.fit_transform(df[scaling_fields].values)
    df_scaled = pd.DataFrame(df_scaled_values, columns=scaling_fields)
    
    df_std = pd.concat((df[non_scaling_fields].reset_index(drop=True), df_scaled), axis=1)
    
    return df_std

In [None]:
scaling_fields=['upvote_ratio','score','gilded','total_awards_received','num_comments','num_crossposts']

scaler = StandardScaler()

df_full = scale_numeric_fields(scaler, df_full, scaling_fields)

### Split Data into Train, Validation, Test sets

In [None]:
def split_data(df_full, split=(0.7, 0.15, 0.15)):
    split = (0.7, 0.15, 0.15) # --> to be discussed with team
    X_full = df_full[['id', 'author', 'created', 'retrieved', 'edited',
           'pinned', 'archived', 'locked', 'removed', 'deleted', 'is_self',
           'is_video', 'is_original_content', 'title', 'link_flair_text',
           'upvote_ratio',  'gilded', 'total_awards_received',
           'num_comments', 'num_crossposts', 'selftext', 'thumbnail', 'shortlink']]

    Y_full = df_full['score']

    X_train, X_t, y_train, Y_t = train_test_split(X_full,  Y_full, train_size=split[0])
    X_val, X_test, y_val, y_test = train_test_split(X_t,  Y_t, train_size=split[1]/(split[1]+split[2]))

    return X_train, y_train, X_val, y_val, X_test, y_test

X_train, y_train, X_val, y_val, X_test, y_test = split_data(df_full)

In [None]:
# X_train.to_csv('./data/X_train.csv', index=False)
# y_train.to_csv('./data/y_train.csv', index=False)

# X_val.to_csv('./data/X_val.csv', index=False)
# y_val.to_csv('./data/y_val.csv', index=False)

# X_test.to_csv('./data/X_test.csv', index=False)
# y_test.to_csv('./data/y_test.csv', index=False)