## r/IntellectualDarkWeb Preprocessing

In [None]:
import os
import re

import sqlite3
import pandas as pd
from tqdm import tqdm

import html
import markdown
from bs4 import BeautifulSoup

from joblib import Parallel, delayed
import multiprocessing as mp

from transformers import AutoTokenizer

tqdm.pandas()

### SQL Data Loader
- Pulls `comments` and `posts` from SQLite database.

In [None]:
def load_data(path: str, tables: list) -> tuple:
    '''
    Read whole tables from a SQLite database into DataFrames.

    Parameters
    ----------
    path : str
        Path to the SQLite database file.
    tables : list
        Names of the tables to read. Names are interpolated directly into
        the query (SQL cannot bind identifiers as parameters), so they must
        come from trusted code, never from user input.

    Returns
    -------
    tuple
        One `pd.DataFrame` per entry in `tables`, in the same order.
    '''
    frames = []

    # Open a single connection for all tables and guarantee it is closed
    # even if one of the reads fails.
    conn = sqlite3.connect(path)
    try:
        for table in tables:
            print(f'Loading {table}')
            query = f'SELECT * FROM {table}'
            frames.append(pd.read_sql(query, conn))
    finally:
        conn.close()

    return tuple(frames)

**Load data:**
- Tables: `(comments, posts)`

In [None]:
# Database location, relative to the notebook's working directory.
db_path = os.path.join(
    '..',
    'data',
    'sqlite',
    'idw_reddit.db',
)

# The order here determines the order of the unpacked DataFrames below.
tables = ['comments', 'posts']
comments, posts = load_data(tables=tables, path=db_path)

In [None]:
# Report the row counts of both tables and their combined size.
print(
    f'N comments: {len(comments)}',
    f'N posts: {len(posts)}',
    f'Total: {len(comments) + len(posts)}',
    sep='\n',
)

In [None]:
# Preview the first few rows of the comments table.
comments.head()

In [None]:
# Preview the first few rows of the posts table.
posts.head()

In [None]:
# Count missing values per column in the comments table.
comments.isna().sum()

In [None]:
# Count missing values per column in the posts table.
posts.isna().sum()

## Text Processing Functions

### Universal text cleaner:
- Normalizes text.
- Removes noise from Reddit.
- Removes HTML/XML/markdown.

In [None]:
# Patterns are compiled once here instead of on every call.
_POLL_RE = re.compile(r'\[View Poll\]\(\S+\)')
_NON_PRINTABLE_RE = re.compile(r'[^\x20-\x7E]')
_SUBMISSION_STATEMENT_RE = re.compile(r'submission statement', re.IGNORECASE)
# `https...` links are already covered by the `http\S+` alternative.
_URL_RE = re.compile(r'http\S+|www\S+')


def clean_text(text: str) -> str:
    '''
    Normalize a raw Reddit title/body/comment into plain, single-spaced text.

    Steps, in order:
    1. Removes Reddit poll links (`[View Poll](...)`).
    2. Replaces every character outside printable ASCII with a space
       (this includes newlines and tabs).
    3. Removes the phrase 'submission statement' (case-insensitive).
    4. Renders markdown to HTML, unescapes HTML entities, and keeps only the
       text nodes of the parsed document.
    5. Removes URLs.
    6. Collapses runs of whitespace into single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. None or NaN for a missing body)
        are treated as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing remains.
    '''
    # Missing values have nothing to clean; returning '' instead of raising
    # lets a post with no body fall back to its title alone.
    if not isinstance(text, str):
        return ''

    # remove Reddit-specific noise
    text = _POLL_RE.sub(' ', text)
    text = _NON_PRINTABLE_RE.sub(' ', text)
    text = _SUBMISSION_STATEMENT_RE.sub(' ', text)

    # markdown -> HTML -> plain text
    text = markdown.markdown(text)
    text = html.unescape(text)
    soup = BeautifulSoup(text, 'html.parser')
    # `find_all` is the current name of the deprecated `findAll` alias.
    text = ' '.join(soup.find_all(string=True))
    # the rendered HTML separates block elements with newlines
    text = text.replace('\n', ' ')

    # remove URLs
    text = _URL_RE.sub(' ', text)

    # remove excess white space
    return ' '.join(text.split())

### Tokenizer functions:
- Use the `all-mpnet-base-v2` tokenizer to count the tokens.
- If > `382` tokens, tokens must be split into batches of length `n=382`.
  - `all-mpnet-base-v2` accepts sequences of length `384`.
  - But we have to account for special tokens `<s>` and `</s>` at the beginning and end of each sequence.

In [None]:
# initiate tokenizer:
# Load the tokenizer for the `all-mpnet-base-v2` sentence-transformers model.
# It is referenced as a module-level global by `tokenizer_processor`.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

In [None]:
def tokenizer_processor(text: str, max_tokens: int = 382) -> tuple:
    '''
    Tokenize `text` and split over-long token sequences into batches.

    Uses the module-level `tokenizer`.

    Parameters
    ----------
    text : str
        Text to tokenize.
    max_tokens : int, default 382
        Maximum number of tokens per batch.

    Returns
    -------
    tuple
        `(n_tokens, tokens)`. `n_tokens` is the total token count of `text`.
        `tokens` is a flat list of tokens when `n_tokens <= max_tokens`;
        otherwise it is a list of sublists, each holding at most
        `max_tokens` tokens.

    Raises
    ------
    ValueError
        If `max_tokens` is smaller than 1.
    '''
    if max_tokens < 1:
        raise ValueError(f'max_tokens must be >= 1, got {max_tokens}')

    tokens = tokenizer.tokenize(text)
    n_tokens = len(tokens)

    if n_tokens > max_tokens:
        tokens = [
            tokens[start:start + max_tokens]
            for start in range(0, n_tokens, max_tokens)
        ]

    return n_tokens, tokens

### Explode lists of texts to strings:
- This function will be used to explode the dataframe observations into multiple rows.
- A post or comment will only have repeating observations when it has been broken into sublists due to it having > `382` tokens.

In [None]:
def stringify_lists(text: list) -> list:
    '''
    Join tokens back into space-separated strings, one string per batch.

    Parameters
    ----------
    text : list
        Either a flat list of token strings, or a list of token sublists
        (one sublist per batch).

    Returns
    -------
    list
        One joined string per batch. A flat (or empty) list yields a
        single-element list.
    '''
    # NOTE(review): if the tokens are subword pieces (e.g. WordPiece '##'
    # continuations), joining with spaces does not reconstruct the original
    # text - confirm the downstream consumer expects this form.

    # Guard on emptiness first: an empty list has no first element to inspect.
    if text and isinstance(text[0], list):
        return [' '.join(sublist) for sublist in text]
    return [' '.join(text)]

## Preprocess Posts
- A couple things:
  - remove titles in the set `['[deleted by user]', '[ Removed by Reddit ]', 'test']`
    - These will not have any content.
  - Remove posts by username `######` (anonymized for public view)
    - This account spammed the same post over and over again
- If a post has a title but no selftext body, use the title as the representation.
  - This can be the case with selftext posts set to `[deleted]` or `[removed]` as well.

In [None]:
# Post titles that identify submissions to drop.
remove_titles = [
    '[deleted by user]',
    '[ Removed by Reddit ]',
    'test',
]

# Authors whose posts are dropped (anonymized for public view).
remove_users = [
    '######',
]

In [None]:
# Drop posts whose title is a removal placeholder or whose author is
# excluded, in a single pass.
posts = posts.loc[
    ~(
        posts['title'].isin(remove_titles)
        | posts['author'].isin(remove_users)
    )
].copy()
print(f'N={len(posts)}')

**Clean text:**

In [None]:
# Clean titles and bodies separately; each call shows its own progress bar.
posts = posts.assign(
    clean_title=posts['title'].progress_apply(clean_text),
    clean_body=posts['selftext'].progress_apply(clean_text),
)

**Replace `[deleted]` and `[removed]` with empty strings:**

In [None]:
# Bodies that are only a deletion/removal placeholder carry no content.
posts['clean_body'] = posts['clean_body'].mask(
    posts['clean_body'].isin(['[deleted]', '[removed]']),
    '',
)

**Concatenate post titles and post bodies:**

In [None]:
# Title and body joined by a space, then whitespace-normalized so an empty
# body does not leave a trailing space.
posts['text_representation'] = (
    (posts['clean_title'] + ' ' + posts['clean_body'])
    .progress_apply(lambda doc: ' '.join(doc.split()))
)

In [None]:
# Spot-check ten randomly sampled post text representations.
posts['text_representation'].sample(n=10).tolist()

**Grab token counts and tokenized lists:**

In [None]:
# Use all cores but one, with a floor of one worker: `cpu_count() - 1` is 0
# on a single-core machine, and joblib rejects n_jobs=0.
token_vals = Parallel(n_jobs=max(1, mp.cpu_count() - 1))(
    delayed(tokenizer_processor)(doc)
    for doc in tqdm(posts['text_representation'].tolist())
)
# Each result is an (n_tokens, tokens) pair; transpose into two columns.
posts['n_tokens'], posts['tokens'] = zip(*token_vals)

**Get processed texts and full_ids:**

In [None]:
# Keep only the identifier and the derived text/token columns.
posts_processed = posts.loc[
    :,
    ['full_id', 'text_representation', 'tokens', 'n_tokens'],
].copy()

In [None]:
# Preview the first few processed posts.
posts_processed.head()

In [None]:
# Inspect the token lists of three randomly sampled posts.
posts_processed['tokens'].sample(n=3).tolist()

## Comments

- Remove any comments that are simply: `['[deleted]', '[removed]', '[ Removed by Reddit ]']`
- Remove comments from known bots: `['######', '######']` (anonymized for public view)

In [None]:
# Comment bodies that are only a deletion/removal placeholder.
remove_comments = [
    '[deleted]',
    '[removed]',
    '[ Removed by Reddit ]',
]

# Bot accounts whose comments are dropped (anonymized for public view).
remove_comment_users = [
    '######',
    '######',
]

In [None]:
# Drop placeholder comments and comments from excluded accounts in one pass.
comments = comments.loc[
    ~(
        comments['body'].isin(remove_comments)
        | comments['author'].isin(remove_comment_users)
    )
].copy()
len(comments)

**Clean text:**

In [None]:
# Use all cores but one, with a floor of one worker: `cpu_count() - 1` is 0
# on a single-core machine, and joblib rejects n_jobs=0.
comments['text_representation'] = Parallel(n_jobs=max(1, mp.cpu_count() - 1))(
    delayed(clean_text)(doc)
    for doc in tqdm(comments['body'].tolist())
)

In [None]:
# Spot-check ten randomly sampled cleaned comments.
comments['text_representation'].sample(n=10).tolist()

**Grab token counts and tokenized lists:**

In [None]:
# Use all cores but one, with a floor of one worker: `cpu_count() - 1` is 0
# on a single-core machine, and joblib rejects n_jobs=0.
token_vals = Parallel(n_jobs=max(1, mp.cpu_count() - 1))(
    delayed(tokenizer_processor)(doc)
    for doc in tqdm(comments['text_representation'].tolist())
)
# Each result is an (n_tokens, tokens) pair; transpose into two columns.
comments['n_tokens'], comments['tokens'] = zip(*token_vals)

**Get processed texts and full_ids:**

In [None]:
# Keep only the identifier and the derived text/token columns.
comments_processed = comments.loc[
    :,
    ['full_id', 'text_representation', 'tokens', 'n_tokens'],
].copy()

In [None]:
# Preview the first few processed comments.
comments_processed.head()

In [None]:
# Inspect the token list of one randomly sampled comment.
comments['tokens'].sample(n=1).tolist()

### Concatenate Dataframes
- Combine `posts_processed` and `comments_processed` into a single dataframe.
- Ensure only unique `full_id`s are present.
- Only keep posts/comments with a minimum of `10` tokens.
  - This helps eliminate short posts/comments that may not contain rich topical content.
- Save text representations.
  - The `text_representation` column will be used for extracting topic keywords.
  - The `tokens` column will be used for creating embeddings and clustering.
- Then, explode the lists of tokens.
  - First, call the `stringify_lists` function.
  - This is necessary for when multiple observations occur for a single `full_id`.
  - Multiple observations occur when a post/comment was batched into sublists when their token count > `382`.


In [None]:
# Tag every row with its origin so posts and comments stay distinguishable
# after they are combined.
posts_processed = posts_processed.assign(source='post')
comments_processed = comments_processed.assign(source='comment')

In [None]:
# Stack posts on top of comments; the original row indices are kept.
frames_to_stack = (posts_processed, comments_processed)
training_data = pd.concat(list(frames_to_stack), axis=0)
del frames_to_stack

In [None]:
# Display the combined posts + comments frame.
training_data

In [None]:
# Every post/comment must appear exactly once before batching.
assert training_data['full_id'].is_unique
print(f'Unique posts/comments: {training_data.shape[0]}')

**Filter tokens:**

In [None]:
# Discard texts with fewer than `token_minimum` tokens.
token_minimum = 10
training_data = training_data[
    training_data['n_tokens'].ge(token_minimum)
].copy()
# Show the remaining token-count range as a (min, max) pair.
(training_data['n_tokens'].min(), training_data['n_tokens'].max())

In [None]:
# Display the frame after the minimum-token filter.
training_data

**Save text representations:**

In [None]:
# One row per post/comment: identifier, cleaned text, and origin.
text_reps = training_data.loc[
    :,
    ['full_id', 'text_representation', 'source'],
]
text_reps

In [None]:
output_path = os.path.join(
    '..',
    'data',
    'training',
    'text_representations.csv'
)

# `to_csv` does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

text_reps.to_csv(
    output_path,
    index=False
)

**Stringify lists and explode observations:**

In [None]:
# Turn each row's tokens into a list of joined strings (one per batch).
training_data['tokens'] = training_data['tokens'].progress_apply(stringify_lists)
# The full text was already saved separately, so it is dropped here.
training_data = training_data.drop(columns=['text_representation'])

In [None]:
# Preview the frame after stringifying, before exploding into rows.
training_data.head()

In [None]:
# Expand each list of batch strings into one row per batch; the other
# columns are repeated for every batch of the same `full_id`.
training_data = training_data.explode(column='tokens').copy()

In [None]:
# Preview the exploded frame (one row per token batch).
training_data.head()

In [None]:
# Row count after exploding, i.e. the number of token batches.
print(f'N total observations: {training_data.shape[0]}')

### Save Training Data

In [None]:
output_path = os.path.join(
    '..',
    'data',
    'training',
    'training_data.csv'
)

# `to_csv` does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

training_data.to_csv(
    output_path,
    index=False
)

`<-- Complete -->`