# Preprocess text

TODO:

    - Remove quoted text? Or maybe not, since it provides context.
    - Find common acronyms and synonyms. E.g. sm64, diablo 2 -> diablo_ii
    - The `encode_decode` function removes non-English characters. How many game names are written in kanji or other non-Latin scripts?

In [2]:
import sys
sys.path.append(r'../')

import ujson as json
import pickle
import multiprocess as mp
import pandas as pd
import nltk
from nltk.tokenize import MWETokenizer
import ftfy
from textacy import preprocessing as textacy_preproc
from src.utils.reddit_data import RedditData
from src.utils import text_preprocessing
from src.utils.resources import CONTRACTIONS

## Create game tokens

We want to convert game names to single tokens. E.g. "Super Mario 64" to "super_mario_64". To identify games, we will use a list of all games on Twitch.

First, we will preprocess the game names.

In [2]:
# Build the normalization pipeline applied to every game name.
# The steps run in the order listed.
preproc = textacy_preproc.make_pipeline(
    text_preprocessing.to_lower,
    ftfy.fix_text,
    textacy_preproc.normalize.unicode,
    textacy_preproc.normalize.quotation_marks,
    textacy_preproc.remove.accents,
    textacy_preproc.remove.punctuation,
    textacy_preproc.replace.currency_symbols,
    text_preprocessing.replace_contractions,
    text_preprocessing.encode_decode,
    textacy_preproc.normalize.whitespace
)

# Load games (one JSON record per line)
games_file = r'../data/interim/game_ids_and_names.jsonl'
games = pd.read_json(games_file, lines=True)

# Filter out games missing names.
# Take an explicit copy so the column assignments below write to an independent
# frame instead of a slice of the loaded one (avoids pandas' SettingWithCopyWarning).
games = games[games['name'].notnull()].copy()

# Preprocess game names: normalized text first, then an underscore-joined token
games['normalized'] = [preproc(game) for game in games['name']]
games['token'] = ['_'.join(nltk.word_tokenize(str(game))) for game in games['normalized']]

# Save the normalized tokens. These will be used later for calculating similarities.
games.to_json('../data/interim/game_tokens.jsonl', orient='records', lines=True)

games

Unnamed: 0,id,name,normalized,token
1,1,Breakfree,breakfree,breakfree
2,2,Hyperballoid Deluxe: Survival Pack,hyperballoid deluxe survival pack,hyperballoid_deluxe_survival_pack
3,3,Bass Avenger,bass avenger,bass_avenger
4,4,The Chessmaster 2000,the chessmaster 2000,the_chessmaster_2000
5,5,Desert Strike: Return to the Gulf,desert strike return to the gulf,desert_strike_return_to_the_gulf
...,...,...,...,...
98319,2147110739,Planet Rescue: Wildlife Vet,planet rescue wildlife vet,planet_rescue_wildlife_vet
98320,2147114054,Bubble Breaking,bubble breaking,bubble_breaking
98321,2147244070,Big Cock Simulator,big cock simulator,big_cock_simulator
98322,2147344492,Mr. Bean's Wacky World,mr bean s wacky world,mr_bean_s_wacky_world


There are 1,726 normalized game names that are duplicated. Some of these appear to be actual duplicates, but this won't affect the final results, since we are going to look up "similarity" using the game ids and we want the embedding scores to be the same when different game ids refer to the same game. However, many names only become duplicates because punctuation is removed, e.g. "Zombies", "Zombies!" and "Zombies!!!" might refer to different games. Names written entirely in non-Latin scripts also collapse to the same empty token, as the first rows of the table below show.

In [3]:
# Every row whose token is shared with at least one other game, ordered by token
dupes = games.loc[games.duplicated(subset='token', keep=False)].sort_values(by='token')
dupes

Unnamed: 0,id,name,normalized,token
59048,17741681,人工灭绝,,
87186,1543328458,东方华彩乱战,,
90395,1711326006,觅长生,,
88921,1631367032,飄流幻境,,
90304,1706136320,ПИР: Книга Первая «Семейные узы»,,
...,...,...,...,...
97195,2084047683,Zombies!,zombies,zombies
19598,23811,Zoom,zoom,zoom
12364,14763,Zoom!,zoom,zoom
96835,2063716522,魔物娘と不思議な冒険～力の宝珠と帰還の塔～,~~,~~


Next, we create the multi-word tokenizer:

In [4]:
# problem: the MWE doesn't seem to capture some games. e.g. dark_souls_2 instead of dark_souls 2
# does this have to do with the order in which the MWE tokenizer checks for matches?
# Sort the tokens from longest to shortest (by character count) to see if this
# makes longer MWEs get tokenized first.

# Skip empty tokens: a name that normalizes to '' would otherwise be split into
# [''] and registered as a multi-word expression.
unique_games = [tok for tok in set(games['token']) if tok]
unique_games.sort(key=len)
unique_games.reverse()

# Each game token becomes the list of words that make up the expression
game_tokens = [tok.split('_') for tok in unique_games]

mwe_tokenizer = MWETokenizer(game_tokens)

# Add extra phrases to MWE tokenizer
contractions = list(CONTRACTIONS.values())
other_phrases = ['any%', '100%', 'speedrun com', 'twitch com', 'twitch tv', 'speed run', 'speed runs', 'speed running', 'e sport', 'e sports']

for phrase in contractions + other_phrases:
    mwe_tokenizer.add_mwe(nltk.word_tokenize(phrase))

# Save MWE tokenizer so it can be loaded again when preprocessing the posts
with open(r'../models/mwe_tokenizer.pickle', 'wb') as fh:
    pickle.dump(mwe_tokenizer, fh)

# Check the MWE:
# games['mwe'] = [' '.join(mwe_tokenizer.tokenize(nltk.word_tokenize(str(game)))) for game in games['normalized']]

## Preprocess Reddit posts

The cell below repeats the imports and setup inside the worker function so that multiprocessing works in a Jupyter notebook.

In [5]:
if __name__ == '__main__':

    # Reader over the raw Reddit dumps; its `all_files` listing is what the
    # worker pool at the end of this cell maps over.
    posts = RedditData(
        comments_path='../data/raw/reddit/comments',
        submissions_path='../data/raw/reddit/submissions'
    )

    def process_file(f):
        """Preprocess one raw Reddit file and write the kept posts as gzipped JSONL.

        All imports and helper objects are created inside the function so that
        it is self-contained when executed in a pool worker process.

        Args:
            f: Subscriptable record with a ``'file'`` entry (an object exposing
                ``.stem``; presumably a ``pathlib.Path`` -- confirm against
                ``RedditData.all_files``) and a ``'post_type'`` entry.

        Returns:
            The number of posts written to the output file.
        """
        from pathlib import Path
        import gzip
        import json
        import pickle
        import ftfy
        from textacy import preprocessing as textacy_preproc
        from src.utils.reddit_data import RedditData
        from src.utils import text_preprocessing

        # Load the tokenizer saved by the earlier cell; the context manager
        # makes sure the file handle is closed in every worker.
        with open(r'../models/mwe_tokenizer.pickle', 'rb') as fh:
            mwe_tokenizer = pickle.load(fh)

        def merge_mwes(text: str) -> str:
            """Concatenate multiword expressions using underscore"""

            return text_preprocessing.merge_mwes(text, mwe_tokenizer)

        # Steps run in the order listed.
        preproc = textacy_preproc.make_pipeline(
            text_preprocessing.to_lower,
            text_preprocessing.replace_special_characters,
            ftfy.fix_text,
            textacy_preproc.normalize.unicode,
            textacy_preproc.normalize.quotation_marks,
            textacy_preproc.remove.accents,
            text_preprocessing.replace_markdown_url,
            text_preprocessing.replace_urls,
            text_preprocessing.normalize_domains,
            text_preprocessing.replace_times,
            text_preprocessing.replace_money,
            text_preprocessing.replace_emojis,
            text_preprocessing.replace_currency_symbols,
            text_preprocessing.normalize_subreddit,
            text_preprocessing.replace_contractions,
            text_preprocessing.remove_punctuation,
            merge_mwes,
            text_preprocessing.replace_numbers,
            text_preprocessing.encode_decode,
            text_preprocessing.replace_repeating_tokens,
            textacy_preproc.normalize.whitespace
        )

        posts = RedditData(
            comments_path='../data/raw/reddit/comments',
            submissions_path='../data/raw/reddit/submissions'
        )

        file_name = f['file']
        post_type = f['post_type']
        parsed_posts = posts.read_file(file_name, post_type)
        outpath = Path(f'../data/interim/reddit_{post_type}_{file_name.stem}.jsonl.gz')
        lines_out = 0

        with gzip.open(outpath, 'wt', encoding='utf-8') as file_out:
            for post in parsed_posts:
                post['text'] = preproc(post['text'])

                # Skip posts that the filter flags after preprocessing
                if text_preprocessing.filter_post(post['text']):
                    continue

                line = json.dumps(post)
                file_out.write(line + '\n')

                lines_out += 1

        return lines_out

    # Use all but two cores, but never request fewer than one worker:
    # a pool size of zero or less is rejected with ValueError.
    n_cpu = max(1, mp.cpu_count() - 2)
    with mp.Pool(processes=n_cpu) as pool:
        # One task per input file; each result is the number of posts written
        result = pool.map(process_file, posts.all_files)
        print(result)

[5816250, 6217561, 5959294, 5625476, 5603319, 5797814, 5743643, 6027611, 6026264, 5707875, 5784793, 6154251, 5482489, 5875928, 5715465, 5604187, 5511348, 5884667, 5727056, 6016020, 5830300, 434552, 459334, 433477, 454558, 447906, 422165, 462622, 437270, 448749, 426112, 426201, 457442, 439547, 442071, 455403, 431036, 440270, 437862, 462397, 442543, 444844]


# Add n-grams

In [3]:
from gensim.models.phrases import Phrases, Phraser
import pandas as pd
from src.utils.resources import ENGLISH_CONNECTOR_WORDS
from src.utils.text_preprocessing import GensimCorpus
from pathlib import Path

# Alternative corpora used during development:
# corpus = GensimCorpus(r'../data/interim/all_reddit_posts.jsonl.gz', sample=10000)
# corpus = GensimCorpus(r'../data/processed/esports_speedrun_reddit_posts.txt')

# Collect every preprocessed Reddit file from the interim folder
files = list(Path(r'../data/interim/').glob('reddit_*.jsonl.gz'))
corpus = GensimCorpus(files)

# Fit the phrase model on the corpus and persist it
phrases = Phrases(corpus, min_count=10, common_terms=ENGLISH_CONNECTOR_WORDS)
phrases.save(r'../models/phrases_jan_4.bg')

# Export the detected phrases and count how often each (phrase, score) pair occurs
phrase_export = phrases.export_phrases(corpus)
bigrams_df = pd.DataFrame(phrase_export)
bigrams_df.columns = ['phrase', 'score']
bigram_counts = (
    bigrams_df
    .groupby(['phrase', 'score'])['phrase']
    .count()
    .reset_index(name='count')
)
bigram_counts.sort_values(by='count', ascending=False)

Unnamed: 0,phrase,score,count
34609,b'gon na',11.922062,1223959
88287,b'wan na',11.765868,384263
7218,b'assassins creed',55.671868,298150
65502,b'questions or objections',15.444972,211469
35744,b'gta v',12.540109,202261
...,...,...,...
23374,b'ding dong',271.477877,1
55643,b'navy seals',440.834743,1
90848,b'xclass hardware',10.814350,1
3864,b'advisory stickers',10.435145,1


In [4]:
# Persist the phrase counts. to_csv is called with its defaults, so the frame's
# index is written as the first, unnamed column.
bigram_counts.to_csv(r'../models/bigram_phrase_counts.csv')

# Filter bigrams

In [None]:
# The CSV was saved together with its index as the first column. Read that
# column back as the index so it does not surface as a spurious "Unnamed: 0"
# data column.
bigram_counts = pd.read_csv(r'../models/bigram_phrase_counts.csv', index_col=0)

In [None]:
# bigram_counts.sort_values('count', ascending=False).head(100)
# x = bigram_counts.iloc[263].phrase
# These two bytes are the UTF-8 encoding of the degree sign (U+00B0).
# Decoding them with 'unicode_escape' reads each byte as Latin-1 and yields
# the two-character mojibake 'Â°', so decode as UTF-8 instead.
x = b'\xc2\xb0'
x.decode('utf-8')

In [None]:
import textacy.preprocessing
# Filter out phrases that are uncommon or have low score

# Filter out anything with _SUBREDDIT_ in it


def ignore(text: str) -> bool:
    """Return True when a phrase should be dropped from the bigram list.

    A phrase is dropped when it contains one of the marker substrings, or when
    three or fewer characters remain after punctuation is removed and
    whitespace is normalized.
    """
    markers = {'_SUBREDDIT_', '_EMOJI_', r'\x'}
    if any(marker in text for marker in markers):
        return True

    # Measure the length of what is left once punctuation is gone
    stripped = textacy.preprocessing.normalize.whitespace(
        textacy.preprocessing.remove.punctuation(text)
    )
    return len(stripped) <= 3

# Print every phrase the filter would discard, for manual inspection
for phrase in bigram_counts['phrase']:
    if ignore(phrase):
        print(phrase)