In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
import os, time, random, math, re, json, string, sys, datetime, textwrap
import matplotlib.pyplot as plt
from tqdm import tqdm
import keras_nlp
from tokenizer import *
import heapq
from collections import Counter

Using TensorFlow backend


In [3]:
# Turn on TensorFlow's memory-growth setting for every physical GPU and
# print the value TensorFlow reports back for each device.
physical_gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    growth_enabled = tf.config.experimental.get_memory_growth(gpu)
    print(f'{gpu} memory growth: {growth_enabled}')

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True


In [5]:
def load_data(path: str = "tmp/posts.txt",
              max_chars: int = 10_000_000,
              vocab_sample_tokens: int = 100_000):
    """Load the corpus, tokenize it, and build a frequency-sorted vocabulary.

    Args:
        path: UTF-8 encoded text file to read.
        max_chars: Keep only the first ``max_chars`` characters of the decoded
            text (``None`` keeps everything).
        vocab_sample_tokens: The vocabulary is built from only the first
            ``vocab_sample_tokens`` tokens of the tokenized text.

    Returns:
        A 3-tuple ``(all_text, all_tokens, vocab)``:
        * ``all_text`` - the (possibly truncated) decoded text;
        * ``all_tokens`` - whatever ``tokenizer(all_text)`` returns
          (``tokenizer`` is a module-level name, presumably supplied by the
          notebook's ``from tokenizer import *`` - TODO confirm);
        * ``vocab`` - the unique tokens of the sampled prefix, most frequent
          first, ties broken by ascending token order.

    Side effects:
        Prints the vocabulary size, the first 1000 characters of the text and
        the number of segments obtained by splitting on ``"<SPECIAL:END>"``.
    """
    # Read as bytes and decode explicitly (no newline translation); the
    # context manager guarantees the file handle is closed.
    with open(path, "rb") as corpus_file:
        all_text = corpus_file.read().decode(encoding="utf-8")
    all_text = all_text[:max_chars]

    all_tokens = tokenizer(all_text)
    # Only a prefix of the token stream is used to build the vocabulary.
    vocab_sample = all_tokens[:vocab_sample_tokens]

    # Calculate token frequencies using Counter
    token_counts = Counter(vocab_sample)

    # Order by descending count, then ascending token. This is exactly the
    # order produced by repeatedly popping (-count, token) tuples off a heap.
    unique_tokens_sorted_by_frequency = [
        token
        for token, _count in sorted(
            token_counts.items(), key=lambda item: (-item[1], item[0])
        )
    ]

    # Print the length of vocab and other information
    print(len(unique_tokens_sorted_by_frequency))
    print(all_text[:1000])
    print(len(all_text.split("<SPECIAL:END>")))

    return all_text, all_tokens, unique_tokens_sorted_by_frequency

In [6]:
# Load the corpus text, its token sequence, and the vocabulary
# (unique tokens ordered from most to least frequent).
all_text, all_tokens, vocab = load_data()

6707
I am a third year (starting fourth year in the fall) PhD student in mathematics. I've passed all qualifying exams and am currently doing research. As far as I can tell, I am not doing poorly. I have the good fortune of having a great advisor, being in a very supportive department, and having friends and family who genuinely care about my success. 

The fact is research is hard. It appears to consist primarily of staring at a problem for days and days and days without getting anywhere. Sometimes, rarely, I do figure something out and that feels wonderful, but the overwhelming majority of my time appears to be spent banging my head against a mostly figurative wall. I am not complaining about the material being hard, and I am not averse to putting in hard work, but I get frequently discouraged when I realize the vast volumes of mathematics that I yet know nothing about (and probably never will). It's very hard to quantify progress - in particular, there are too few tangible returns a

In [9]:
# Display the vocabulary in reverse order, i.e. least frequent tokens first.
# NOTE(review): this renders the entire list as cell output; consider
# slicing (e.g. the first few dozen entries) before sharing the notebook.
vocab[::-1]

['基于mathematica的数值计算方法',
 '∞',
 'β',
 'α',
 '´',
 'zany',
 'youtube',
 'youth',
 'yours',
 'young-ish',
 "you'll",
 'yearly',
 'year-old',
 'yale',
 'xmlui',
 'xfig',
 "x'",
 'ww2',
 'wrongfully',
 'wrongdoing',
 'wrong-doing',
 'writes',
 'wrestle',
 'worthy',
 'worthless',
 'worst',
 'worsens',
 'worsening',
 'worldwide',
 'workouts',
 'workload',
 'work-ethos',
 'wordy',
 'woods',
 'wonderful',
 'women-only',
 'witnesses',
 'withdrawing',
 'withdrawal',
 'wished',
 'winter',
 'wing',
 'wind',
 'wilderness',
 'width',
 'widespread',
 'wide',
 'whiteboard',
 'whistle',
 'whisper',
 'whine',
 'whilst',
 'wherein',
 'wherefore',
 'wet-lab',
 'well-written',
 'well-regulated',
 'well-regarded',
 'well-reasoned',
 'well-established',
 'well-documented',
 'well-designed',
 'well-defined',
 'well-being',
 'well-aware',
 'weighted',
 'weighing',
 'weigh',
 'weeping',
 'wednesday',
 'weapon',
 "we've",
 "we'll",
 'waypoint',
 'waves',
 'watson',
 'watertight',
 'waterfall',
 'water',
 'watche