In [1]:
%matplotlib inline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import pickle
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split

import unicodedata
import re
import os
import time
import shutil
import requests
import tarfile
import glob

import argparse
from tokenize import tokenize, untokenize, COMMENT, STRING, NEWLINE, ENCODING, ENDMARKER, NL, INDENT, NUMBER
from io import BytesIO
import json

import pandas as pd
import numpy as np
import string, os
tf.__version__

'2.4.0'

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [3]:
with open("train_sent.txt", "r") as fp: 
    train_sent = fp.readlines()
with open("test_sent.txt", "r") as fp:
    test_sent = fp.readlines()
with open("full_corpus.txt", "r") as fp:
    full_corpus = fp.readlines()

In [4]:
train_sent[0:15]

['e = enumerate\n',
 'n , * a = map ( int , open ( <NUM_LIT:0> ) . read ( ) . split ( ) )\n',
 'd = [ <NUM_LIT:0> ]\n',
 'for j , ( a , i ) in e ( sorted ( ( a , i ) for i , a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ) , d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1> , j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ]\n',
 'print ( max ( d ) )\n',
 '\n',
 'N = int ( input ( ) )\n',
 'A = list ( map ( int , input ( ) . split ( ) ) )\n',
 'table = [ ]\n',
 'for i , a in enumerate ( A ) :\n',
 'table . append ( [ a , i ] )\n',
 'table . sort ( )\n',
 'DP = [ [ <NUM_LIT:0> for i in range ( N + <NUM_LIT:1> ) ] for j in range ( N + <NUM_LIT:1> ) ]\n',
 'for i in range ( <NUM_LIT:1> , N + <NUM_LIT:1> ) :\n',
 'baby , pos = table . pop ( )\n']

In [5]:
len(full_corpus)

240000

In [6]:
batch_size=1000
all_texts = [full_corpus[i : i+batch_size] for i in range(0, len(full_corpus), batch_size)]

In [7]:
len(all_texts)

240

In [8]:
def batch_iterator():
    for i in range(0, len(full_corpus), batch_size):
        yield full_corpus[i : i + batch_size]

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [10]:
tokenizer.is_fast

True

In [11]:
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=25000)

In [12]:
# new_tokenizer.add_special_tokens({
#   "eos_token": "</s>",
#   "bos_token": "<s>",
#   "unk_token": "<unk>",
#   "pad_token": "<pad>",
#   "mask_token": "<mask>"
# })

In [13]:
new_tokenizer(full_corpus[:1])

{'input_ids': [[68, 178, 557, 168, 170, 30, 196, 183, 215, 185, 178, 232, 172, 204, 183, 805, 172, 168, 177, 63, 175, 26, 16, 30, 171, 189, 296, 172, 171, 189, 228, 172, 171, 171, 168, 170, 30, 197, 178, 181, 168, 177, 63, 175, 26, 16, 30, 180, 168, 170, 30, 198, 231, 183, 172, 185, 183, 173, 171, 179, 220, 172, 463, 172, 172, 185, 183, 173, 171, 198, 173, 183, 185, 179, 220, 172, 185, 171, 171, 181, 182, 182, 194, 168, 177, 63, 175, 26, 17, 30, 180, 171, 182, 197, 178, 181, 197, 181, 168, 177, 63, 175, 26, 16, 30, 180, 192, 185, 215, 388, 172, 196, 194, 231, 194, 173, 194, 168, 177, 63, 175, 26, 17, 30, 171, 180, 192, 181, 306, 172, 197, 181, 254, 180, 192, 185, 215, 388, 172, 196, 194, 231, 192, 254, 194, 173, 194, 168, 177, 63, 175, 26, 17, 30, 171, 183, 197, 181, 254, 194, 168, 177, 63, 175, 26, 17, 30, 180, 192, 185, 215, 388, 172, 173, 194, 254, 192, 168, 177, 63, 175, 26, 17, 30, 171, 171, 198, 254, 179, 216, 172, 168, 177, 63, 175, 26, 17, 30, 183, 231, 192, 168, 177, 63, 175, 

In [14]:
full_corpus[:1]

['e = enumerate <EOL> n , * a = map ( int , open ( <NUM_LIT:0> ) . read ( ) . split ( ) ) <EOL> d = [ <NUM_LIT:0> ] <EOL> for j , ( a , i ) in e ( sorted ( ( a , i ) for i , a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ) , d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1> , j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ] <EOL> print ( max ( d ) )\n']

In [15]:
new_tokenizer.save_pretrained("code-tokenizer")

('code-tokenizer/tokenizer_config.json',
 'code-tokenizer/special_tokens_map.json',
 'code-tokenizer/vocab.json',
 'code-tokenizer/merges.txt',
 'code-tokenizer/added_tokens.json',
 'code-tokenizer/tokenizer.json')

In [16]:
encoded_input = new_tokenizer("Hello, I'm a single sentence!")

In [17]:
encoded_input

{'input_ids': [14265, 657, 12, 382, 7, 76, 185, 4563, 3612, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
new_tokenizer.decode(encoded_input["input_ids"])

"Hello, I'm a single sentence!"

In [19]:
encoded_input = new_tokenizer(full_corpus[0])

In [20]:
encoded_input

{'input_ids': [68, 178, 557, 168, 170, 30, 196, 183, 215, 185, 178, 232, 172, 204, 183, 805, 172, 168, 177, 63, 175, 26, 16, 30, 171, 189, 296, 172, 171, 189, 228, 172, 171, 171, 168, 170, 30, 197, 178, 181, 168, 177, 63, 175, 26, 16, 30, 180, 168, 170, 30, 198, 231, 183, 172, 185, 183, 173, 171, 179, 220, 172, 463, 172, 172, 185, 183, 173, 171, 198, 173, 183, 185, 179, 220, 172, 185, 171, 171, 181, 182, 182, 194, 168, 177, 63, 175, 26, 17, 30, 180, 171, 182, 197, 178, 181, 197, 181, 168, 177, 63, 175, 26, 16, 30, 180, 192, 185, 215, 388, 172, 196, 194, 231, 194, 173, 194, 168, 177, 63, 175, 26, 17, 30, 171, 180, 192, 181, 306, 172, 197, 181, 254, 180, 192, 185, 215, 388, 172, 196, 194, 231, 192, 254, 194, 173, 194, 168, 177, 63, 175, 26, 17, 30, 171, 183, 197, 181, 254, 194, 168, 177, 63, 175, 26, 17, 30, 180, 192, 185, 215, 388, 172, 173, 194, 254, 192, 168, 177, 63, 175, 26, 17, 30, 171, 171, 198, 254, 179, 216, 172, 168, 177, 63, 175, 26, 17, 30, 183, 231, 192, 168, 177, 63, 175, 2

In [21]:
new_tokenizer.decode(encoded_input["input_ids"])

'e = enumerate <EOL> n, * a = map ( int, open ( <NUM_LIT:0> ). read ( ). split ( ) ) <EOL> d = [ <NUM_LIT:0> ] <EOL> for j, ( a, i ) in e ( sorted ( ( a, i ) for i, a in e ( a ) ) [ : : - <NUM_LIT:1> ] ) : d = [ d [ <NUM_LIT:0> ] + a * abs ( n - j - i - <NUM_LIT:1> ) ] + [ max ( d [ k ] + a * abs ( n - j + k - i - <NUM_LIT:1> ), d [ k - <NUM_LIT:1> ] + a * abs ( i - k + <NUM_LIT:1> ) ) for k in range ( <NUM_LIT:1>, j + <NUM_LIT:1> ) ] + [ d [ j ] + a * abs ( i - j ) ] <EOL> print ( max ( d ) )\n'