<a href="https://colab.research.google.com/github/sdf10528236/cake/blob/main/coderapir_basic_s2s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import numpy as np
import regex
import string
import random
import pandas as pd
import tensorflow as tf




In [34]:
# Character vocabularies: the ASCII letters in code-point order (upper case
# first, then lower case) followed by the punctuation used in the samples.
INPUT_CHARS = string.ascii_uppercase + string.ascii_lowercase + ' (),;."'

# The output vocabulary is built exactly like the input vocabulary.
OUTPUT_CHARS = string.ascii_uppercase + string.ascii_lowercase + ' (),;."'


In [35]:
def data_str_to_ids(date_str, chars):
    """Map every character of ``date_str`` to its position in ``chars``.

    Args:
        date_str: The string to encode, one character at a time.
        chars: The vocabulary; a character's ID is its first index in it.

    Returns:
        A list with one integer index per character of ``date_str``.

    Raises:
        ValueError: If a character of ``date_str`` does not occur in
            ``chars``. The message names the offending character and the
            input it came from.
    """
    ids = []
    for c in date_str:
        try:
            ids.append(chars.index(c))
        except ValueError:
            # Re-raise with context: the built-in message does not say which
            # character or which sample was at fault.
            raise ValueError(
                f"character {c!r} of {date_str!r} is not in the vocabulary"
            ) from None
    return ids


def prepare_date_strs(data_strs, chars=INPUT_CHARS):
    """Encode a batch of strings as one zero-padded integer tensor.

    Each string is converted to vocabulary indices with ``data_str_to_ids``.
    The indices are then shifted up by one so that ID 0 stays free to act as
    the padding token.
    """
    encoded = tf.ragged.constant(
        [data_str_to_ids(text, chars) for text in data_strs], ragged_rank=1)
    # Shift before densifying, so the fill value 0 never collides with a
    # real character ID.
    shifted = encoded + 1
    return shifted.to_tensor()


def create_dataset(x, y):
    """Encode paired inputs and targets into padded ID tensors.

    ``x`` is encoded with ``INPUT_CHARS`` and ``y`` with ``OUTPUT_CHARS``;
    the two tensors are returned as an ``(inputs, targets)`` tuple.
    """
    inputs = prepare_date_strs(x, INPUT_CHARS)
    targets = prepare_date_strs(y, OUTPUT_CHARS)
    return inputs, targets




In [36]:
df = pd.read_csv('printf_100thouthands.csv')

# Row positions at which the data is cut into train / validation / test.
TRAIN_END = 60000
VALID_END = 80000

# .iloc makes it explicit that the splits are positional.
X_train, Y_train = create_dataset(
    df['wrong'].iloc[:TRAIN_END], df['correct'].iloc[:TRAIN_END])
X_valid, Y_valid = create_dataset(
    df['wrong'].iloc[TRAIN_END:VALID_END],
    df['correct'].iloc[TRAIN_END:VALID_END])
# The test split is open-ended so that it reaches the last row of the file;
# an exclusive stop of 99999 would leave out the row at position 99999.
# NOTE(review): assumes the CSV holds about 100,000 rows - confirm.
X_test, Y_test = create_dataset(
    df['wrong'].iloc[VALID_END:], df['correct'].iloc[VALID_END:])


In [37]:
# Inspect one encoded training example; IDs of 0 are padding.
X_train[1]

<tf.Tensor: shape=(23,), dtype=int32, numpy=
array([42, 44, 35, 40, 46, 32, 59,  3, 12, 47,  7, 43,  7, 38,  7, 59, 55,
       57,  0,  0,  0,  0,  0], dtype=int32)>

## First version: a very basic seq2seq model

In [38]:
from tensorflow import keras
# Size of the character embedding vectors used by the encoder.
embedding_size = 32
# Number of time steps in the padded target tensor; RepeatVector below makes
# the decoder emit exactly this many steps per example.
max_output_length = Y_train.shape[1]

# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

# Encoder: embeds the input token IDs and returns only the LSTM's last output
# (return_sequences is left at its default). The vocabulary size is
# len(INPUT_CHARS) + 1 because ID 0 is reserved for padding.
encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                            output_dim=embedding_size,
                            input_shape=[None]),
    keras.layers.LSTM(128)
])

# Decoder: an LSTM that returns its output at every step, followed by a
# softmax over the output characters plus the padding ID.
decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

# RepeatVector copies the encoder's output max_output_length times, so the
# decoder receives the same vector as its input at every time step.
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = keras.optimizers.Nadam()
# The targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:


def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of token IDs back into strings.

    ID 0 (padding) is rendered as a space; an ID ``k`` between 1 and
    ``len(chars)`` is rendered as ``chars[k - 1]``. One string is returned
    per sequence in ``ids``.
    """
    # Prepending a space lines the lookup string up with the 1-based IDs.
    table = " " + chars
    decoded = []
    for sequence in ids:
        decoded.append("".join([table[index] for index in sequence]))
    return decoded

In [40]:
# Width of the padded training inputs; inference batches are padded to match.
max_input_length = X_train.shape[1]

def prepare_date_strs_padded(date_strs):
    """Encode ``date_strs`` and right-pad the batch to ``max_input_length``.

    Batches that are already at least ``max_input_length`` wide are returned
    as encoded, without truncation.
    """
    encoded = prepare_date_strs(date_strs)
    missing = max_input_length - encoded.shape[1]
    if missing > 0:
        # Pad only the time axis, on the right, with the padding ID 0.
        encoded = tf.pad(encoded, [[0, 0], [0, missing]])
    return encoded

def convert_date_strs(date_strs):
    """Run the current ``model`` on ``date_strs`` and return decoded strings.

    The strings are encoded and padded, the model predicts a distribution
    per output step, and the most probable token at each step is decoded.
    """
    batch = prepare_date_strs_padded(date_strs)
    probabilities = model.predict(batch)
    # Greedy choice along the vocabulary axis (the third axis).
    best_ids = np.argmax(probabilities, axis=2)
    return ids_to_date_strs(best_ids)

In [41]:
# Show the first six uncorrected samples of the test range.
tuple(df['wrong'][row] for row in range(80000, 80006))

('print("CvIzCutGqq");',
 'printf(hlQohZEEN");',
 'printf("HqBSU"),',
 'pritf("s");',
 'printf( ("oFeg");',
 'pritf("CQK");')

In [42]:
# Correct the same six samples with the first model.
convert_date_strs([df['wrong'][row] for row in range(80000, 80006)])

['printf("CvAoKqtpqq");',
 'printf("hOQzPVEEO"); ',
 'printf("HqBSU");     ',
 'printf("s");         ',
 'printf("oFUg");      ',
 'printf("CQK");       ']

# Second version: feeding the shifted targets to the decoder (teacher forcing)

In [43]:
# Start-of-sequence ID: one past the largest character ID (0 is padding,
# characters occupy 1..len(OUTPUT_CHARS)).
sos_id = len(OUTPUT_CHARS) + 1

def shifted_output_sequences(Y):
    """Build the decoder inputs for teacher forcing from the targets ``Y``.

    Every row is shifted one step to the right: a start-of-sequence token
    (``sos_id``) is placed in front and the last column is dropped, so the
    result has the same shape as ``Y``.
    """
    n_rows = len(Y)
    start_column = tf.fill(dims=(n_rows, 1), value=sos_id)
    all_but_last = Y[:, :-1]
    return tf.concat([start_column, all_but_last], axis=1)

X_train_decoder, X_valid_decoder, X_test_decoder = (
    shifted_output_sequences(targets)
    for targets in (Y_train, Y_valid, Y_test))



In [48]:
from tensorflow import keras
# Hyperparameters for the teacher-forcing model.
encoder_embedding_size = 32
decoder_embedding_size = 32
lstm_units = 128

# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

# Encoder: embed the input IDs and keep only the LSTM's final hidden and cell
# states (the output sequence itself is discarded).
encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
encoder_embedding = keras.layers.Embedding(
    input_dim=len(INPUT_CHARS) + 1,
    output_dim=encoder_embedding_size)(encoder_input)
_, encoder_state_h, encoder_state_c = keras.layers.LSTM(
    lstm_units, return_state=True)(encoder_embedding)
encoder_state = [encoder_state_h, encoder_state_c]

# Decoder: receives the shifted targets. Its embedding needs
# len(OUTPUT_CHARS) + 2 entries: the characters, the padding ID 0 and the
# start-of-sequence ID. The LSTM starts from the encoder's final state.
decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding = keras.layers.Embedding(
    input_dim=len(OUTPUT_CHARS) + 2,
    output_dim=decoder_embedding_size)(decoder_input)
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(
    decoder_embedding, initial_state=encoder_state)
# The start-of-sequence ID is never a target, so the output layer only covers
# the characters plus the padding ID.
decoder_output = keras.layers.Dense(len(OUTPUT_CHARS) + 1,
                                    activation="softmax")(decoder_lstm_output)

model = keras.models.Model(inputs=[encoder_input, decoder_input],
                           outputs=[decoder_output])

optimizer = keras.optimizers.Nadam()
# The targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit([X_train, X_train_decoder], Y_train, epochs=20,
                    validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
# Start-of-sequence ID and fixed decoder length used during inference.
sos_id = len(OUTPUT_CHARS) + 1
max_output_length = Y_train.shape[1]

def predict_date_strs(date_strs):
    """Correct ``date_strs`` by decoding one token at a time.

    The current two-input ``model`` is called once per output position. Each
    call receives the tokens decoded so far (right-padded with 0), and the
    most probable token at the current position is appended to them.
    """
    encoder_batch = prepare_date_strs_padded(date_strs)
    decoded = tf.fill(dims=(len(encoder_batch), 1), value=sos_id)
    for step in range(max_output_length):
        # Right-pad the tokens decoded so far to the fixed decoder length.
        remaining = max_output_length - decoded.shape[1]
        decoder_batch = tf.pad(decoded, [[0, 0], [0, remaining]])
        probabilities = model.predict([encoder_batch, decoder_batch])
        # Only the distribution at the current step is used.
        next_ids = tf.argmax(probabilities[:, step:step + 1], axis=-1,
                             output_type=tf.int32)
        decoded = tf.concat([decoded, next_ids], axis=1)
    # Drop the leading start-of-sequence column before decoding to text.
    return ids_to_date_strs(decoded[:, 1:])

In [50]:
# Show the first six uncorrected samples of the test range.
tuple(df['wrong'][row] for row in range(80000, 80006))

('printf("XpedDxYYU";',
 'printf("GlBs";',
 'print("ColNe");',
 'pintf("rQGIWCN");',
 'printf("diVPQmA";',
 'print("BgqhH");')

In [51]:
# Correct the same six samples with step-by-step decoding.
predict_date_strs([df['wrong'][row] for row in range(80000, 80006)])

['printf("XpedTDqYY"); ',
 'printf("GlBs");      ',
 'printf("ColNe");     ',
 'printf("rQGIWCN");   ',
 'printf("diVPQmA");   ',
 'printf("BgqhH");     ']

# Third version: using TF-Addons's seq2seq implementation

In [52]:
# You may need to install tensorflow_addons
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 41.3 MB/s eta 0:00:01[K     |▋                               | 20 kB 34.7 MB/s eta 0:00:01[K     |▉                               | 30 kB 17.9 MB/s eta 0:00:01[K     |█▏                              | 40 kB 16.2 MB/s eta 0:00:01[K     |█▌                              | 51 kB 17.5 MB/s eta 0:00:01[K     |█▊                              | 61 kB 14.5 MB/s eta 0:00:01[K     |██                              | 71 kB 14.4 MB/s eta 0:00:01[K     |██▍                             | 81 kB 16.1 MB/s eta 0:00:01[K     |██▋                             | 92 kB 14.8 MB/s eta 0:00:01[K     |███                             | 102 kB 13.6 MB/s eta 0:00:01[K     |███▎                            | 112 kB 13.6 MB/s eta 0:00:01[K     |███▌                            | 122 kB 13.6 MB/s eta 0:00:01[K

In [65]:
import tensorflow_addons as tfa

# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# NOTE(review): this input is not connected to the model built below.
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

encoder_embeddings = keras.layers.Embedding(
    len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)

# The decoder embeds *output* tokens, so its vocabulary is sized from
# OUTPUT_CHARS: the characters, the padding ID 0 and the start-of-sequence ID.
# The layer object is kept so it can be shared with an inference sampler.
decoder_embedding_layer = keras.layers.Embedding(
    len(OUTPUT_CHARS) + 2, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Only the encoder's final hidden and cell states are passed to the decoder.
encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# TrainingSampler feeds the provided (shifted) targets to the decoder at each
# step, i.e. teacher forcing.
sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(units)
# No activation here: the softmax is applied to the decoder output below.
output_layer = keras.layers.Dense(len(OUTPUT_CHARS) + 1)

decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                 sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state)
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                           outputs=[Y_proba])
optimizer = keras.optimizers.Nadam()
# The targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit([X_train, X_train_decoder], Y_train, epochs=25,
                    validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [66]:
# Show the first six uncorrected samples of the test range.
[df['wrong'][row] for row in range(80000, 80006)]

['printf("XpedDxYYU";',
 'printf("GlBs";',
 'print("ColNe");',
 'pintf("rQGIWCN");',
 'printf("diVPQmA";',
 'print("BgqhH");']

In [67]:
# Correct the same six samples with step-by-step decoding.
predict_date_strs([df['wrong'][row] for row in range(80000, 80006)])

['printf("XpedDxYYU"); ',
 'printf("GlBs");      ',
 'printf("ColNe");     ',
 'printf("rQGIWCN");   ',
 'printf("diVPQmA");   ',
 'printf("BgqhH");     ']

In [68]:
# Inference-time decoder: it reuses the decoder cell, output layer and
# embedding layer of the training graph, but the sampler embeds each step's
# predicted token and feeds it back as the next input.
inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(
    embedding_fn=decoder_embedding_layer)
# maximum_iterations caps the number of decoding steps at the target length.
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, inference_sampler, output_layer=output_layer,
    maximum_iterations=max_output_length)
# One-element shape vector holding the dynamic batch size, used to create one
# start-of-sequence token per example.
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill(dims=batch_size, value=sos_id)
# The padding ID 0 is passed as the end token.
final_outputs, final_state, final_sequence_lengths = inference_decoder(
    start_tokens,
    initial_state=encoder_state,
    start_tokens=start_tokens,
    end_token=0)

# Maps encoder inputs directly to the sampled token IDs.
inference_model = keras.models.Model(inputs=[encoder_inputs],outputs=[final_outputs.sample_id])

In [69]:
def fast_predict_date_strs(date_strs):
    """Correct ``date_strs`` with a single call to ``inference_model``.

    The strings are encoded and padded, ``inference_model`` returns the
    predicted token IDs, and those IDs are decoded back to text.
    """
    encoder_batch = prepare_date_strs_padded(date_strs)
    return ids_to_date_strs(inference_model.predict(encoder_batch))

In [70]:
# Show the first six uncorrected samples of the test range.
[df['wrong'][row] for row in range(80000, 80006)]

['printf("XpedDxYYU";',
 'printf("GlBs";',
 'print("ColNe");',
 'pintf("rQGIWCN");',
 'printf("diVPQmA";',
 'print("BgqhH");']

In [71]:
# Correct the same six samples in a single inference pass.
fast_predict_date_strs([df['wrong'][row] for row in range(80000, 80006)])

['printf("XpedDxYYU"); ',
 'printf("GlBs");      ',
 'printf("ColNe");     ',
 'printf("rQGIWCN");   ',
 'printf("diVPQmA");   ',
 'printf("BgqhH");     ']

# Fourth version: using TF-Addons's seq2seq implementation with a scheduled sampler


In [72]:
import tensorflow_addons as tfa

# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

n_epochs = 20
encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# NOTE(review): this input is not connected to the model built below.
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

encoder_embeddings = keras.layers.Embedding(
    len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)

# The decoder embeds *output* tokens, so its vocabulary is sized from
# OUTPUT_CHARS: the characters, the padding ID 0 and the start-of-sequence ID.
# The layer object is kept so it can be shared with the samplers.
decoder_embedding_layer = keras.layers.Embedding(
    len(OUTPUT_CHARS) + 2, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Only the encoder's final hidden and cell states are passed to the decoder.
encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# Scheduled sampling: with probability sampling_probability the decoder is fed
# its own previous prediction (embedded with embedding_fn) rather than the
# target token.
sampler = tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler(
    sampling_probability=0.,
    embedding_fn=decoder_embedding_layer)
# we must set the sampling_probability after creating the sampler
# (see https://github.com/tensorflow/addons/pull/1714)
sampler.sampling_probability = tf.Variable(0.)

decoder_cell = keras.layers.LSTMCell(units)
# No activation here: the softmax is applied to the decoder output below.
output_layer = keras.layers.Dense(len(OUTPUT_CHARS) + 1)

decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                 sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state)
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                           outputs=[Y_proba])
optimizer = keras.optimizers.Nadam()
# The targets are integer token IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])

def update_sampling_probability(epoch, logs):
    """Set the scheduled-sampling probability for the epoch about to start.

    The probability is ``epoch / (n_epochs - 10)``, capped at 1.0. ``logs``
    is part of the callback signature and is not used.
    """
    ramp_epochs = n_epochs - 10
    capped = min(1.0, epoch / ramp_epochs)
    sampler.sampling_probability.assign(capped)

# Update the sampler's probability at the start of every epoch.
sampling_probability_cb = keras.callbacks.LambdaCallback(
    on_epoch_begin=update_sampling_probability)
history = model.fit(
    [X_train, X_train_decoder],
    Y_train,
    epochs=n_epochs,
    validation_data=([X_valid, X_valid_decoder], Y_valid),
    callbacks=[sampling_probability_cb],
)

Epoch 1/20


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [73]:
# Held in a variable so the temperature can be changed between predictions
# without rebuilding the inference model.
softmax_temperature = tf.Variable(1.)

# Inference-time decoder: it reuses the decoder cell, output layer and
# embedding layer of the training graph. The sampler is given the temperature
# variable and the embedding layer used to embed its sampled tokens.
inference_sampler = tfa.seq2seq.sampler.SampleEmbeddingSampler(
    embedding_fn=decoder_embedding_layer,
    softmax_temperature=softmax_temperature)
# maximum_iterations caps the number of decoding steps at the target length.
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, inference_sampler, output_layer=output_layer,
    maximum_iterations=max_output_length)
# One-element shape vector holding the dynamic batch size, used to create one
# start-of-sequence token per example.
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill(dims=batch_size, value=sos_id)
# The padding ID 0 is passed as the end token.
final_outputs, final_state, final_sequence_lengths = inference_decoder(
    start_tokens,
    initial_state=encoder_state,
    start_tokens=start_tokens,
    end_token=0)

# Maps encoder inputs directly to the sampled token IDs.
inference_model = keras.models.Model(inputs=[encoder_inputs],
                                     outputs=[final_outputs.sample_id])

In [74]:
def creative_predict_date_strs(date_strs, temperature=1.0):
    """Correct ``date_strs`` with ``inference_model`` at a given temperature.

    ``temperature`` is written to ``softmax_temperature`` before predicting;
    the predicted token IDs are then decoded back to text.
    """
    # Set the temperature first: the prediction below depends on it.
    softmax_temperature.assign(temperature)
    encoder_batch = prepare_date_strs_padded(date_strs)
    return ids_to_date_strs(inference_model.predict(encoder_batch))

In [78]:
# Show the first six uncorrected samples of the test range.
[df['wrong'][row] for row in range(80000, 80006)]

['printf("XpedDxYYU";',
 'printf("GlBs";',
 'print("ColNe");',
 'pintf("rQGIWCN");',
 'printf("diVPQmA";',
 'print("BgqhH");']

In [77]:
# Re-seed TensorFlow so the sampled predictions are repeatable.
tf.random.set_seed(42)

creative_predict_date_strs(
    [df['wrong'][row] for row in range(80000, 80006)])

['printf("XpewKEUqv"); ',
 'printf("GlBs");      ',
 'printf("ColNe");     ',
 'printf("rQGIWwN");   ',
 'printf("diaPlmA");   ',
 'printf("BgshH");     ']