# Deep Learning CNN Model on Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')
path = 'drive/My Drive/Colab Notebooks/Université Jean Monet/Deep Learning II/Project/'

Mounted at /content/drive


## Models

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Concatenate, MaxPooling1D, Dense, Dropout, GlobalMaxPooling1D

class CNN_Basic:
    def __init__(self, tokenizer_en, tokenizer_fr, max_len, max_vocab_fr_len):
        self.tokenizer_en = tokenizer_en
        self.tokenizer_fr = tokenizer_fr
        self.max_len = max_len
        self.max_vocab_fr_len = max_vocab_fr_len

    def build_model(self):
        # Define Encoder
        model = Sequential()
        model.add(Embedding(input_dim=len(self.tokenizer_en.word_index) + 1, output_dim=64, input_length=self.max_len))
        model.add(Conv1D(256, kernel_size=8, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=1, strides=1))
        model.add(Conv1D(128, kernel_size=5, padding='same', activation='relu'))
        model.add(Dropout(0.5))
        model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=1, strides=1))
        model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(len(self.tokenizer_fr.word_index) + 1)),

        # Compile the model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

In [3]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, Concatenate, MaxPooling1D, Dense, Dropout, Flatten, Input, \
    RepeatVector, Reshape

class CNN_Auto_Basic:
    def __init__(self, tokenizer_en, tokenizer_fr, max_len, max_vocab_fr_len):
        self.tokenizer_en = tokenizer_en
        self.tokenizer_fr = tokenizer_fr
        self.max_len = max_len
        self.max_vocab_fr_len = max_vocab_fr_len

    def build_model(self):
        # Encoder
        encoder_inputs = Input(shape=(None,))
        enc_emb_layer = Embedding(input_dim=len(self.tokenizer_en.word_index) + 1, output_dim=32)
        enc_emb = enc_emb_layer(encoder_inputs)
        # Adding conv and pool layers in the encoder
        encoder_cnn1 = Conv1D(128, kernel_size=5, padding='same', activation='relu')(enc_emb)
        encoder_cnn2 = Conv1D(64, kernel_size=3, padding='same', activation='relu')(encoder_cnn1)
        encoder_cnn3 = Conv1D(32, kernel_size=3, padding='same', activation='relu')(encoder_cnn2)

        # Decoder
        decoder_inputs = Input(shape=(None,))
        dec_emb_layer = Embedding(input_dim=len(self.tokenizer_fr.word_index) + 1, output_dim=32)
        dec_emb = dec_emb_layer(decoder_inputs)

        # Adding conv and pool layers + encoder in the decoder
        decoder_cnn1 = Conv1D(128, kernel_size=5, padding='same', activation='relu')(dec_emb)
        #encoder_output_repeated = RepeatVector(32)(Flatten()(encoder_cnn3))
        #encoder_output_repeated = Reshape((32, 64))(encoder_output_repeated)
        #merged_input = Concatenate(axis=-1)([decoder_cnn1, encoder_output_repeated])
        decoder_cnn2 = Conv1D(64, kernel_size=3, padding='same', activation='relu')(decoder_cnn1)
        decoder_cnn3 = Conv1D(64, kernel_size=3, padding='same', activation='relu')(decoder_cnn2)
        decoder_dense = Dense(self.max_vocab_fr_len + 1, activation='softmax')
        decoder_outputs = decoder_dense(decoder_cnn3)
        # Add to a model
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        # Compile the model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model


In [4]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, Concatenate, MaxPooling1D, Dense, Dropout, Flatten, Input, \
    RepeatVector, Reshape


class CNN_Auto_Bigger:
    def __init__(self, tokenizer_en, tokenizer_fr, max_len, max_vocab_fr_len):
        self.tokenizer_en = tokenizer_en
        self.tokenizer_fr = tokenizer_fr
        self.max_len = max_len
        self.max_vocab_fr_len = max_vocab_fr_len

    def build_model(self):
        # Encoder
        encoder_inputs = Input(shape=(None,))
        enc_emb_layer = Embedding(input_dim=len(self.tokenizer_en.word_index) + 1, output_dim=32)
        enc_emb = enc_emb_layer(encoder_inputs)
        # Adding conv and pool layers in the encoder
        #encoder_cnn1 = Conv1D(16384, kernel_size=50, padding='same', activation='relu')(enc_emb)
        #encoder_cnn2 = Conv1D(8192, kernel_size=20, padding='same', activation='relu')(encoder_cnn1)
        #encoder_cnn3 = Conv1D(4096, kernel_size=10, padding='same', activation='relu')(encoder_cnn2)
        #encoder_cnn4 = Conv1D(2048, kernel_size=10, padding='same', activation='relu')(encoder_cnn3)
        encoder_cnn5 = Conv1D(1024, kernel_size=5, padding='same', activation='relu')(enc_emb)
        encoder_cnn6 = Conv1D(512, kernel_size=5, padding='same', activation='relu')(encoder_cnn5)
        encoder_cnn7 = Conv1D(256, kernel_size=5, padding='same', activation='relu')(encoder_cnn6)
        encoder_cnn8 = Conv1D(128, kernel_size=3, padding='same', activation='relu')(encoder_cnn7)
        encoder_cnn9 = Conv1D(64, kernel_size=3, padding='same', activation='relu')(encoder_cnn8)
        encoder_cnn10 = Conv1D(32, kernel_size=3, padding='same', activation='relu')(encoder_cnn9)

        # Decoder
        decoder_inputs = Input(shape=(None,))
        dec_emb_layer = Embedding(input_dim=len(self.tokenizer_fr.word_index) + 1, output_dim=32)
        dec_emb = dec_emb_layer(decoder_inputs)

        # Adding conv and pool layers + encoder in the decoder
        #decoder_cnn1 = Conv1D(16384, kernel_size=50, padding='same', activation='relu')(dec_emb)
        #encoder_output_repeated = RepeatVector(32)(Flatten()(encoder_cnn3))
        #encoder_output_repeated = Reshape((32, 64))(encoder_output_repeated)
        #merged_input = Concatenate(axis=-1)([decoder_cnn1, encoder_output_repeated])
        #decoder_cnn2 = Conv1D(8192, kernel_size=20, padding='same', activation='relu')(decoder_cnn1)
        #decoder_cnn3 = Conv1D(4096, kernel_size=10, padding='same', activation='relu')(decoder_cnn2)
        #decoder_cnn4 = Conv1D(2048, kernel_size=10, padding='same', activation='relu')(decoder_cnn3)
        decoder_cnn5 = Conv1D(1024, kernel_size=5, padding='same', activation='relu')(dec_emb)
        decoder_cnn6 = Conv1D(512, kernel_size=5, padding='same', activation='relu')(decoder_cnn5)
        decoder_cnn7 = Conv1D(256, kernel_size=5, padding='same', activation='relu')(decoder_cnn6)
        decoder_cnn8 = Conv1D(128, kernel_size=3, padding='same', activation='relu')(decoder_cnn7)
        decoder_cnn9 = Conv1D(64, kernel_size=3, padding='same', activation='relu')(decoder_cnn8)
        decoder_cnn10 = Conv1D(32, kernel_size=3, padding='same', activation='relu')(decoder_cnn9)
        decoder_dense = Dense(self.max_vocab_fr_len + 1, activation='softmax')
        decoder_outputs = decoder_dense(decoder_cnn10)
        # Add to a model
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        # Compile the model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model


## Main

In [12]:
# --- 1. We import the libraries we need ---
import numpy as np
import tensorflow as tf
import argparse
import pandas as pd
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, Callback


# --- 2. We define testing modules ---

def translate_sequence(seq, tokenizer):
    """ Translates a sequence of integers back into text using the tokenizer. """
    words = [tokenizer.index_word.get(idx, '') for idx in seq]
    return ' '.join(words).strip()

def predict_and_compare(index, testX, model, tokenizer_en, tokenizer_fr):
    """ Predicts translation for a given index in the test set and compares with the ground truth. """
    input_seq = testX[index:index+1]
    prediction = model.predict(input_seq)

    # Converting the prediction to a sequence of integers
    predicted_seq = np.argmax(prediction, axis=-1)[0]

    # Reverse tokenization (converting sequences back to words)
    input_text = translate_sequence(input_seq[0], tokenizer_en)
    predicted_text = translate_sequence(predicted_seq, tokenizer_fr)
    ground_truth_text = translate_sequence(testY[index].flatten(), tokenizer_fr)

    # Return results
    return input_text, predicted_text, ground_truth_text

def predict_and_compare_auto_en(index, testX, testY, model, tokenizer_en, tokenizer_fr):
    """ Predicts translation for a given index in the test set and compares with the ground truth. """
    input_seq_X = testX[index:index+1]
    input_seq_Y = testY[index:index+1]
    prediction = model.predict([input_seq_X, input_seq_Y])

    # Converting the prediction to a sequence of integers
    predicted_seq = np.argmax(prediction, axis=-1)[0]

    # Reverse tokenization (converting sequences back to words)
    input_text = translate_sequence(input_seq_X[0], tokenizer_en)
    predicted_text = translate_sequence(predicted_seq, tokenizer_fr)
    ground_truth_text = translate_sequence(testY[index].flatten(), tokenizer_fr)

    # Return results
    return input_text, predicted_text, ground_truth_text


class TimedCSVLogger(CSVLogger):
    def __init__(self, filename, separator=',', append=False):
        super().__init__(filename, separator, append)
        self.start_time = time.time()

    def on_epoch_begin(self, epoch, logs=None):
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        end_time = time.time()
        logs['epoch_duration'] = end_time - self.epoch_start_time
        logs['total_time'] = end_time - self.start_time
        super().on_epoch_end(epoch, logs)

# --- 3. We check the gpus available ---

if __name__ == '__main__':

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Set TensorFlow to use only one GPU
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

            # Enable memory growth
            tf.config.experimental.set_memory_growth(gpus[0], True)

            print("Using GPU:", gpus[0])
        except RuntimeError as e:
            # Memory growth must be set at program startup
            print("RuntimeError:", e)
    #else:
        #raise SystemError("GPU device not found")

    # --- 4. We define global variables ---

    EPOCHS = 100
    BATCH_SIZE = 32
    MAX_VOCAB_SIZE_FR = 20500

    # --- 3. We open the data and apply tokenization ---

    df = pd.read_csv(path + 'preprocessed_data.csv')

    tokenizer_en = Tokenizer()
    tokenizer_en.fit_on_texts(df['en_tokens'])
    tokenizer_fr = Tokenizer(num_words=MAX_VOCAB_SIZE_FR + 1)
    tokenizer_fr.fit_on_texts(df['fr_tokens'])

    # Convert text to sequences
    sequences_en = tokenizer_en.texts_to_sequences(df['en_tokens'])
    sequences_fr = tokenizer_fr.texts_to_sequences(df['fr_tokens'])

    # Padding sequences
    max_len = max(max(len(s) for s in sequences_en), max(len(s) for s in sequences_fr))
    sequences_en = pad_sequences(sequences_en, maxlen=max_len, padding='post')
    sequences_fr = pad_sequences(sequences_fr, maxlen=max_len, padding='post')

    # Splitting the data
    split = int(len(sequences_en) * 0.8)
    trainX, testX = sequences_en[:split], sequences_en[split:]
    trainY, testY = sequences_fr[:split], sequences_fr[split:]

    # Finally, reshape data for feeding into model (French words)
    trainY = trainY.reshape(trainY.shape[0], trainY.shape[1], 1)
    testY = testY.reshape(testY.shape[0], testY.shape[1], 1)

    # --- 4. We load the model ---
    method_name = ['CNN_Basic', 'CNN_Auto_Bigger', 'CNN_Auto_Basic' ]  # , 'CNN_ByteNet']
    method_instance = [CNN_Basic(tokenizer_en, tokenizer_fr, max_len, MAX_VOCAB_SIZE_FR),
                       CNN_Auto_Bigger(tokenizer_en, tokenizer_fr, max_len, MAX_VOCAB_SIZE_FR),
                       CNN_Auto_Basic(tokenizer_en, tokenizer_fr, max_len, MAX_VOCAB_SIZE_FR)]  # , CNN_ByteNet(MAX_VOCAB_SIZE_FR)]

    # Shared Callbacks
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, mode='max', verbose=1)

    for i in range(len(method_name)):
        print(method_name[i])
        current_model = method_instance[i].build_model()

        # --- 5. We train the model ---
        checkpoint = ModelCheckpoint(path + 'results/weights/weights_' + method_name[i] + '.best.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
        csv_logger = TimedCSVLogger(path + 'results/training_log/training_log_' + method_name[i] + '.csv', append=True)
        if method_name[i] == 'CNN_ByteNet':
            run_CNN_ByteNet()
        elif method_name[i] == 'CNN_Auto_Basic' or method_name[i] == 'CNN_Auto_Bigger':
            current_model.fit([trainX, np.squeeze(trainY, axis=-1)], trainY,
                              epochs=EPOCHS,
                              validation_split=0.2,
                              batch_size=BATCH_SIZE,
                              callbacks=[checkpoint, csv_logger, early_stopping])
        else:
            current_model.fit(trainX, trainY,
                              epochs=EPOCHS,
                              validation_data=(testX, testY),
                              batch_size=BATCH_SIZE,
                              callbacks=[checkpoint, csv_logger, early_stopping])


        # --- 6. We test the model (Change for more meaningful metrics like BLEU) ---

        all_predictions = []
        for j in range(5):
            if method_name[i] == 'CNN_Auto_Basic' or method_name[i] == 'CNN_Auto_Bigger':
                input_text, predicted_text, ground_truth_text = predict_and_compare_auto_en(index=j, testX=testX,
                                                                                            testY=np.squeeze(testY, axis=-1),
                                                                                            model=current_model,
                                                                                            tokenizer_en=tokenizer_en,
                                                                                            tokenizer_fr=tokenizer_fr)
            else:
                input_text, predicted_text, ground_truth_text = predict_and_compare(index=j, testX=testX,
                                                                                    model=current_model,
                                                                                    tokenizer_en=tokenizer_en,
                                                                                    tokenizer_fr=tokenizer_fr)
            all_predictions.append((input_text, predicted_text, ground_truth_text))

        # Writing predictions to a text file
        with open(path + 'results/predictions/model_predictions_' + method_name[i] + '.txt', 'w', encoding='utf-8') as file:
            for input_text, predicted_text, ground_truth in all_predictions:
                file.write("Input (English): " + input_text + "\n")
                file.write("Predicted (French): " + predicted_text + "\n")
                file.write("Ground Truth (French): " + ground_truth + "\n")
                file.write("----------\n")

def sample_top(a=[], top_k=10):
    idx = np.argsort(a)[::-1]
    idx = idx[:top_k]
    probs = a[idx]
    probs = probs / np.sum(probs)
    choice = np.random.choice(idx, p=probs)
    return choice


def run_CNN_ByteNet():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Learning Rate')
    parser.add_argument('--bucket_quant', type=int, default=50,
                        help='Bucket Quant')
    parser.add_argument('--beta1', type=float, default=0.5,
                        help='Momentum for Adam Update')
    parser.add_argument('--resume_model', type=str, default=None,
                        help='Pre-Trained Model Path, to resume from')
    parser.add_argument('--sample_every', type=int, default=500,
                        help='Sample generator output every x steps')
    parser.add_argument('--summary_every', type=int, default=50,
                        help='Sample generator output every x steps')
    parser.add_argument('--top_k', type=int, default=5,
                        help='Sample from top k predictions')
    args = parser.parse_args()

    translator_model = CNN_ByteNet(MAX_VOCAB_SIZE_FR)
    translator_model.build_options()

    optim = tf.keras.optimizers.Adam(args.learning_rate)

    translator_model.build_translator(reuse=True)
    translator_model.build_model(MAX_VOCAB_SIZE_FR)
    merged_summary = tf.compat.v1.summary.merge_all()

    sess = tf.compat.v1.InteractiveSession()
    tf.compat.v1.initialize_all_variables().run()
    saver = tf.compat.v1.train.Saver()

    if args.resume_model:
        saver.restore(sess, args.resume_model)

    step = 0
    for epoch in range(EPOCHS):
        batch_no = 0
        start = time.process_time()

        _, loss, prediction = sess.run(
            [optim, translator_model.loss, translator_model.arg_max_prediction],

            feed_dict={
                translator_model.source_sentence: trainX,
                translator_model.target_sentence: trainY,
            })
        end = time.process_time()

        print
        "LOSS: {}\tEPOCH: {}\tBATCH_NO: {}\t STEP:{}\t total_batches:{}\t bucket_size:{}".format(
            loss, epoch, batch_no, step)
        print
        "TIME FOR BATCH", end - start

        batch_no += 1
        step += 1
        if step % args.summary_every == 0:
            [summary] = sess.run([merged_summary], feed_dict={
                translator_model.source_sentence: trainX,
                translator_model.target_sentence: trainY,
            })
            print
            "******"
            print
            "Source ", trainX
            print
            "---------"
            print
            "Target ", trainY
            print
            "----------"
            print
            "Prediction ", prediction
            print
            "******"

        if step % args.sample_every == 0:
            log_file = open('translator_sample.txt', 'wb')
            generated_target = trainY[:, 0:1]
            for col in range(batch_no):
                [probs] = sess.run([translator_model.t_probs],
                                   feed_dict={
                                       translator_model.t_source_sentence: trainX,
                                       translator_model.t_target_sentence: generated_target,
                                   })

                curr_preds = []
                for bi in range(probs.shape[0]):
                    pred_word = sample_top(probs[bi][-1], top_k=args.top_k)
                    curr_preds.append(pred_word)

                generated_target = np.insert(generated_target, generated_target.shape[1],
                                             curr_preds, axis=1)


Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
CNN_Basic
Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.82222, saving model to drive/My Drive/Colab Notebooks/Université Jean Monet/Deep Learning II/Project/results/weights/weights_CNN_Basic.best.h5


  saving_api.save_model(


Epoch 2/100
Epoch 2: val_accuracy improved from 0.82222 to 0.82373, saving model to drive/My Drive/Colab Notebooks/Université Jean Monet/Deep Learning II/Project/results/weights/weights_CNN_Basic.best.h5
Epoch 3/100
Epoch 3: val_accuracy did not improve from 0.82373
Epoch 4/100
Epoch 4: val_accuracy improved from 0.82373 to 0.82381, saving model to drive/My Drive/Colab Notebooks/Université Jean Monet/Deep Learning II/Project/results/weights/weights_CNN_Basic.best.h5
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.82381
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.82381
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.82381
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.82381
Epoch 9/100
Epoch 9: val_accuracy did not improve from 0.82381
Epoch 9: early stopping
CNN_Auto_Bigger
Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.83113, saving model to drive/My Drive/Colab Notebooks/Université Jean Monet/Deep Learning II/Project/results/weights