<a href="https://colab.research.google.com/github/shenzhun/machine-learning-prep/blob/master/tensorflow/day1_tree_ways_of_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras import *

In [5]:
train_token_path = "../data/imdb/train_token.csv"
test_token_path = "../data/imdb/test_token.csv"

MAX_WORDS = 10000  # We will only consider the top 10,000 words in the dataset
MAX_LEN = 200  # We will cut reviews after 200 words
BATCH_SIZE = 20 

# Constructing data pipeline
def parse_line(line):
    t = tf.strings.split(line,"\t")
    label = tf.reshape(tf.cast(tf.strings.to_number(t[0]),tf.int32),(-1,))
    features = tf.cast(tf.strings.to_number(tf.strings.split(t[1]," ")),tf.int32)
    return (features,label)

ds_train=  tf.data.TextLineDataset(filenames = [train_token_path]) \
   .map(parse_line,num_parallel_calls = tf.data.experimental.AUTOTUNE) \
   .shuffle(buffer_size = 1000).batch(BATCH_SIZE) \
   .prefetch(tf.data.experimental.AUTOTUNE)

ds_test=  tf.data.TextLineDataset(filenames = [test_token_path]) \
   .map(parse_line,num_parallel_calls = tf.data.experimental.AUTOTUNE) \
   .shuffle(buffer_size = 1000).batch(BATCH_SIZE) \
   .prefetch(tf.data.experimental.AUTOTUNE)

Modeling Using Sequential

In [8]:
tf.keras.backend.clear_session()

model = models.Sequential()

model.add(layers.Embedding(MAX_WORDS, 7, input_length=MAX_WORDS))
model.add(layers.Conv1D(filters=64, kernel_size=5, activation="relu"))
model.add(layers.MaxPool1D(2))
model.add(layers.Conv1D(filters=32, kernel_size=3, activation="relu"))
model.add(layers.MaxPool1D(2))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(optimizer='Nadam',
              loss='binary_crossentropy',
              metrics=['accuracy', 'AUC'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10000, 7)          70000     
_________________________________________________________________
conv1d (Conv1D)              (None, 9996, 64)          2304      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 4998, 64)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4996, 32)          6176      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2498, 32)          0         
_________________________________________________________________
flatten (Flatten)            (None, 79936)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 7

In [13]:
import datetime
baselogger = callbacks.BaseLogger(stateful_metrics=['AUC'])
logdir = '../data/keras_model' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
history = model.fit(ds_train, validation_data=ds_test,
                    epochs=6, callbacks=[baselogger, tensorboard_callback])

Epoch 1/6


NotFoundError: ignored

In [16]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import matplotlib.pyplot as plt

def plot_metric(history, metric):
  train_metrics = history.history[metric]
  val_metrics = history.history['val_'+metric]

  epochs = range(1, len(train_metrics)+1)
  plt.plot(epochs, train_metrics, 'bo--')
  plt.plot(epochs, val_metrics, 'ro--')

  plt.title('Training and validation '+metric)
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(["train_"+metric, "val_"+metric])
  plt.show()


In [17]:
plot_metric(history, "AUC")

NameError: ignored