In [None]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

In [None]:
# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

In [None]:
# Download the Stack Overflow archive and unpack it next to the notebook
# (cache_dir='.' with an empty cache_subdir).
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"
dataset = tf.keras.utils.get_file(
    fname='stack_overflow_16k',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

In [None]:
# Relative path of the training split directory.
train_dir = 'train'

In [None]:
# Display the entries found directly under the training directory.
os.listdir(train_dir)

In [None]:
# Batch size used for every dataset built from the text directories.
batch_size = 32
# Seed shared by the train/validation split so both subsets are consistent.
seed = 42
# Also seed TensorFlow's global generator so weight initialisation and dropout
# are repeatable across "Restart & Run All" executions.
tf.random.set_seed(seed)

In [None]:
# Training subset: 80% of the files under `train_dir`. The same seed and
# validation_split are used for the validation subset so the two do not overlap.
# `train_dir` is used instead of repeating the path literal, so changing the
# directory in one place is enough.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)

In [None]:
# Print the label -> class-name mapping for every class, not just the first
# two: the classifier built in this notebook has 4 output units.
for label_index, class_name in enumerate(raw_train_ds.class_names):
  print("Label", label_index, "corresponds to", class_name)

In [None]:
# Validation subset: the remaining 20% of the files under `train_dir`.
# Seed and validation_split must match the training subset's values.
# `train_dir` is used instead of repeating the path literal.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

In [None]:
# Held-out test split, read from the 'test' directory with the shared batch size.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(directory='test', batch_size=batch_size)

In [None]:
def custom_standardization(input_data):
  """Normalise raw text before tokenisation.

  Lowercases the input, replaces every literal '<br />' with a space and then
  removes all characters listed in `string.punctuation`.

  Args:
    input_data: string tensor passed in by the TextVectorization layer.

  Returns:
    The cleaned string tensor.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [None]:
max_features = 10000  # vocabulary size
sequence_length = 250  # every example is padded/truncated to this many tokens

# Maps raw strings to fixed-length integer token sequences, using the custom
# standardisation function defined in this notebook.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [None]:
# Build the vocabulary from the training text only: drop the labels, then adapt.
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  """Convert text to integer token ids, passing the label through unchanged.

  Args:
    text: string tensor (a single example or a batch of examples).
    label: the label(s) belonging to `text`.

  Returns:
    A `(token_ids, label)` tuple.
  """
  # Add a trailing axis before handing the strings to the vectorizer.
  text_column = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(text_column)
  return token_ids, label

In [None]:
# Pull one batch from the training set and look at its first example:
# raw text, class name, and the vectorised form.
text_batch, label_batch = next(iter(raw_train_ds))
first_review = text_batch[0]
first_label = label_batch[0]
print('review:', first_review)
print('label:', raw_train_ds.class_names[first_label])
print('vectorized_review:', vectorize_text(first_review, first_label))

In [None]:
# Fetch the vocabulary once and reuse it for all three lookups.
vocabulary = vectorize_layer.get_vocabulary()
print('1287 -->', vocabulary[1287])
print('0 -->', vocabulary[0])
print('vocab size:', len(vocabulary))

In [None]:
# Apply the text vectorisation to each raw split.
train_ds, val_ds, test_ds = [
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
]

In [None]:
# Cache each split after its first pass and prefetch batches, letting tf.data
# choose the buffer size.
AUTOTUNE = tf.data.AUTOTUNE
train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

In [None]:
# Length of the vector learned for each token by the Embedding layer.
embedding_dim = 16

In [None]:
# Averaged-embedding classifier: embed tokens, average over the sequence axis,
# then project to 4 raw scores. The last layer has no activation, so the
# outputs are logits.
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))  # (batch, sequence, embedding)
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(4))

In [None]:
# Print the layer-by-layer summary of the classifier.
model.summary()

In [None]:
losses = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

In [None]:
model.compile(loss=losses, optimizer='adam', metrics=metrics)

In [None]:
# Number of full passes over the training set.
epochs = 10
# Train while tracking validation metrics; the returned History object is
# kept for inspecting the per-epoch curves.
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Evaluate on the test split; the model was compiled with a single
# 'accuracy' metric, so the result unpacks into (loss, accuracy).
loss, acc = model.evaluate(test_ds)

In [None]:
# Report the test-set loss and accuracy.
print(f'loss: {loss}, accuracy: {acc}')

In [None]:
# Per-epoch metric values recorded during training; display the available keys.
history_dict = history.history
history_dict.keys()

In [None]:
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

In [None]:
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

In [None]:
# End-to-end model that accepts raw strings: vectorise, classify, then turn
# the 4 logits into class probabilities.
# The classifier is trained with SparseCategoricalCrossentropy (exactly one
# class per example), so softmax is the matching activation; per-unit sigmoid
# would not produce a distribution that sums to 1.
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('softmax')
])

In [None]:
# export_model ends with an activation layer, so its outputs are no longer
# raw logits; from_logits must be False or the reported loss is computed on
# doubly-squashed values.
export_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Evaluate the end-to-end model directly on the raw (un-vectorised) test
# strings and show its accuracy.
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)

In [None]:
# A few raw strings to run through the end-to-end model.
examples = [
  "The is csharp!",
  "The is java.",
  "The is not python"
]

# Wrap the Python list in a string tensor: newer Keras versions reject a
# plain list of str as `predict` input, while a tf.Tensor works everywhere.
export_model.predict(tf.constant(examples))