In [1]:
import tensorflow as tf

In [4]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    "ingredients/train",
    batch_size = batch_size,
    validation_split = 0.2,
    subset = 'training',
    seed = seed
)

Found 309082 files belonging to 5 classes.
Using 247266 files for training.


I0000 00:00:1733969111.117286  113080 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6120 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [8]:
for i in range(5):
    print(f"Label {i} corresponds to {raw_train_ds.class_names[i]}")

Label 0 corresponds to breakfast
Label 1 corresponds to dessert
Label 2 corresponds to dinner
Label 3 corresponds to lunch
Label 4 corresponds to side


In [None]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'ingredients/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 309082 files belonging to 5 classes.
Using 61816 files for validation.


In [10]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'ingredients/test',
    batch_size=batch_size)

Found 77271 files belonging to 5 classes.


In [15]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length
)

In [16]:
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

2024-12-11 19:18:41.193497: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [17]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [20]:
# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[20], label_batch[20]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'spinach garlic vegetable_oil fresh_lemon_juice soy_sauce salt fresh_ground_black_pepper', shape=(), dtype=string)
Label dinner
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[249,  18,  23,  71,  39,   2,  48,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

In [21]:
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

Vocabulary size: 15892


In [22]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [23]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [81]:
from tensorflow.keras import layers

model = tf.keras.Sequential([
  layers.Embedding(max_features, 16),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dense(32, activation='relu'),
  layers.Dense(32, activation='relu'),
  layers.Dropout(0.3),
  layers.Dense(5, activation='softmax')])

model.summary()

In [82]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=.005),
    metrics=["accuracy"]
)

In [83]:
class_weight = \
{0: 0.0807499877055439,
 1: 0.2346662249290157,
 2: 0.4982671287656625,
 3: 0.10766837581175764,  
 4: 0.07864828278802029}

In [84]:
epochs = 15
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    class_weight = class_weight
)

Epoch 1/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.5784 - loss: 0.2154 - val_accuracy: 0.6191 - val_loss: 1.3434
Epoch 2/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.6562 - loss: 0.1712 - val_accuracy: 0.6268 - val_loss: 1.2771
Epoch 3/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.6624 - loss: 0.1677 - val_accuracy: 0.6660 - val_loss: 1.1895
Epoch 4/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.6648 - loss: 0.1649 - val_accuracy: 0.6561 - val_loss: 1.2561
Epoch 5/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 0.6674 - loss: 0.1614 - val_accuracy: 0.6763 - val_loss: 1.1271
Epoch 6/15
[1m7728/7728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - accuracy: 0.6696 - loss: 0.1587 - val_accuracy: 0.6772 - val_loss: 1.0430
Epoch 7/15

In [85]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m2415/2415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6768 - loss: 1.0655
Loss:  1.0661582946777344
Accuracy:  0.6779775023460388


In [55]:
predictions = model.predict(test_ds)

[1m2415/2415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


In [66]:
import pandas as pd

df = pd.DataFrame(predictions)
df.columns = raw_train_ds.class_names
df["top"] = df.idxmax(axis=1)
df["top"].to_csv("ANN_predictions.csv", header=False)