# Modality 1: Image Classification (Dogs vs. Cats)

In [1]:
# Step 1: Feature Extraction (EfficientNetB0, frozen)

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
import tensorflow_datasets as tfds

# Load dataset (you can replace with your own)
(train_ds, val_ds), ds_info = tfds.load("cats_vs_dogs", split=["train[:80%]", "train[80%:]"],
                                        as_supervised=True, with_info=True)

# Preprocessing
IMG_SIZE = 224
def preprocess(img, label):
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    return img / 255.0, label

train_ds = train_ds.map(preprocess).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.map(preprocess).batch(32).prefetch(tf.data.AUTOTUNE)

# Base model
base_model = EfficientNetB0(include_top=False, weights='imagenet', input_shape=(IMG_SIZE, IMG_SIZE, 3))
base_model.trainable = False  # feature extractor

inputs = tf.keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = base_model(inputs, training=False)
x = GlobalAveragePooling2D()(x)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs, outputs)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_ds, epochs=3, validation_data=val_ds)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/cats_vs_dogs/4.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]



Shuffling /root/tensorflow_datasets/cats_vs_dogs/incomplete.DITHWZ_4.0.1/cats_vs_dogs-train.tfrecord*...:   0%…

Dataset cats_vs_dogs downloaded and prepared to /root/tensorflow_datasets/cats_vs_dogs/4.0.1. Subsequent calls will reuse this data.
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Epoch 1/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 104ms/step - accuracy: 0.5056 - loss: 0.6975 - val_accuracy: 0.5099 - val_loss: 0.6932
Epoch 2/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 61ms/step - accuracy: 0.5044 - loss: 0.6968 - val_accuracy: 0.5099 - val_loss: 0.6927
Epoch 3/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 62ms/step - accuracy: 0.5100 - loss: 0.6962 - val_accuracy: 0.5099 - val_loss: 0.6923


<keras.src.callbacks.history.History at 0x7bcf64b6c590>

In [2]:
# Step 2: Fine-Tuning (unfreeze top layers)

base_model.trainable = True

# Fine-tune from layer 100 onward
for layer in base_model.layers[:100]:
    layer.trainable = False

model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),  # lower LR
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds, epochs=3, validation_data=val_ds)


Epoch 1/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 140ms/step - accuracy: 0.5808 - loss: 0.6709 - val_accuracy: 0.6563 - val_loss: 0.6211
Epoch 2/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 82ms/step - accuracy: 0.6587 - loss: 0.6132 - val_accuracy: 0.7124 - val_loss: 0.5622
Epoch 3/3
[1m582/582[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 82ms/step - accuracy: 0.6865 - loss: 0.5763 - val_accuracy: 0.7298 - val_loss: 0.5327


<keras.src.callbacks.history.History at 0x7bcf64fc3b90>

# Modality 2: Audio (Using YAMNet)

YAMNet is a pretrained audio classifier from TF Hub.

In [3]:
!pip install tensorflow-io


Collecting tensorflow-io
  Downloading tensorflow_io-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading tensorflow_io-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-io
Successfully installed tensorflow-io-0.37.1


In [4]:
import tensorflow as tf

# Download a sample audio file
audio_path = tf.keras.utils.get_file(
    'miaow_16k.wav',
    'https://storage.googleapis.com/audioset/miaow_16k.wav'
)


Downloading data from https://storage.googleapis.com/audioset/miaow_16k.wav
[1m215546/215546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3us/step


In [5]:
import tensorflow_hub as hub
import numpy as np

# Download and load audio
audio_path = tf.keras.utils.get_file('miaow_16k.wav', 'https://storage.googleapis.com/audioset/miaow_16k.wav')
audio_binary = tf.io.read_file(audio_path)
wav, sr = tf.audio.decode_wav(audio_binary)
wav = tf.squeeze(wav, axis=-1)
wav = tf.cast(wav, tf.float32)

# Load YAMNet and extract features
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
scores, embeddings, _ = yamnet_model(wav)

In [6]:
# Simulate dataset with 2 classes
X_audio = embeddings.numpy()
num_samples = len(X_audio)
y_audio = np.array([0] * (num_samples // 2) + [1] * (num_samples - num_samples // 2))

# Train classifier on top of embeddings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_audio, y_audio, test_size=0.3, random_state=42)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Audio classifier accuracy:", accuracy_score(y_test, y_pred))

Audio classifier accuracy: 0.25


# Modality 3: Video (Action Recognition with I3D)

In [7]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np

# Load pretrained I3D model
i3d_model = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-400/1").signatures["default"]

# Simulated multiple dummy videos
X_video = []
y_video = []
for i in range(20):
    video = tf.random.uniform((1, 64, 224, 224, 3))
    logits = i3d_model(video)["default"]
    features = tf.squeeze(logits).numpy().reshape(-1)
    X_video.append(features)
    y_video.append(i % 2)  # alternate class 0 and 1

X_video = np.array(X_video)
y_video = np.array(y_video)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X_video, y_video, test_size=0.3, random_state=42)
clf_video = LogisticRegression().fit(X_train, y_train)
y_pred_video = clf_video.predict(X_test)
print("\nVideo classifier report:\n", classification_report(y_test, y_pred_video))


Video classifier report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         4

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



# Modality 4: NLP (Text Classification with BERT)

In [8]:
import tensorflow_hub as hub
import tensorflow_text as text

# Build model using Sequential API to avoid symbolic graph issues
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/google/experts/bert/wiki_books/sst2/2")

# Preprocess and encode manually (outside symbolic graph)
texts = ["I love this movie", "Worst film ever.", "An okay experience",
          "Brilliant plot and cast.", "Terrible direction", "Absolutely amazing"]
labels = [1, 0, 1, 1, 0, 1]

preprocessed = bert_preprocess(texts)
encoded = bert_encoder(preprocessed)['pooled_output'].numpy()

# Train classifier on top
from sklearn.linear_model import LogisticRegression
clf_text = LogisticRegression().fit(encoded, labels)
pred_labels = clf_text.predict(encoded)
print("Text classifier predictions:", pred_labels)

Text classifier predictions: [1 0 1 1 0 1]


In [9]:
import tensorflow_datasets as tfds
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load IMDb reviews dataset
imdb_data = tfds.load('imdb_reviews', split='train[:5000]', as_supervised=True)
texts, labels = [], []
for text, label in tfds.as_numpy(imdb_data):
    texts.append(text.decode('utf-8'))
    labels.append(label)

# Load preprocessing and encoder from TF Hub
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/google/experts/bert/wiki_books/sst2/2")

# Preprocess and encode in batches
batch_size = 64
encoded_batches = []
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_inputs = bert_preprocess(batch_texts)
    batch_output = bert_encoder(batch_inputs)['pooled_output']
    encoded_batches.append(batch_output.numpy())

X_text = np.concatenate(encoded_batches, axis=0)
y_text = np.array(labels)

# Train/test split and classification
X_train, X_test, y_train, y_test = train_test_split(X_text, y_text, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("IMDb Text classifier accuracy:", accuracy_score(y_test, y_pred))



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.JJ6TNM_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.JJ6TNM_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.JJ6TNM_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
IMDb Text classifier accuracy: 0.84
