# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import json
import random
from PIL import Image
import warnings
warnings.filterwarnings("ignore")

# --- Data locations (Kaggle COCO image-caption dataset layout) ---
trainval_image_dir = os.path.join(
    '/kaggle/input/coco-image-caption', 'train2014', 'train2014'
)
trainval_captions_dir = os.path.join(
    '/kaggle/input/coco-image-caption', 'annotations_trainval2014', 'annotations'
)
test_image_dir = os.path.join(
    '/kaggle/input/coco-image-caption', 'val2017', 'val2017'
)
test_captions_dir = os.path.join(
    '/kaggle/input/coco-image-caption', 'annotations_trainval2017', 'annotations'
)

trainval_captions_filepath = os.path.join(trainval_captions_dir, 'captions_train2014.json')
test_captions_filepath = os.path.join(test_captions_dir, 'captions_val2017.json')

# --- Train / validation split ---
# 80% of the train2014 images become the training set, the remaining 20% the
# validation set. The shuffle is unseeded, so the split differs between runs.
all_filepaths = np.array(
    [os.path.join(trainval_image_dir, f) for f in os.listdir(trainval_image_dir)]
)
rand_indices = np.arange(len(all_filepaths))
np.random.shuffle(rand_indices)

split = int(len(all_filepaths)*0.8)

train_filepaths = all_filepaths[rand_indices[:split]]
valid_filepaths = all_filepaths[rand_indices[split:]]

for subset_name, subset_paths in (("Train", train_filepaths), ("Valid", valid_filepaths)):
    print(f"{subset_name} dataset size: {len(subset_paths)}")

# --- Caption annotations for train2014 ---
# One row per caption; the image path is rebuilt from the numeric image id,
# zero-padded to 12 digits.
with open(trainval_captions_filepath, 'r') as f:
    trainval_data = json.load(f)

trainval_captions_df = pd.json_normalize(trainval_data, "annotations")
trainval_captions_df["image_filepath"] = [
    os.path.join(trainval_image_dir, f'COCO_train2014_{image_id:012d}.jpg')
    for image_id in trainval_captions_df["image_id"]
]

def preprocess_captions(image_captions_df):
    """Add a ``preprocessed_caption`` column to a captions DataFrame.

    Each caption is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and wrapped in ``[START]`` / ``[END]``
    markers. The frame is modified in place and also returned.

    Args:
        image_captions_df: DataFrame with a string ``caption`` column.

    Returns:
        The same DataFrame object, with ``preprocessed_caption`` added.
    """
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    cleaned = image_captions_df["caption"].str.lower().str.replace(
        r'[^\w\s]', '', regex=True
    )
    image_captions_df["preprocessed_caption"] = "[START] " + cleaned + " [END]"
    return image_captions_df

# Route every caption to the split its image was assigned to, then add the
# preprocessed caption column to each resulting frame.
in_train_split = trainval_captions_df["image_filepath"].isin(train_filepaths)
train_captions_df = preprocess_captions(trainval_captions_df[in_train_split])

in_valid_split = trainval_captions_df["image_filepath"].isin(valid_filepaths)
valid_captions_df = preprocess_captions(trainval_captions_df[in_valid_split])

# The test set comes from the val2017 annotations; its image files are named
# by the bare 12-digit, zero-padded image id.
with open(test_captions_filepath, 'r') as f:
    test_data = json.load(f)

test_captions_df = pd.json_normalize(test_data, "annotations")
test_captions_df["image_filepath"] = test_captions_df["image_id"].map(
    lambda image_id: os.path.join(test_image_dir, f'{image_id:012d}.jpg')
)
test_captions_df = preprocess_captions(test_captions_df)

train_captions_df.head()

# Understanding Data

# First five validation images, each with the list of all its raw captions
sample_data = valid_captions_df.groupby("image_filepath")["caption"].agg(list).iloc[:5]

fig, axes = plt.subplots(5, 2, figsize=(8,18))

# Left column: the image; right column: its captions stacked top to bottom
for ax_row, index, sample in zip(axes, sample_data.index, sample_data):

    ax_row[0].imshow(Image.open(index))
    ax_row[0].axis("off")
    text_y = 0.9
    for cap in sample:
        ax_row[1].text(0, text_y, cap, fontsize=14)
        text_y -= 0.2
    ax_row[1].axis("off")

n_samples = 1000


def _read_image_size(path):
    """Return the PIL ``size`` tuple of the image at *path*, closing the file."""
    with Image.open(path) as image:
        return image.size


# Positional slicing (.iloc) is required here: the frame is a filtered subset
# whose index labels are sparse, so a label slice (.loc[:n_samples]) would
# return only the rows whose label happens to be <= n_samples.
# NOTE(review): the variable is named "train_..." but the statistics are drawn
# from the validation frame, as in the original code - confirm which is intended.
train_image_stats_df = valid_captions_df["image_filepath"].iloc[:n_samples].apply(_read_image_size)
train_image_stats_df = pd.DataFrame(train_image_stats_df.tolist(), index=train_image_stats_df.index)
train_image_stats_df.describe()

# Number of whitespace-separated tokens that occur at least 25 times in the
# preprocessed training captions
train_vocabulary = train_captions_df["preprocessed_caption"].str.split(" ").explode().value_counts()
print(len(train_vocabulary[train_vocabulary>=25]))

# Understanding the Bert Tokenizer

from tokenizers import BertWordPieceTokenizer

# Untrained WordPiece tokenizer; text cleaning and lower-casing are switched
# off because the captions were already normalised by preprocess_captions.
tokenizer = BertWordPieceTokenizer(
    unk_token="[UNK]", clean_text=False, lowercase=False
)

# Learn a 4000-entry vocabulary from the training captions. The special
# tokens are registered so they are kept as single vocabulary entries.
tokenizer.train_from_iterator(
    train_captions_df["preprocessed_caption"].tolist(),
    vocab_size=4000,
    special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"],
)

# Show how a handful of validation captions are split into tokens
example_captions = valid_captions_df["preprocessed_caption"].iloc[:10].tolist()
example_tokenized_captions = tokenizer.encode_batch(example_captions)

for caption_idx, caption in enumerate(example_captions):
    print(f"{caption} -> {example_tokenized_captions[caption_idx].tokens}")

# Print the IDs assigned to the special tokens
vocab = tokenizer.get_vocab()

for token in ("[UNK]", "[PAD]", "[START]", "[END]"):
    print(f"{token} -> {vocab[token]}")

# Defining the `tf.data.Dataset` for image captioning

def parse_image(filepath, resize_height, resize_width):
    """Load a JPEG file as a resized 3-channel float32 tensor.

    The decoded image is converted to float32 with ``convert_image_dtype``,
    resized to ``(resize_height, resize_width)``, and finally mapped through
    ``v*2.0 - 1.0``.

    Args:
        filepath: Path of the JPEG file to read.
        resize_height: Target height in pixels.
        resize_width: Target width in pixels.

    Returns:
        The resized, rescaled image tensor.
    """
    raw_bytes = tf.io.read_file(filepath)
    pixels = tf.image.convert_image_dtype(
        tf.io.decode_jpeg(raw_bytes, channels=3), tf.float32
    )
    resized = tf.image.resize(pixels, [resize_height, resize_width])
    return resized*2.0 - 1.0

def generate_tokenizer(captions_df, n_vocab):
    """Train a new WordPiece tokenizer on a frame of preprocessed captions.

    Args:
        captions_df: DataFrame with a ``preprocessed_caption`` column.
        n_vocab: Vocabulary size passed to the tokenizer trainer.

    Returns:
        The trained ``BertWordPieceTokenizer``.
    """
    training_corpus = captions_df["preprocessed_caption"].tolist()

    # Cleaning and lower-casing are disabled; the captions are used as given.
    wordpiece = BertWordPieceTokenizer(
        unk_token="[UNK]", clean_text=False, lowercase=False
    )
    wordpiece.train_from_iterator(
        training_corpus,
        vocab_size=n_vocab,
        special_tokens=["[PAD]", "[UNK]", "[START]", "[END]"],
    )
    return wordpiece

def generate_tf_dataset(image_captions_df, tokenizer=None, n_vocab=5000, pad_length=33, batch_size=32, training=False):
    """Build a batched ``tf.data.Dataset`` of (inputs, targets) for captioning.

    Every caption is tokenized and brought to exactly ``pad_length + 2`` token
    IDs: shorter sequences are right-padded with the ``[PAD]`` ID, longer ones
    are cut to ``pad_length + 1`` IDs and their original last ID is appended.

    Each dataset element is
    ``((image, token_ids[:-1], positions), token_ids)`` where ``image`` comes
    from ``parse_image(path, 224, 224)`` and ``positions`` is
    ``tf.range(pad_length + 1)`` as float32.

    Unlike earlier revisions, the caller's DataFrame is not modified.

    Args:
        image_captions_df: DataFrame with ``image_filepath`` and
            ``preprocessed_caption`` columns.
        tokenizer: Trained tokenizer; when falsy, one is trained on
            ``image_captions_df`` with ``n_vocab`` entries.
        n_vocab: Vocabulary size used only when a tokenizer must be trained.
        pad_length: Controls the fixed sequence length (see above).
        batch_size: Number of elements per batch.
        training: When true, elements are shuffled before batching.

    Returns:
        Tuple ``(dataset, tokenizer)``.
    """
    # If the tokenizer is not available, create one
    if not tokenizer:
        tokenizer = generate_tokenizer(image_captions_df, n_vocab)

    pad_id = tokenizer.get_vocab()["[PAD]"]

    def _pad_or_truncate(ids):
        # Pad short sequences; truncate long ones but keep their final ID
        if pad_length + 2 >= len(ids):
            return ids + [pad_id]*(pad_length - len(ids) + 2)
        return ids[:pad_length + 1] + [ids[-1]]

    # Tokenize from a plain list (as the other call sites in this file do) and
    # keep the IDs in a local list rather than writing a new column into the
    # caller's frame, which is frequently a filtered slice.
    encodings = tokenizer.encode_batch(image_captions_df["preprocessed_caption"].tolist())
    caption_token_ids = [_pad_or_truncate(enc.ids) for enc in encodings]

    # Create a dataset with images and captions
    dataset = tf.data.Dataset.from_tensor_slices({
        "image_filepath": image_captions_df["image_filepath"],
        "caption_token_ids": np.array(caption_token_ids)
    })

    # Each sample in our dataset consists of
    # (image, caption token IDs, position IDs), (caption token IDs offset by 1)
    dataset = dataset.map(
        lambda x: (
            (parse_image(x["image_filepath"], 224, 224), x["caption_token_ids"][:-1], tf.range(pad_length+1, dtype='float32')), x["caption_token_ids"]
        )
    )

    # Shuffle only in training mode; batching always happens
    if training:
        dataset = dataset.shuffle(buffer_size=batch_size*10)

    dataset = dataset.batch(batch_size)

    return dataset, tokenizer

# Build a tiny dataset (two captions per batch, short padding) and print the
# first batch to inspect the structure of one element.
n_vocab = 4000
batch_size = 2
sample_dataset, sample_tokenizer = generate_tf_dataset(
    train_captions_df,
    n_vocab=n_vocab,
    pad_length=10,
    batch_size=batch_size,
    training=True,
)
for sample_batch in sample_dataset.take(1):
    print(sample_batch)

# Defining the model
import tensorflow_hub as hub
import tensorflow.keras.backend as K

# Reset Keras' global state before any layer of the model is created
K.clear_session()

# Image branch: 224x224 RGB input fed through a TF-Hub layer whose weights are
# frozen (trainable=False).
# NOTE(review): the hub handle suggests a ViT-S/16 feature extractor - confirm.
image_input = tf.keras.layers.Input(shape=(224, 224, 3))
image_encoder = hub.KerasLayer("https://tfhub.dev/sayakpaul/vit_s16_fe/1", trainable=False)
image_features = image_encoder(image_input)
print(f"Final representation shape: {image_features.shape}")

## The Text Decoder Transformer
## Here we define the text decoder. It takes the final image representation from the ViT encoder and concatenates it with the caption token embeddings; the decoder then predicts the caption token ID for the next time step.


class SelfAttentionLayer(tf.keras.layers.Layer):
    """One causal attention head with learned query/key/value projections.

    The inputs are projected to ``d`` features by three separate weight
    matrices and passed through a Keras ``Attention`` layer configured with
    ``causal=True``.
    """

    def __init__(self, d):
        """Create the head.

        Args:
            d: Feature dimensionality of the head's output.
        """
        super(SelfAttentionLayer, self).__init__()
        # Feature dimensionality of the output
        self.d = d
        # Built once here rather than on every forward pass; the layer holds
        # no weights, so the computation is unchanged.
        self.attn_layer = tf.keras.layers.Attention(causal=True)

    def build(self, input_shape):
        """Create the projection matrices, sized from the last input axis."""
        # Query weight matrix
        self.Wq = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        # Key weight matrix
        self.Wk = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )
        # Value weight matrix
        self.Wv = self.add_weight(
            shape=(input_shape[-1], self.d), initializer='glorot_uniform',
            trainable=True, dtype='float32'
        )

    def call(self, q_x, k_x, v_x, mask=None):
        """Project the inputs and apply causal attention.

        Args:
            q_x: Tensor projected to queries.
            k_x: Tensor projected to keys.
            v_x: Tensor projected to values.
            mask: Optional mask, passed to the attention layer as the second
                entry of its mask list (the first entry is ``None``).

        Returns:
            The attention output.
        """
        q = tf.matmul(q_x,self.Wq) #[None, t, d]
        k = tf.matmul(k_x,self.Wk) #[None, t, d]
        v = tf.matmul(v_x,self.Wv) #[None, t, d]

        # Keras Attention expects its inputs ordered [query, value, key]
        h = self.attn_layer([
            q, #q
            v, #v
            k, #k
        ], mask=[None, mask]) # [None, t, t] . [None, t, d] => [None, t, d]

        return h


class TransformerDecoderLayer(tf.keras.layers.Layer):
    """Decoder block: multi-head causal self-attention plus a feed-forward net.

    Both sub-blocks are followed by a residual addition and layer
    normalization. The attention heads' outputs are concatenated along the
    feature axis, so ``d`` must be divisible by ``n_heads`` for the residual
    additions to line up.
    """

    def __init__(self, d, n_heads):
        """Create the sub-layers.

        Args:
            d: Feature dimensionality of the block's input and output.
            n_heads: Number of attention heads.

        Raises:
            ValueError: If ``d`` is not divisible by ``n_heads``.
        """
        super(TransformerDecoderLayer, self).__init__()
        if d % n_heads != 0:
            # Otherwise the concatenated heads would not have d features and
            # the residual Add would fail later with an opaque shape error.
            raise ValueError(
                f"d ({d}) must be divisible by n_heads ({n_heads})"
            )

        # Feature dimensionality
        self.d = d

        # Dimensionality of a head
        self.d_head = d // n_heads

        # Number of heads
        self.n_heads = n_heads

        # Actual attention heads
        self.attn_heads = [SelfAttentionLayer(self.d_head) for _ in range(self.n_heads)]

        # Fully connected layers
        self.fc1_layer = tf.keras.layers.Dense(512, activation='relu')
        self.fc2_layer = tf.keras.layers.Dense(d)

        self.add_layer = tf.keras.layers.Add()
        self.norm1_layer = tf.keras.layers.LayerNormalization()
        self.norm2_layer = tf.keras.layers.LayerNormalization()

    def _compute_multihead_output(self, x):
        """Run every head on ``x`` and concatenate along the last axis."""
        outputs = [head(x, x, x) for head in self.attn_heads]
        outputs = tf.concat(outputs, axis=-1)
        return outputs

    def call(self, x):
        """Apply the decoder block to ``x`` and return the normalized output."""
        # Multi head attention layer output
        h1 = self._compute_multihead_output(x)

        h1_add = self.add_layer([x, h1])
        h1_norm = self.norm1_layer(h1_add)

        # Fully connected outputs
        h2_1 = self.fc1_layer(h1_norm)
        h2_2 = self.fc2_layer(h2_1)

        # The residual branch must carry the feed-forward block's own input
        # (h1_norm); adding the raw attention output h1 would bypass the first
        # residual connection and normalization entirely.
        h2_add = self.add_layer([h1_norm, h2_2])
        h2_norm = self.norm2_layer(h2_add)

        return h2_norm


# Input layer
caption_input = tf.keras.layers.Input(shape=(None,))
position_input = tf.keras.layers.Input(shape=(None,))
d_model = 384

# Token embeddings
input_embedding = tf.keras.layers.Embedding(len(tokenizer.get_vocab()), d_model, mask_zero=True)


def _sinusoidal_position_encoding(positions):
    """Map position IDs [batch, t] to sinusoidal encodings [batch, t, d_model].

    Even feature indices receive the sine and odd feature indices the cosine
    of ``position / 10000**(2k / d_model)``, where ``k = feature_index // 2``
    so that each sin/cos pair shares one wavelength.

    The previous expression chose sin vs. cos from the parity of the
    *position value* instead of the feature index, so every feature of a given
    time step used the same function.
    """
    feature_index = tf.reshape(tf.range(d_model, dtype='float32'), [1, 1, -1])
    pair_index = tf.math.floor(feature_index / 2.0)
    angles = tf.expand_dims(positions, axis=-1) / 10000**(2*pair_index/d_model)
    return tf.where(
        tf.math.mod(feature_index, 2) == 0,
        tf.math.sin(angles),
        tf.math.cos(angles),
    )


# Position embeddings
position_embedding = tf.keras.layers.Lambda(_sinusoidal_position_encoding)

# Combined token position embeddings
embed_out = input_embedding(caption_input) + position_embedding(position_input)
# Prepend the image representation as an extra time step in front of the
# caption token embeddings
image_caption_embed_out = tf.keras.layers.Concatenate(axis=1)([tf.expand_dims(image_features,axis=1), embed_out])

# Generate hidden representation with a stack of four Transformer decoder layers
out = image_caption_embed_out
for _ in range(4):
    out = TransformerDecoderLayer(d_model, 64)(out)

# Final prediction layer: a distribution over n_vocab token IDs per time step
final_out = tf.keras.layers.Dense(n_vocab, activation='softmax')(out)

# Define the final model and compile
full_model = tf.keras.models.Model(inputs=[image_input, caption_input, position_input], outputs=final_out)
full_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

full_model.summary()

"""## Defining the BLEU Metric"""

import collections
import math


def _get_ngrams(segment, max_order):
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i+order])
            ngram_counts[ngram] += 1
    return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus,
                                           translation_corpus):
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram)-1] += overlap[ngram]
        for order in range(1, max_order+1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order-1] += possible_matches

        precisions = [0] * max_order
        for i in range(0, max_order):
            if smooth:
                   precisions[i] = ((matches_by_order[i] + 1.) /
                           (possible_matches_by_order[i] + 1.))
            else:
                if possible_matches_by_order[i] > 0:
                    precisions[i] = (float(matches_by_order[i]) /
                             possible_matches_by_order[i])
                else:
                    precisions[i] = 0.0

        if min(precisions) > 0:
            p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
            geo_mean = math.exp(p_log_sum)
        else:
            geo_mean = 0

        ratio = float(translation_length) / reference_length

        if ratio > 1.0:
            bp = 1.
        else:
            bp = math.exp(1 - 1. / ratio)

        bleu = geo_mean * bp

        return (bleu, precisions, bp, ratio, translation_length, reference_length)

from tensorflow.keras.layers.experimental.preprocessing import StringLookup

class BLEUMetric(object):
    """Compute BLEU between reference and predicted caption token IDs.

    Token IDs are mapped back to token strings with the tokenizer supplied at
    construction time and scored with ``compute_bleu``.
    """

    def __init__(self, tokenizer, name='bleu_metric', **kwargs):
        """Store the tokenizer used to convert IDs to tokens.

        Args:
            tokenizer: Object exposing ``id_to_token``.
            name: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.
        """
        super().__init__()
        self.tokenizer = tokenizer

    def calculate_bleu_from_predictions(self, real, pred):
        """Return the BLEU score of a batch of predictions.

        Args:
            real: Batch of reference token-ID sequences.
            pred: Batch of per-time-step score vectors; the argmax over the
                last axis is taken as the predicted token ID.

        Returns:
            The BLEU value returned by ``compute_bleu`` with ``smooth=False``.
        """
        # Get the predicted token IDs
        pred_argmax = tf.argmax(pred, axis=-1)

        # Convert token IDs to token strings using the tokenizer; int() turns
        # tensor / NumPy scalars into plain Python ints for the lookup
        pred_tokens = np.array([[self.tokenizer.id_to_token(int(pp)) for pp in p] for p in pred_argmax])
        real_tokens = tf.constant([[self.tokenizer.id_to_token(int(rr)) for rr in r] for r in real])

        def clean_text(tokens):
            """Join each token row into text, cut at [END], and re-split."""
            # 3. Strip the string of any extra white spaces
            translations_in_bytes = tf.strings.strip(
                # 2. Replace everything from the first [END] token onwards with blank
                tf.strings.regex_replace(
                    # 1. Join all the tokens to one string in each sequence
                    tf.strings.join(
                        tf.transpose(tokens), separator=' '
                    ),
                    r"\[END\].*", ""),
            )

            # Decode the byte stream to a string
            translations = np.char.decode(
                translations_in_bytes.numpy().astype(np.bytes_), encoding='utf-8'
            )

            # If the string is empty, add a [UNK] token
            # Otherwise get a Division by zero error
            translations = [sent if len(sent)>0 else "[UNK]" for sent in translations]

            # Split the sequences to individual tokens
            translations = np.char.split(translations).tolist()

            return translations

        # Get the clean versions of the predictions and real sequences
        pred_tokens = clean_text(pred_tokens)
        # We have to wrap each real sequence in a list to make use of a function to compute bleu
        real_tokens = [[token_seq] for token_seq in clean_text(real_tokens)]

        # The compute_bleu function accepts the translations and references in the following format
        # translation - list of list of tokens
        # references - list of list of list of tokens
        # Only the first element (the BLEU score) of its result is needed here.
        return compute_bleu(real_tokens, pred_tokens, smooth=False)[0]


batch_size=96

# Fraction of the training captions re-sampled for every epoch, and fraction
# of the validation captions sampled once for all epochs
train_fraction = 0.6
valid_fraction = 0.2

tokenizer = generate_tokenizer(
    train_captions_df, n_vocab=n_vocab
)

bleu_metric = BLEUMetric(tokenizer=tokenizer)

sampled_validation_captions_df = valid_captions_df.sample(frac=valid_fraction)

# The validation sample and tokenizer never change between epochs, so the
# validation dataset is built once instead of being re-tokenized every epoch.
valid_dataset, _ = generate_tf_dataset(
    sampled_validation_captions_df, tokenizer=tokenizer, n_vocab=n_vocab, batch_size=batch_size, training=False
)

for e in range(5):
    print(f"Epoch: {e+1}")

    # A fresh random subset of the training captions for this epoch
    train_dataset, _ = generate_tf_dataset(
        train_captions_df.sample(frac=train_fraction), tokenizer=tokenizer, n_vocab=n_vocab, batch_size=batch_size, training=True
    )

    full_model.fit(
        train_dataset,
        epochs=1
    )

    # Per-batch validation metrics; their unweighted means are reported below
    valid_loss, valid_accuracy, valid_bleu = [], [], []
    for vi, v_batch in enumerate(valid_dataset):
        print(f"{vi+1} batches processed", end='\r')
        loss, accuracy = full_model.test_on_batch(v_batch[0], v_batch[1])
        batch_predicted = full_model(v_batch[0])
        bleu_score = bleu_metric.calculate_bleu_from_predictions(v_batch[1], batch_predicted)
        valid_loss.append(loss)
        valid_accuracy.append(accuracy)
        valid_bleu.append(bleu_score)

    print(
        f"\nvalid_loss: {np.mean(valid_loss)} - valid_accuracy: {np.mean(valid_accuracy)} - valid_bleu: {np.mean(valid_bleu)}"
    )

# Evaluate the trained model on the held-out test captions
bleu_metric = BLEUMetric(tokenizer=tokenizer)

test_dataset, _ = generate_tf_dataset(
    test_captions_df,
    tokenizer=tokenizer,
    n_vocab=n_vocab,
    batch_size=batch_size,
    training=False,
)

# Loss, accuracy and BLEU are collected per batch and averaged (unweighted)
# once all batches have been processed.
test_loss = []
test_accuracy = []
test_bleu = []
for ti, (t_inputs, t_targets) in enumerate(test_dataset):
    print(f"{ti+1} batches processed", end='\r')
    loss, accuracy = full_model.test_on_batch(t_inputs, t_targets)
    test_loss.append(loss)
    test_accuracy.append(accuracy)
    batch_predicted = full_model.predict_on_batch(t_inputs)
    test_bleu.append(
        bleu_metric.calculate_bleu_from_predictions(t_targets, batch_predicted)
    )

print(
    f"\ntest_loss: {np.mean(test_loss)} - test_accuracy: {np.mean(test_accuracy)} - test_bleu: {np.mean(test_bleu)}"
)

# Draw ten random validation captions and serve them as one single batch
n_samples = 10
caption_samples_df = valid_captions_df.sample(n=n_samples)
test_dataset, _ = generate_tf_dataset(
    caption_samples_df,
    tokenizer=tokenizer,
    n_vocab=n_vocab,
    batch_size=n_samples,
    training=False,
)

def generate_caption(model, image_input, tokenizer, n_samples):
    """Greedily decode one caption per image in a batch.

    Decoding starts from a single ``[START]`` token per sample. On every step
    the whole current sequence is fed to ``model`` and replaced by the argmax
    of the model's output, for at most 30 steps or until the last token of
    every sample is ``[END]``.

    Args:
        model: Callable taking ``(image_input, token_ids, position_ids)`` and
            returning a tensor of per-time-step token scores. This argument is
            now actually used; previously the global ``full_model`` was
            called regardless of what was passed.
        image_input: Batch of images for the model.
        tokenizer: Tokenizer providing ``token_to_id`` and ``id_to_token``.
        n_samples: Number of samples in ``image_input``.

    Returns:
        List of ``n_samples`` caption strings; each is cut after its first
        ``[END]`` token and has WordPiece continuations (`` ##``) merged.
    """
    # Look the special-token IDs up instead of hard-coding 2 and 3
    start_id = tokenizer.token_to_id("[START]")
    end_id = tokenizer.token_to_id("[END]")

    batch_tokens = np.repeat(np.array([[start_id]]), n_samples, axis=0)

    for i in range(30):
        if np.all(batch_tokens[:,-1] == end_id):
            break

        position_input = tf.repeat(tf.reshape(tf.range(i+1),[1,-1]), n_samples, axis=0)
        probs = model((image_input, batch_tokens, position_input)).numpy()
        batch_tokens = np.argmax(probs, axis=-1)

    predicted_text = []
    for sample_tokens in batch_tokens:
        sample_predicted_tokens = []
        for wid in sample_tokens.ravel():
            sample_predicted_tokens.append(tokenizer.id_to_token(int(wid)))
            if wid == end_id:
                break
        sample_predicted_text = " ".join(sample_predicted_tokens).replace(" ##", "")
        predicted_text.append(sample_predicted_text)

    return predicted_text


# Pull the single sample batch: images plus the reference caption token IDs
for batch in test_dataset.take(1):
    (batch_image_input, _, _), batch_true_caption = batch

batch_predicted_text = generate_caption(full_model, batch_image_input, tokenizer, n_samples)

fig, axes = plt.subplots(n_samples, 2, figsize=(8,30))

# One figure row per sample: image on the left, reference and predicted
# captions on the right.
for ax_row, sample_image_input, sample_true_caption, sample_predicated_caption in zip(
    axes, batch_image_input, batch_true_caption, batch_predicted_text
):
    true_tokens = [
        tokenizer.id_to_token(wid) for wid in sample_true_caption.numpy().ravel()
    ]
    # Keep tokens up to and including the first [END], if there is one
    if '[END]' in true_tokens:
        true_tokens = true_tokens[:true_tokens.index('[END]') + 1]
    sample_true_text = " ".join(true_tokens).replace(" ##", "")

    # Undo the v*2.0 - 1.0 scaling applied when the image was loaded
    ax_row[0].imshow((sample_image_input.numpy()+1.0)/2.0)
    ax_row[0].axis('off')

    ax_row[1].text(0, 0.75, f"TRUE: {sample_true_text}", fontsize=18)
    ax_row[1].text(0, 0.25, f"PRED: {sample_predicated_caption}", fontsize=18)
    ax_row[1].axis('off')

# full_model.save("image_captioning_modelV2.h5")

# full_model.save_weights("image_captioning_weights.h5")

# full_model.load_weights("/kaggle/working/image_captioning_weights.h5")

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import keras
import shutil
import os
import random
import matplotlib.pyplot as plt
# %matplotlib inline

from keras.preprocessing.image import ImageDataGenerator

# Source directories of the two classes
mask_data = "../input/facemask-dataset/Mask/Mask/"
no_mask_data = "../input/facemask-dataset/No Mask/No Mask/"

total_mask_images = os.listdir(mask_data)
print("no of mask images:: {}".format(len(total_mask_images)))
total_nonmask_images = os.listdir(no_mask_data)
print("no of non-mask images:: {}".format(len(total_nonmask_images)))

# NOTE: os.makedirs raises FileExistsError if a directory already exists, so
# re-running this section against a populated working directory fails here.
os.makedirs('./train/mask')
os.makedirs('./train/no mask')
os.makedirs('./test/mask')
os.makedirs('./test/no mask')

# Draw the 100 training and 30 test files of each class from ONE sample
# without replacement. Sampling the two splits independently (as before) let
# the same image land in both train and test, inflating validation scores.
# random.sample raises ValueError if a class has fewer than 130 images.
mask_sample = random.sample(total_mask_images, 130)
nonmask_sample = random.sample(total_nonmask_images, 130)

for images in mask_sample[:100]:
    shutil.copy(mask_data+images, './train/mask')
for images in mask_sample[100:]:
    shutil.copy(mask_data+images, './test/mask')
for images in nonmask_sample[:100]:
    shutil.copy(no_mask_data+images, './train/no mask')
for images in nonmask_sample[100:]:
    shutil.copy(no_mask_data+images, './test/no mask')

# Training images are rescaled to [0, 1] and augmented; test images are only
# rescaled. Labels are one-hot ('categorical').
train_batch = ImageDataGenerator(rescale=1./255, zoom_range=0.2, horizontal_flip=True, vertical_flip=True, shear_range=0.2).\
            flow_from_directory('./train', target_size=(224,224), batch_size=32, class_mode = 'categorical')
test_batch = ImageDataGenerator(rescale=1./255).\
            flow_from_directory('./test', target_size = (224,224), batch_size=32, class_mode='categorical')

train_batch.class_indices

# Class names, indexed by the model's predicted class index
# NOTE(review): must match train_batch.class_indices above - confirm.
class_mask = ['mask', 'no mask']

#import vgg16
from keras.applications.vgg16 import VGG16

IMAZE_SIZE = [224,224]
vgg = VGG16(input_shape=IMAZE_SIZE+[3], weights='imagenet', include_top=False)

# Freeze the convolutional base; only the new classification head is trained
for layers in vgg.layers:
    layers.trainable = False

flatten_layer = keras.layers.Flatten()(vgg.output)
prediction_layer = keras.layers.Dense(2, activation='softmax')(flatten_layer)

model = keras.models.Model(inputs = vgg.input, outputs = prediction_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model.fit accepts generators directly; fit_generator is deprecated
r = model.fit(train_batch, validation_data=test_batch, epochs=5, steps_per_epoch=len(train_batch), validation_steps=len(test_batch))

# Loss curves get their own figure so they are not overlaid with the
# accuracy curves when this file runs as a plain script.
plt.figure()
plt.plot(r.history['loss'], label = 'train loss')
plt.plot(r.history['val_loss'], label='val loss')
plt.legend()

# Accuracy curves
plt.figure()
plt.plot(r.history['accuracy'], label = 'train acc')
plt.plot(r.history['val_accuracy'], label='val acc')
plt.legend()

# Persist the trained classifier (architecture + weights) in HDF5 format
model.save("best_testfacemask.hdf5")

import keras.utils as imagel
from keras.applications.imagenet_utils import preprocess_input

# Classify one image of each class and show it.
# The classifier head was trained on images rescaled by 1/255
# (ImageDataGenerator(rescale=1./255)), so inference must apply the same
# scaling. The previous ImageNet preprocess_input call fed the model inputs in
# a completely different value range than it was trained on.
for sample_path in (
    '../input/facemask-dataset/No Mask/No Mask/No Mask109.jpg',
    '../input/facemask-dataset/Mask/Mask/Mask214.jpeg',
):
    img = imagel.load_img(sample_path, target_size=(224,224))
    x = imagel.img_to_array(img)
    x = np.expand_dims(x,0)  # add the batch axis
    y = x / 255.0
    pred = class_mask[np.argmax(model.predict(y))]
    print(pred)
    # Separate figure per image so the second does not overwrite the first
    plt.figure()
    plt.imshow(img)

# import tensorflow as tf
# from tensorflow.keras.models import load_model
# import matplotlib.pyplot as plt
# import numpy as np

# # Step 1: Preprocess the independent image
# def preprocess_image(image_path, image_size):
#     img = tf.keras.preprocessing.image.load_img(image_path, target_size=(image_size, image_size))
#     img_array = tf.keras.preprocessing.image.img_to_array(img)
#     img_array = tf.expand_dims(img_array, 0)
#     return img_array / 255.0  # Rescale pixel values to [0, 1]

# # # Step 2: Load the trained model
# # model_path = '/content/drive/MyDrive/Datasets/best_test.hdf5'
# # model = load_model(model_path)

# # Step 3: Make predictions on the independent image
# def predict(model, image_array, class_names):
#     predictions = model.predict(image_array)
#     predict_class = class_names[np.argmax(predictions[0])]
#     confidence = round(100 * np.max(predictions[0]), 2)
#     return predict_class, confidence

# # Step 4: Provide the path to the independent image and test it
# # independent_image_path = '/kaggle/input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000092795.jpg'  # Replace with the actual path to the image
# IMAGE_SIZE = 256
# # classes_names = ['class_1', 'class_2', 'class_3']  # Replace with your class names used during training

# # preprocessed_image = preprocess_image(independent_image_path, IMAGE_SIZE)
# # predicted_class, confidence = predict(model2, preprocessed_image, classes_names)

# Load a previously saved classifier from disk.
# NOTE(review): model2 is not referenced anywhere else in this file; the
# detectmask function below predicts with the in-memory `model` instead -
# confirm which of the two is intended.
model2=tf.keras.models.load_model('/kaggle/input/facemaskdataset/best_test.hdf5')

def detectmask(imgpth):
    """Classify the image at ``imgpth`` with the module-level ``model``.

    The image is loaded at 224x224, given a batch axis, and rescaled by 1/255
    to match the ``rescale=1./255`` used by the training generators. The
    previous ImageNet ``preprocess_input`` call did not match the training
    preprocessing.

    Args:
        imgpth: Path of the image file.

    Returns:
        The entry of ``class_mask`` at the model's highest-scoring class index.
    """
    img = imagel.load_img(imgpth, target_size=(224,224))
    x = imagel.img_to_array(img)
    x = np.expand_dims(x,0)
    y = x / 255.0
    pred = class_mask[np.argmax(model.predict(y))]
    return pred

# Classify a single image; the bare name echoes the result in a notebook cell
ans = detectmask(
    '/kaggle/input/facemask/train/without_mask/0690.jpg'
)
ans

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

def load_and_preprocess_image(image_path):
    """Read a JPEG, resize it to 224x224 and apply Inception-v3 preprocessing.

    Args:
        image_path: Path of the JPEG file.

    Returns:
        The preprocessed 3-channel image tensor.
    """
    decoded = tf.image.decode_jpeg(tf.io.read_file(image_path), channels=3)
    # Adjust the size as needed
    resized = tf.image.resize(decoded, (224, 224))
    return tf.keras.applications.inception_v3.preprocess_input(resized)

def generate_image_caption(image_path, model, tokenizer):
    """Caption a single image and classify it as mask / no mask.

    The caption is decoded greedily: starting from ``[START]``, the current
    token sequence is fed to ``model`` and replaced by the argmax of its
    output, for at most 30 steps or until the last token is ``[END]``.

    Args:
        image_path: Path of the JPEG image.
        model: Callable taking ``(image, token_ids, position_ids)`` and
            returning per-time-step token scores.
        tokenizer: Tokenizer providing ``token_to_id`` and ``id_to_token``.

    Returns:
        Tuple ``(predicted_text, image, mask)``: the caption (cut after the
        first ``[END]``, WordPiece continuations merged), the preprocessed
        image tensor, and the label returned by ``detectmask``.
    """
    mask = detectmask(image_path)
    image = load_and_preprocess_image(image_path)

    image_input = np.expand_dims(image, axis=0)

    n_samples = 1

    # Look the special-token IDs up instead of hard-coding 2 and 3
    start_id = tokenizer.token_to_id("[START]")
    end_id = tokenizer.token_to_id("[END]")

    batch_tokens = np.repeat(np.array([[start_id]]), n_samples, axis=0)

    for i in range(30):
        if np.all(batch_tokens[:, -1] == end_id):
            break

        position_input = tf.repeat(tf.reshape(tf.range(i + 1), [1, -1]), n_samples, axis=0)
        probs = model((image_input, batch_tokens, position_input)).numpy()
        batch_tokens = np.argmax(probs, axis=-1)

    predicted_text = []
    for wid in batch_tokens[0]:
        predicted_text.append(tokenizer.id_to_token(int(wid)))
        if wid == end_id:
            break

    predicted_text = " ".join(predicted_text).replace(" ##", "")

    return predicted_text, image, mask

def visualize_image_with_caption(image_path, model, tokenizer):
    """Show an image with its generated caption as the plot title.

    When the mask classifier reports ``'mask'``, the ``[END]`` marker is
    removed from the caption and ``" with mask [END]"`` is appended.

    Args:
        image_path: Path of the JPEG image.
        model: Captioning model forwarded to ``generate_image_caption``.
        tokenizer: Tokenizer forwarded to ``generate_image_caption``.
    """
    predicted_caption, image, mask = generate_image_caption(image_path, model, tokenizer)

    # The Inception preprocessing used when loading maps pixels to [-1, 1];
    # map them back to [0, 1] so imshow renders the image without clipping
    # (same conversion as in the caption-sample plot earlier in this file).
    plt.imshow((image + 1.0) / 2.0)
    plt.axis('off')
    if mask == 'mask':
        title = f"{predicted_caption.replace('[END]', '')} with mask [END]"
        plt.title(title)
    else:
        plt.title(predicted_caption)
    plt.show()

# Usage examples: caption a handful of mask / no-mask images in turn.
# Replace the entries with the actual image paths.
for image_path in (
    "/kaggle/input/facemask-dataset/Mask/Mask/Mask118.jpeg",
    "/kaggle/input/facemask/train/without_mask/0699.jpg",
    "/kaggle/input/facemask-dataset/Mask/Mask/Mask126.jpg",
    "/kaggle/input/facemask/train/with_mask/0235.jpg",
    "/kaggle/input/facemask/train/with_mask/0256.jpg",
    "/kaggle/input/facemask/train/without_mask/0717.jpg",
):
    visualize_image_with_caption(image_path, full_model, tokenizer)
