In [43]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, MultiHeadAttention

# Assuming you have an input tensor with shape (height, width, channels)
input_shape = (64, 64, 3)
num_classes = 10

# Define the model
inputs = Input(shape=input_shape)
x = Conv2D(64, (4, 4), activation='relu')(inputs)

# Add a self-attention layer
attention_output = MultiHeadAttention(key_dim=60, num_heads=4)(x, x)

# Flatten the output for further processing
flattened = Flatten()(attention_output)

# Add more layers as needed
x = Dense(128, activation='relu')(flattened)
outputs = Dense(num_classes, activation='softmax')(x)

# Create the model
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model and specify the optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_32 (InputLayer)       [(None, 64, 64, 3)]          0         []                            
                                                                                                  
 conv2d_13 (Conv2D)          (None, 61, 61, 64)           3136      ['input_32[0][0]']            
                                                                                                  
 multi_head_attention_22 (M  (None, 61, 61, 64)           62224     ['conv2d_13[0][0]',           
 ultiHeadAttention)                                                  'conv2d_13[0][0]']           
                                                                                                  
 flatten_13 (Flatten)        (None, 238144)               0         ['multi_head_attention_2

# Project 4

### team members:
### 1. Sasi Kanduri
### 2. Vikas Mishra
### 3. Ashish Thranath Kotian

### import libraries

In [None]:
import numpy as np
from numpy import array
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation, Dropout, LSTM, MultiHeadAttention
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model, image_dataset_from_directory
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, Rescaling, MaxPooling2D, concatenate

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report

### Importing training/validation/test Data from the files downlaoded with image_downloader.py

In [None]:
df_train = pd.read_csv('final_df_train.csv', index_col=False)
df_validate = pd.read_csv('final_df_validate.csv', index_col=False)
df_test = pd.read_csv('final_df_test.csv', index_col=False)

y_train = df_train['2_way_label'].to_numpy()
y_validate = df_validate['2_way_label'].to_numpy()
y_test = df_test['2_way_label'].to_numpy()

print(f"train set x shape : {df_train.shape} y shape: {y_train.shape}")
print(f"validate set x shape : {df_validate.shape} y shape: {y_validate.shape}")
print(f"test set x shape : {df_test.shape} y shape: {y_test.shape}")


train set x shape : (1921, 2) y shape: (1921,)
validate set x shape : (486, 2) y shape: (486,)
test set x shape : (477, 2) y shape: (477,)


## Text Model with GLoVe Embedding for text data

### Contractions

In [None]:
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

### Function to clean the text using the contractions

In [None]:
def get_clean_text(x):
    if type(x) is str:
        x = x.lower()
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x


### Applying Contraction Function on Train/Validation/Testing Data

In [None]:
df_train['text with images'] = df_train['text with images'].apply(lambda x: get_clean_text(x))
df_validate['text with images'] = df_validate['text with images'].apply(lambda x: get_clean_text(x))
df_test['text with images'] = df_test['text with images'].apply(lambda x: get_clean_text(x))

### Train Data After Applying Contraction Function

In [None]:
df_train['text with images']

0       my walgreens offbrand mucinex was engraved wit...
1           hackers leak emails from uae ambassador to us
2                                puppy taking in the view
3       bride and groom exchange vows after fatal shoo...
4                                           major thermos
                              ...                        
1916              us solar installations hit million mark
1917    thought this little flower was pretty interesting
1918                          the john rylands library oc
1919    this snail who has climbed up to my first stor...
1920    plop your infant in front of these pictures of...
Name: text with images, Length: 1921, dtype: object

### Validation Data After Applying Contraction Function

In [None]:
df_validate['text with images']

0                   my xbox controller says hi
1               new image from the mandalorian
2                say hello to my little friend
3                   watch your step little one
4      this tree i found with a solo cup on it
                        ...                   
481                               high fashion
482                    years old world records
483                railroad track senior photo
484         a rare photograph of billy the kid
485        the onion reviews crazy rich asians
Name: text with images, Length: 486, dtype: object

### Test Data After Applying Contraction Function

In [None]:
df_test['text with images']

0                                              stargazer
1                                                   yeah
2      pd phoenix car thief gets instructions from yo...
3      as trump accuses iran he has one problem his o...
4                                    believers hezbollah
                             ...                        
472                                           angry baby
473                            this sign in a restaurant
474                                       disaster pratt
475    reading the manifesto of russia painting by gr...
476                             httpsiimgurcomxcvuzmtjpg
Name: text with images, Length: 477, dtype: object

### Generating Tokens using keras tokenizer

In [None]:
text_train = df_train['text with images'].tolist()
text_validate = df_validate['text with images'].tolist()
text_test = df_test['text with images'].tolist()

token = Tokenizer()

token.fit_on_texts(text_train)


### vocabulary size

In [None]:
vocab_size  = len(token.word_index) + 1
vocab_size

5063

### Encoding Text to Sequences for Train/Validation/Test Data

In [None]:
encoded_text_train = token.texts_to_sequences(text_train)
encoded_text_validate = token.texts_to_sequences(text_validate)
encoded_text_test = token.texts_to_sequences(text_test)

### Padding

In [None]:
max_length = 225
text_train = pad_sequences(encoded_text_train, maxlen=max_length, padding='post')
text_validate = pad_sequences(encoded_text_validate, maxlen=max_length, padding='post')
text_test = pad_sequences(encoded_text_test, maxlen=max_length, padding='post')

### Converting data to numpy arrays

In [None]:
text_train = np.array(text_train)
text_validate = np.array(text_validate)
text_test = np.array(text_test)

In [None]:
text_train[0]

array([   8, 1645, 1646,  878,   24, 1647,   13,    1,  879,  878,   90,
          6,    2,  250,  323,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Reading the Glove Vectors for each word from the pretrained embeddings file 'glove.twitter.27B.25d.txt' - 25 dimensions

In [None]:
glove_vectors = dict()

In [None]:
%%time

file = open('glove.twitter.27B.25d.txt', encoding='utf-8')

for line in file:
    values = line.split()
    word = values[0]
    vectors = np.asarray(values[1: ])
    glove_vectors[word] = vectors
file.close()

CPU times: user 5.21 s, sys: 249 ms, total: 5.46 s
Wall time: 5.51 s


In [None]:
len(glove_vectors)

1193514

### Creating Word Matrix For Each Word In Token

In [None]:
word_vector_matrix = np.zeros((vocab_size, 25))

tokens = []
labels = []

for word, index in token.word_index.items():   # index returned here starts with 1 so we need set vocab_size = len(token.word_index) + 1  to be able to index up to the greatest token ID
    vector = glove_vectors.get(word)
    if vector is not None:
        word_vector_matrix[index] = vector


### Word matrix size

In [None]:
word_vector_matrix.shape

(5063, 25)

### Building The Glove Embedding Model

In [None]:
# text model
input1 = Input(shape = (max_length))
embedding = Embedding(vocab_size, 25, weights = [word_vector_matrix], trainable = False, name='embedding')(input1)

lstm_layer1 = LSTM(units=100)(embedding)

dense1 = Dense(64, activation='relu')(lstm_layer1)

text_model = Model(inputs = input1, outputs = dense1)

## CNN model for images

### Converting target label to list for labels

In [None]:

y_labels_train = y_train.tolist()
y_labels_validate = y_validate.tolist()
y_labels_test = y_test.tolist()

### Creating dataset for our train/validation/test Data

In [None]:

image_train = image_dataset_from_directory("images_train", labels=y_labels_train, label_mode="binary", image_size=(64,64), batch_size=32, color_mode='rgb')
image_val = image_dataset_from_directory("images_validate", labels=y_labels_validate, label_mode="binary", image_size=(64,64), batch_size=32, color_mode='rgb')
image_test = image_dataset_from_directory("images_test", labels=y_labels_test, label_mode="binary", image_size=(64,64), batch_size=32, color_mode='rgb')

Found 1921 files belonging to 2 classes.
Found 486 files belonging to 2 classes.
Found 477 files belonging to 2 classes.


### Autotune with keras for performance

In [None]:
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = image_train.prefetch(buffer_size=AUTOTUNE)

# train data
data_batches = []

for batch in train_dataset:
    data_batches.append(batch[0])

image_data_train = np.concatenate(data_batches, axis=0)

#validation data
val_dataset = image_val.prefetch(buffer_size=AUTOTUNE)

data_batches = []

for batch in val_dataset:
    try:
        data_batches.append(batch[0])
    except:
        print(batch)

image_data_validate = np.concatenate(data_batches, axis=0)

# testd data
test_dataset = image_test.prefetch(buffer_size=AUTOTUNE)

data_batches = []

for batch in test_dataset:
    data_batches.append(batch[0])

image_data_test = np.concatenate(data_batches, axis=0)

In [None]:
print(image_data_train.shape)
print(image_data_validate.shape)
print(image_data_test.shape)

(1921, 64, 64, 3)
(486, 64, 64, 3)
(477, 64, 64, 3)


### Building the CNN Model

In [None]:
input2 = Input(shape=(64,64,3))
rescaling = Rescaling(1./255)(input2) # scale pixels

conv1 = Conv2D(100, (4, 4), activation='relu')(rescaling)
pool1 = MaxPooling2D((2, 2), padding='same')(conv1)
conv2 = Conv2D(64, (2, 2), activation='relu')(pool1)
pool2 = MaxPooling2D((2, 2), padding='same')(conv2)

flat_layer = Flatten()(pool2)
dense2 = Dense(64, activation='relu')(flat_layer)

image_model = Model(inputs = input2, outputs = dense2)

## Creating a multimodal architecture by merging the dense layers of text and image models

In [None]:
text_model.summary()

Model: "model_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 225)]             0         
                                                                 
 embedding (Embedding)       (None, 225, 25)           126575    
                                                                 
 lstm_8 (LSTM)               (None, 100)               50400     
                                                                 
 dense_44 (Dense)            (None, 64)                6464      
                                                                 
Total params: 183439 (716.56 KB)
Trainable params: 56864 (222.12 KB)
Non-trainable params: 126575 (494.43 KB)
_________________________________________________________________


In [None]:
image_model.summary()

Model: "model_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_16 (InputLayer)       [(None, 64, 64, 3)]       0         
                                                                 
 rescaling_7 (Rescaling)     (None, 64, 64, 3)         0         
                                                                 
 conv2d_14 (Conv2D)          (None, 61, 61, 100)       4900      
                                                                 
 max_pooling2d_14 (MaxPooli  (None, 31, 31, 100)       0         
 ng2D)                                                           
                                                                 
 conv2d_15 (Conv2D)          (None, 30, 30, 64)        25664     
                                                                 
 max_pooling2d_15 (MaxPooli  (None, 15, 15, 64)        0         
 ng2D)                                                    

In [None]:
merge = concatenate([dense1, dense2])

hidden1 = Dense(128, activation='relu')(merge)
hidden2 = Dense(32, activation='relu')(hidden1)
output = Dense(1, activation='sigmoid')(hidden2)

base_model = Model(inputs=[input1, input2], outputs=output)

base_model.summary()

Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_16 (InputLayer)       [(None, 64, 64, 3)]          0         []                            
                                                                                                  
 rescaling_7 (Rescaling)     (None, 64, 64, 3)            0         ['input_16[0][0]']            
                                                                                                  
 conv2d_14 (Conv2D)          (None, 61, 61, 100)          4900      ['rescaling_7[0][0]']         
                                                                                                  
 max_pooling2d_14 (MaxPooli  (None, 31, 31, 100)          0         ['conv2d_14[0][0]']           
 ng2D)                                                                                     

In [None]:
checkpointer = ModelCheckpoint(filepath="best_nultimodal.hdf5", verbose=2, save_best_only=True, monitor='val_loss')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=2, mode='auto')

base_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

base_model.fit([text_train, image_data_train], y_train, epochs=50, callbacks=[monitor, checkpointer], batch_size = 32,
            validation_data=([text_validate, image_data_validate], y_validate))

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.68774, saving model to best_nultimodal.hdf5
Epoch 2/50
 3/61 [>.............................] - ETA: 2s - loss: 0.7184 - accuracy: 0.4271

  saving_api.save_model(


Epoch 2: val_loss improved from 0.68774 to 0.67463, saving model to best_nultimodal.hdf5
Epoch 3/50
Epoch 3: val_loss did not improve from 0.67463
Epoch 4/50
Epoch 4: val_loss improved from 0.67463 to 0.67362, saving model to best_nultimodal.hdf5
Epoch 5/50
Epoch 5: val_loss did not improve from 0.67362
Epoch 6/50
Epoch 6: val_loss did not improve from 0.67362
Epoch 7/50
Epoch 7: val_loss did not improve from 0.67362
Epoch 8/50
Epoch 8: val_loss did not improve from 0.67362
Epoch 9/50
Epoch 9: val_loss did not improve from 0.67362
Epoch 9: early stopping


<keras.src.callbacks.History at 0x5d126d450>

In [None]:
base_model.load_weights("best_nultimodal.hdf5")
pred = base_model.predict([text_test, image_data_test])
pred
pred = (pred > 0.5)




In [None]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.58      1.00      0.74       278
           1       0.00      0.00      0.00       199

    accuracy                           0.58       477
   macro avg       0.29      0.50      0.37       477
weighted avg       0.34      0.58      0.43       477



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MultiHeadAttention, Flatten, Dense

# Create input layer
input_layer = Input(shape=(64, 64, 3))

# Convolutional layer
conv_layer = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')(input_layer)

# Reshape the output of the convolutional layer
reshape_layer = tf.keras.layers.Reshape((-1, 64))(conv_layer)

# Multi-headed self-attention layer
attention_layer = MultiHeadAttention(key_dim=64, num_heads=4)(reshape_layer, reshape_layer, reshape_layer)

# Flatten the attention output
flatten_layer = Flatten()(attention_layer)

# Fully connected layer
dense_layer = Dense(128, activation='relu')(flatten_layer)

# Output layer
output_layer = Dense(num_classes, activation='softmax')(dense_layer)

# Create the model
model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Compile the model and specify the optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_34 (InputLayer)       [(None, 64, 64, 3)]          0         []                            
                                                                                                  
 conv2d_15 (Conv2D)          (None, 62, 62, 64)           1792      ['input_34[0][0]']            
                                                                                                  
 reshape_1 (Reshape)         (None, 3844, 64)             0         ['conv2d_15[0][0]']           
                                                                                                  
 multi_head_attention_24 (M  (None, 3844, 64)             66368     ['reshape_1[0][0]',           
 ultiHeadAttention)                                                  'reshape_1[0][0]',     