In [1]:
# Google colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U bitsandbytes
!pip install -q nltk

In [2]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import accelerate

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime, calendar
from ast import literal_eval
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pad_sequence
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, FalconForCausalLM, BitsAndBytesConfig
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

import nltk
from nltk.translate.bleu_score import corpus_bleu

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type="nf4",
    bnb_8bit_compute_dtype=torch.bfloat16
)

In [12]:
model = FalconForCausalLM.from_pretrained("tiiuae/falcon-7b", quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
model

FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): Linear8bitLt(in_features=4544, out_features=4672, bias=False)
          (dense): Linear8bitLt(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear8bitLt(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)

In [14]:
class Custom_Model(nn.Module):
    """Custom version of Falcon to enable direct use of embeddings.
    """
    def __init__(self, model, embed_dim=768, seq_len=5):
        super(Custom_Model,self).__init__()
        self.seq_len = seq_len
        self.layer = nn.Linear(embed_dim, 2048*self.seq_len)
        self.conv_layer1 = nn.Conv1d(256, 271, kernel_size=3, padding=1)
        self.conv_layer2 = nn.Conv1d(261, 271, kernel_size=3, padding=1)
        self.conv_layer3 = nn.Conv1d(266, 271, kernel_size=3, padding=1)
        self.conv_layer4 = nn.Conv1d(271, 271, kernel_size=3, padding=1)
        self.model = model
        self.word_embed = model.transformer.word_embeddings

    def forward(self, mode1, mode2, mode3, tokenized_prompt, label, batch_size): 
        """Forward pass for image, video and audio modes."""
        features = []

        text_embed = self.word_embed(tokenized_prompt).squeeze()
        features.append(text_embed)

        count_conv = 3

        if torch.any(mode1 != 0): # Image
          mode1_embed = self.layer(mode1).squeeze()

          mode1_embed_reshape = torch.reshape(mode1_embed, (self.seq_len,2048))
          features.append(mode1_embed_reshape)
          count_conv -= 1

        if torch.any(mode2 != 0): # Video
          mode2_embed = self.layer(mode2).squeeze()

          mode2_embed_reshape = torch.reshape(mode2_embed, (self.seq_len,2048))
          features.append(mode2_embed_reshape)
          count_conv -= 1

        if torch.any(mode3 != 0): # Audio
          mode3_embed = self.layer(mode3).squeeze()

          mode3_embed_reshape = torch.reshape(mode3_embed, (self.seq_len,2048))
          features.append(mode3_embed_reshape)
          count_conv -= 1


        embed_output = torch.squeeze(torch.cat(features, dim=1))

        if count_conv == 3:
          conv_output = self.conv_layer1(embed_output)
        elif count_conv == 2:
          conv_output = self.conv_layer2(embed_output)
        elif count_conv == 1:
          conv_output = self.conv_layer3(embed_output)
        else:
          conv_output = self.conv_layer4(embed_output)

        output = self.model(inputs_embeds = conv_output, labels = label)

        return output

In [15]:
model_trial = Custom_Model(model)

## Lora Configuration

In [16]:
config = LoraConfig(
    r=16, # rank
    lora_alpha=16,
    target_modules=["query_key_value", "layer", "transformer.word_embeddings", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none"
)

In [17]:
model_trial = prepare_model_for_kbit_training(model_trial)

In [18]:
model_trial = get_peft_model(model_trial,config)
model_trial.print_trainable_parameters()

trainable params: 33,926,144 || all params: 6,964,379,394 || trainable%: 0.48713807908323153


In [27]:
model_trial

PeftModel(
  (base_model): LoraModel(
    (model): Custom_Model(
      (layer): Linear(in_features=768, out_features=10240, bias=True)
      (conv_layer1): Conv1d(256, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer2): Conv1d(261, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer3): Conv1d(266, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer4): Conv1d(271, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (model): FalconForCausalLM(
        (transformer): FalconModel(
          (word_embeddings): Embedding(65024, 4544)
          (h): ModuleList(
            (0-31): 32 x FalconDecoderLayer(
              (self_attention): FalconAttention(
                (maybe_rotary): FalconRotaryEmbedding()
                (query_key_value): lora.Linear(
                  (base_layer): Linear8bitLt(in_features=4544, out_features=4672, bias=False)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05,

## Dataset Analysis and Preprocessing

In [40]:
train_df = pd.read_csv("audio_video_image_embeds_finale.csv")
train_df.columns

Index(['Unnamed: 0', 'index', 'id', 'date', 'likes', 'content', 'username',
       'media', 'inferred company', 'single_media', 'caption', 'audio',
       'image_embeds', 'video_embeds', 'audio_embeds'],
      dtype='object')

In [32]:
len(train_df)

5000

In [33]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,id,date,likes,content,username,media,inferred company,single_media,caption,audio,image_embeds,video_embeds,audio_embeds
0,0,0,1,12-12-2020 00:47,1,"Spend your weekend morning with a Ham, Egg, an...",TimHortonsPH,[Photo(previewUrl='https://pbs.twimg.com/media...,tim hortons,https://pbs.twimg.com/media/Eo8N3JLVoAAlDJT?fo...,a sandwich and a cup of coffee on a table,,"[[0.006107481662184, 0.007661679293960333, 0.0...",,
1,1,1,2,30-06-2018 10:04,2750,Watch rapper <mention> freestyle for over an H...,IndyMusic,[Photo(previewUrl='https://pbs.twimg.com/media...,independent,https://pbs.twimg.com/media/Dg7mW-VX0AE_hMn?fo...,a young man in a blue shirt is playing a video...,,"[[0.02237328514456749, -0.0030112541280686855,...",,
2,2,2,3,29-09-2020 19:47,57,Canadian Armenian community demands ban on mil...,CBCCanada,[Photo(previewUrl='https://pbs.twimg.com/media...,cbc,https://pbs.twimg.com/media/EjG2s4aXgAMNM1o?fo...,a man on a fire hydrant spraying water on the ...,,"[[-0.0002408653817838058, 0.010936545208096504...",,
3,3,3,4,01-10-2020 11:40,152,"1st in Europe to be devastated by COVID-19, It...",MKWilliamsRome,[Photo(previewUrl='https://pbs.twimg.com/media...,williams,https://pbs.twimg.com/media/EjPaVniX0AAaWLJ?fo...,a beach filled with lots of people and umbrellas,,"[[-0.002168221166357398, -0.04033593460917473,...",,
4,4,4,6,15-11-2020 16:01,525,An 85-year-old primary school in Shanghai has ...,cnni,[Video(thumbnailUrl='https://pbs.twimg.com/amp...,cnn,https://video.twimg.com/amplify_video/13223822...,A group of people are building a building in t...,"tensor([[-4.8395e-02, 3.3148e-02, -3.0773e-02...","[[-0.04480615630745888, 0.0036263801157474518,...","[[-0.04480615630745888, 0.0036263801157474518,...",[[-4.8395e-02 3.3148e-02 -3.0773e-02 1.1186e...


In [41]:
def convert_to_array(value):
  "Safely convert to array."
  try:
    return np.array(literal_eval(value))
  except Exception as e:
    return

In [42]:
train_df['audio_embeds'] = train_df['audio_embeds'].apply(convert_to_array)
train_df['image_embeds'] = train_df['image_embeds'].apply(convert_to_array)
train_df['video_embeds'] = train_df['video_embeds'].apply(convert_to_array)

In [43]:
train_df["audio_embeds"].fillna('nan', inplace=True)
train_df["image_embeds"].fillna('nan', inplace=True)
train_df["video_embeds"].fillna('nan', inplace=True)

In [33]:
train_df = train_df[:100] # for testing

## Dataset Class

In [45]:
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
tokenizer.pad_token = tokenizer.eos_token

In [46]:
def tokenize_data(prompt, label, tokenizer, max_length1=256, max_length2=271):
  inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length1, truncation=True, padding="max_length")
  labels = tokenizer(label, return_tensors="pt", max_length=max_length2, truncation=True, padding="max_length")
  return inputs, labels

In [47]:
def findDay(date):
  "Find day of the week from date."
  born = datetime.datetime.strptime(date, '%Y %m %d').weekday()
  return (calendar.day_name[born])

def convert_to_tensor(value, embedding):
  """Convert embeddings to tensors."""
  if value == "audio":
    if type(embedding) != str:
      try:
        a = torch.tensor(eval(embedding))
        return a
      except SyntaxError as e: # Addressing NaN values
        ans = torch.zeros([1,768])
        return ans
    else:
      ans = torch.zeros([1,768])
      return ans

  if value == "image":
    if type(embedding) != str:
      embedding = embedding[0:-21]
      try:
        a = torch.tensor(eval(embedding))
        return a
      except SyntaxError as e:
        ans = torch.zeros([1,768])
        return ans
    else:
      ans = torch.zeros([1,768])
      return ans

  if value == "video":
    if type(embedding) != str:
      embedding = embedding[7:-22]
      try:
        a = torch.tensor(eval(embedding))
        return a
      except SyntaxError as e:
        ans = torch.zeros([1,768])
        return ans
    else:
      ans = torch.zeros([1,768])
      return ans

class content(Dataset):
  """Dataset class for pytorch DataLoaders"""
  def __init__(self,df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    """Concatenate embeddings and return as tensors."""
    likes = self.df.loc[idx,"likes"]
    user = self.df.loc[idx,"username"]
    company = self.df.loc[idx,"inferred company"]
    date = self.df.loc[idx,"date"].split(" ")[0]
    time = self.df.loc[idx,"date"].split(" ")[1]
    day = findDay(self.df.loc[idx,"date"].split(" ")[0].split("-")[2]+" "+self.df.loc[idx,"date"].split(" ")[0].split("-")[1]+" "+self.df.loc[idx,"date"].split(" ")[0].split("-")[0])
    url = self.df.loc[idx,"single_media"]

    self.df["audio_embeds"] = [a[7:-1] if isinstance(a, str) and a != 'nan' else a for a in self.df["audio_embeds"]]
    self.df["image_embeds"] = [a[7:-1] if isinstance(a, str) and a !=' nan' else a for a in self.df["image_embeds"]]
    self.df["video_embeds"] = [str(a) for a in self.df["video_embeds"]]

    image_embed = convert_to_tensor("image", self.df.at[idx,"image_embeds"])
    video_embed = convert_to_tensor("video", self.df.at[idx,"video_embeds"])
    audio_embed = convert_to_tensor("audio", self.df.at[idx,"audio_embeds"])

    prompt = f"predict the tweet content for the following tweet :-\n\n{user} posted this tweet with an image.\nthis tweet was posted on {day}, {date} at {time}\nthe tweet has generated {likes} likes \nThe inferred company mentioned: {company}.\n\ntweet content :\n"

    label = self.df.loc[idx,"content"].replace("<mention>",company).replace("<hyperlink>"," ")

    prompt,label = tokenize_data(prompt,label,tokenizer)

    image_embed = image_embed.float()
    video_embed = video_embed.float()
    audio_embed = audio_embed.float()


    prompt_token = prompt["input_ids"]
    label_token = label["input_ids"]

    return [image_embed, video_embed, audio_embed, prompt_token], label_token

  if value is "audio":
  if value is "image":
  if value is "video":


## Training

In [50]:
batch_size = 4
train_dataset= content(train_df)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0)

optimizer = AdamW(model_trial.parameters(), lr=1e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(train_loader) * 3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_trial.to(device)



PeftModel(
  (base_model): LoraModel(
    (model): Custom_Model(
      (layer): lora.Linear(
        (base_layer): Linear(in_features=768, out_features=10240, bias=True)
        (lora_dropout): ModuleDict(
          (default): Dropout(p=0.05, inplace=False)
        )
        (lora_A): ModuleDict(
          (default): Linear(in_features=768, out_features=16, bias=False)
        )
        (lora_B): ModuleDict(
          (default): Linear(in_features=16, out_features=10240, bias=False)
        )
        (lora_embedding_A): ParameterDict()
        (lora_embedding_B): ParameterDict()
      )
      (conv_layer1): Conv1d(256, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer2): Conv1d(261, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer3): Conv1d(266, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer4): Conv1d(271, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (model): FalconForCausalLM(
        (transformer): FalconModel(
    

In [51]:
def froze_check(model):
  train_layers = ["base_model.model.layer.weight", "base_model.model.layer.bias",
                  "base_model.model.conv_layer1.weight", "base_model.model.conv_layer1.bias",
                  "base_model.model.conv_layer2.weight", "base_model.model.conv_layer2.bias",
                  "base_model.model.conv_layer3.weight", "base_model.model.conv_layer3.bias",
                  "base_model.model.conv_layer4.weight", "base_model.model.conv_layer4.bias"]
  for name, param in model.named_parameters():
    if name in train_layers:
      param.requires_grad = True
    print(f'Layer: {name}, Requires Grad: {param.requires_grad}')

froze_check(model_trial)

Layer: base_model.model.layer.base_layer.weight, Requires Grad: False
Layer: base_model.model.layer.base_layer.bias, Requires Grad: False
Layer: base_model.model.layer.lora_A.default.weight, Requires Grad: True
Layer: base_model.model.layer.lora_B.default.weight, Requires Grad: True
Layer: base_model.model.conv_layer1.weight, Requires Grad: True
Layer: base_model.model.conv_layer1.bias, Requires Grad: True
Layer: base_model.model.conv_layer2.weight, Requires Grad: True
Layer: base_model.model.conv_layer2.bias, Requires Grad: True
Layer: base_model.model.conv_layer3.weight, Requires Grad: True
Layer: base_model.model.conv_layer3.bias, Requires Grad: True
Layer: base_model.model.conv_layer4.weight, Requires Grad: True
Layer: base_model.model.conv_layer4.bias, Requires Grad: True
Layer: base_model.model.model.transformer.word_embeddings.base_layer.weight, Requires Grad: False
Layer: base_model.model.model.transformer.word_embeddings.lora_embedding_A.default, Requires Grad: True
Layer: bas

In [53]:
num_epochs=3
# Training loop
for epoch in range(num_epochs):
    model_trial.train()
    total_loss = 0

    for step, batch in enumerate(tqdm(train_loader)):

        image_input = batch[0][0].to(device)
        video_input = batch[0][1].to(device)
        audio_input = batch[0][2].to(device)

        inputs = batch[0][3].to(device)
        labels = batch[1].to(device)

        output = model_trial(image_input, video_input, audio_input, inputs, labels, batch_size)
        vocab = output.logits

        optimizer.zero_grad()
        loss = output.loss
        total_loss += loss.item()

        loss.backward()
         # Clip gradients to a maximum norm

        optimizer.step()
        scheduler.step()

        if step % 100 == 0 and step != 0:
            avg_loss = total_loss / 100
            print(f"Epoch [{epoch + 1}/{num_epochs}] | Step [{step}/{len(train_loader)}] | Loss: {avg_loss}")
            total_loss = 0

The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.
  8%|▊         | 101/1250 [02:40<29:15,  1.53s/it]

Epoch [1/3] | Step [100/1250] | Loss: 10.953024950027466


 16%|█▌        | 201/1250 [05:13<26:52,  1.54s/it]

Epoch [1/3] | Step [200/1250] | Loss: 10.525006980895997


 24%|██▍       | 301/1250 [07:47<24:14,  1.53s/it]

Epoch [1/3] | Step [300/1250] | Loss: 9.19089551448822


 32%|███▏      | 401/1250 [10:21<21:46,  1.54s/it]

Epoch [1/3] | Step [400/1250] | Loss: 5.88734622001648


 40%|████      | 501/1250 [12:54<19:09,  1.53s/it]

Epoch [1/3] | Step [500/1250] | Loss: 2.957784767150879


 48%|████▊     | 601/1250 [15:28<16:32,  1.53s/it]

Epoch [1/3] | Step [600/1250] | Loss: 1.951450914144516


 56%|█████▌    | 701/1250 [18:01<14:01,  1.53s/it]

Epoch [1/3] | Step [700/1250] | Loss: 1.6958054202795028


 64%|██████▍   | 801/1250 [20:34<11:28,  1.53s/it]

Epoch [1/3] | Step [800/1250] | Loss: 1.7086118483543395


 72%|███████▏  | 901/1250 [23:07<08:54,  1.53s/it]

Epoch [1/3] | Step [900/1250] | Loss: 1.5450992381572723


 80%|████████  | 1001/1250 [25:41<06:18,  1.52s/it]

Epoch [1/3] | Step [1000/1250] | Loss: 1.5401735973358155


 88%|████████▊ | 1101/1250 [28:14<03:48,  1.53s/it]

Epoch [1/3] | Step [1100/1250] | Loss: 1.4998229020833969


 96%|█████████▌| 1201/1250 [30:46<01:14,  1.53s/it]

Epoch [1/3] | Step [1200/1250] | Loss: 1.4240305602550507


100%|██████████| 1250/1250 [32:02<00:00,  1.54s/it]
  8%|▊         | 101/1250 [02:34<29:22,  1.53s/it]

Epoch [2/3] | Step [100/1250] | Loss: 1.4370363336801528


 16%|█▌        | 201/1250 [05:08<26:56,  1.54s/it]

Epoch [2/3] | Step [200/1250] | Loss: 1.4146601235866547


 24%|██▍       | 301/1250 [07:42<24:12,  1.53s/it]

Epoch [2/3] | Step [300/1250] | Loss: 1.4005113762617112


 32%|███▏      | 401/1250 [10:16<21:46,  1.54s/it]

Epoch [2/3] | Step [400/1250] | Loss: 1.405314999818802


 40%|████      | 501/1250 [12:49<19:08,  1.53s/it]

Epoch [2/3] | Step [500/1250] | Loss: 1.3380072742700577


 48%|████▊     | 601/1250 [15:23<16:39,  1.54s/it]

Epoch [2/3] | Step [600/1250] | Loss: 1.3152976667881011


 56%|█████▌    | 701/1250 [17:57<14:03,  1.54s/it]

Epoch [2/3] | Step [700/1250] | Loss: 1.3553737181425094


 64%|██████▍   | 801/1250 [20:31<11:29,  1.54s/it]

Epoch [2/3] | Step [800/1250] | Loss: 1.331892101764679


 72%|███████▏  | 901/1250 [23:04<08:56,  1.54s/it]

Epoch [2/3] | Step [900/1250] | Loss: 1.2565658622980118


 80%|████████  | 1001/1250 [25:38<06:19,  1.53s/it]

Epoch [2/3] | Step [1000/1250] | Loss: 1.3252622228860855


 88%|████████▊ | 1101/1250 [28:11<03:48,  1.53s/it]

Epoch [2/3] | Step [1100/1250] | Loss: 1.2864363080263137


 96%|█████████▌| 1201/1250 [30:45<01:15,  1.53s/it]

Epoch [2/3] | Step [1200/1250] | Loss: 1.3207723313570023


100%|██████████| 1250/1250 [32:00<00:00,  1.54s/it]
  8%|▊         | 101/1250 [02:35<29:22,  1.53s/it]

Epoch [3/3] | Step [100/1250] | Loss: 1.2799880522489548


 16%|█▌        | 201/1250 [05:08<26:49,  1.53s/it]

Epoch [3/3] | Step [200/1250] | Loss: 1.2490106439590454


 24%|██▍       | 301/1250 [07:42<24:15,  1.53s/it]

Epoch [3/3] | Step [300/1250] | Loss: 1.2783547431230544


 32%|███▏      | 401/1250 [10:15<21:34,  1.52s/it]

Epoch [3/3] | Step [400/1250] | Loss: 1.2426733309030533


 40%|████      | 501/1250 [12:49<19:12,  1.54s/it]

Epoch [3/3] | Step [500/1250] | Loss: 1.2812939512729644


 48%|████▊     | 601/1250 [15:23<16:37,  1.54s/it]

Epoch [3/3] | Step [600/1250] | Loss: 1.2192412668466568


 56%|█████▌    | 701/1250 [17:56<14:02,  1.53s/it]

Epoch [3/3] | Step [700/1250] | Loss: 1.236958818435669


 64%|██████▍   | 801/1250 [20:30<11:24,  1.52s/it]

Epoch [3/3] | Step [800/1250] | Loss: 1.2334161400794983


 72%|███████▏  | 901/1250 [23:03<09:04,  1.56s/it]

Epoch [3/3] | Step [900/1250] | Loss: 1.267915650010109


 80%|████████  | 1001/1250 [25:37<06:20,  1.53s/it]

Epoch [3/3] | Step [1000/1250] | Loss: 1.2442920970916749


 88%|████████▊ | 1101/1250 [28:10<03:49,  1.54s/it]

Epoch [3/3] | Step [1100/1250] | Loss: 1.2756521040201187


 96%|█████████▌| 1201/1250 [30:44<01:14,  1.53s/it]

Epoch [3/3] | Step [1200/1250] | Loss: 1.2416263961791991


100%|██████████| 1250/1250 [31:59<00:00,  1.54s/it]


# Inference

In [55]:
nltk.download('punkt')  # Download 'punkt' tokenizer models if not already downloaded

[nltk_data] Downloading package punkt to /home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
test_df = pd.read_csv("audio_video_image_embeds_finale.csv")
test_df.columns

Index(['Unnamed: 0', 'index', 'id', 'date', 'likes', 'content', 'username',
       'media', 'inferred company', 'single_media', 'caption', 'audio',
       'image_embeds', 'video_embeds', 'audio_embeds'],
      dtype='object')

In [57]:
test_df = test_df[:100] # Quick inference

In [59]:
test_df['audio_embeds'] = test_df['audio_embeds'].apply(convert_to_array)
test_df['image_embeds'] = test_df['image_embeds'].apply(convert_to_array)
test_df['video_embeds'] = test_df['video_embeds'].apply(convert_to_array)

In [60]:
test_df["audio_embeds"].fillna('nan', inplace=True)
test_df["image_embeds"].fillna('nan', inplace=True)
test_df["video_embeds"].fillna('nan', inplace=True)

In [61]:
# Evaluation hyperparameters
batch_size = 4
test_dataset= content(test_df)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True, num_workers=0)

optimizer = AdamW(model_trial.parameters(), lr=1e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(test_loader) * 3)

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_trial.to(device)



PeftModel(
  (base_model): LoraModel(
    (model): Custom_Model(
      (layer): lora.Linear(
        (base_layer): Linear(in_features=768, out_features=10240, bias=True)
        (lora_dropout): ModuleDict(
          (default): Dropout(p=0.05, inplace=False)
        )
        (lora_A): ModuleDict(
          (default): Linear(in_features=768, out_features=16, bias=False)
        )
        (lora_B): ModuleDict(
          (default): Linear(in_features=16, out_features=10240, bias=False)
        )
        (lora_embedding_A): ParameterDict()
        (lora_embedding_B): ParameterDict()
      )
      (conv_layer1): Conv1d(256, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer2): Conv1d(261, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer3): Conv1d(266, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv_layer4): Conv1d(271, 271, kernel_size=(3,), stride=(1,), padding=(1,))
      (model): FalconForCausalLM(
        (transformer): FalconModel(
    

In [62]:
references_eval = test_df["content"].to_list()

In [64]:
def calculate_bleu(references, hypotheses):
    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score
    
def evaluate_model(model_trial, tokenizer, test_loader):
    model_trial.eval()
    all_preds = []  # List to store model-generated sequences for calculating BLEU

    with torch.no_grad():
        # Wrap the evaluation loop with tqdm for progress visualization
        for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
            
            image_input = batch[0][0].to(device)
            video_input = batch[0][1].to(device)
            audio_input = batch[0][2].to(device)
            
            inputs = batch[0][3].to(device)
            labels = batch[1].to(device)
    
            output = model_trial(image_input, video_input, audio_input, inputs, labels, batch_size)
            
            logits = output.logits

            ############################################### argmax #################################################
            # Convert logits to token IDs
            predicted_token_ids = torch.argmax(logits, dim=-1)

            # Decode batch using the tokenizer
            generated_sequences = tokenizer.batch_decode(predicted_token_ids.tolist(),skip_special_tokens=True)

            # Store model-generated sequences for BLEU calculation
            all_preds.extend(generated_sequences)
            ######################################################################################################
            ############################################### topk #################################################

    # Calculate BLEU score
    bleu = calculate_bleu(references_eval, all_preds)
    print(f"BLEU Score on Evaluation Data: {bleu}")

    return all_preds, bleu


evaluate_model(model_trial, tokenizer, test_loader)


Evaluating: 100%|██████████| 25/25 [00:09<00:00,  2.77batch/s]

BLEU Score on Evaluation Data: 1.499473553012327e-231



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


([' ',
  '',
  ' ',
  ' ',
  '.',
  ' ',
  '',
  '',
  '',
  ' ',
  '',
  '',
  ',\n\n\nThe.',
  ' ',
  '',
  '',
  '',
  '',
  '',
  '.\n\nThe.. ',
  ' ',
  '',
  '',
  ' ',
  ' ',
  ' \n\n The..',
  '.\n\n The. ',
  ' \n\nThe.',
  '',
  ' ',
  ' \n\nThe\n.',
  '  ',
  '',
  ' \n\n\nThe.. ',
  ' \n\n.The.. ',
  '',
  '',
  '',
  ' ',
  '',
  '',
  '',
  'The. ',
  '',
  '',
  '',
  ' \n\n\nThe.',
  '\n\nThe.\n,',
  ' ',
  '',
  ' ',
  '\nThe.',
  ' ',
  '',
  '',
  '',
  ' \n\n\n ',
  '',
  '.',
  '.\nThe.  the the',
  '.\n\n\nTheThe,',
  '\n',
  '',
  '',
  ' ',
  '',
  '',
  ' ',
  '',
  '',
  '',
  '',
  ' \n\n\nThe.',
  '.',
  '',
  'The , of',
  '',
  ' \n\n\nThe.',
  '',
  '',
  ' ',
  ' \n\n\nThe, ',
  '  ',
  ' ',
  ' ',
  '',
  '.\n\n\nThe, ',
  ' \n\n\n\n\nThe',
  ' ',
  ' ',
  '\n,The. ',
  '',
  ' \n\n\nThe.',
  '',
  '\n\nThe.',
  '',
  '  ',
  '..\nThe.',
  '   ',
  ' \n.The.'],
 1.499473553012327e-231)

In [None]:

nltk.download('punkt')  # Download 'punkt' tokenizer models if not already downloaded

references_eval = train_df["content"].tolist()  # List of reference sequences for evaluation
prediction_eval = []  # List of model-generated sequences for evaluation


def calculate_bleu(references, hypotheses):
    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score

def evaluate_model(model, tokenizer, eval_loader):
    model.eval()
    all_preds = []  # List to store model-generated sequences for calculating BLEU

    with torch.no_grad():
        # Wrap the evaluation loop with tqdm for progress visualization
        for batch in tqdm(eval_loader, desc="Evaluating", unit="batch"):

            image_input = batch[0].to(device)
            video_input = batch[1].to(device)
            audio_input = batch[2].to(device)

            inputs = batch[3].to(device)
            labels = batch[4].to(device)

            output = model_trial(image_input, video_input, audio_input, inputs, labels, batch_size)

            # Convert token IDs to text (assuming using a tokenizer)
            generated_sequences = tokenizer.batch_decode(output.logits, skip_special_tokens=True)

            # Store model-generated sequences for BLEU calculation
            all_preds.extend(generated_sequences)

    # Calculate BLEU score
    bleu = calculate_bleu(references_eval, all_preds)
    print(f"BLEU Score on Evaluation Data: {bleu}")


# Usage example:
# Call evaluate_model function passing your trained model and evaluation data loader

test_dataset = content(train_df[:20])
eval_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=2)

evaluate_model(model, tokenizer, eval_loader)
