In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset CSV and the
# save/load paths used further down in this notebook all live under this mount point.
from google.colab import output
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Installing the necessary libraries
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install accelerate
!pip install sentencepiece

In [None]:
import pandas as pd
import numpy as np
import torch
import datetime
import calendar
import evaluate
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from torchtext.data.metrics import bleu_score

In [None]:
def findDay(date):
    """Return the weekday name for a date string.

    `date` must hold day, month and year separated by single spaces
    (format '%d %m %Y', e.g. "04 07 2021"); `strptime` raises ValueError
    for anything else. The name is looked up in `calendar.day_name`.
    """
    parsed = datetime.datetime.strptime(date, '%d %m %Y')
    weekday_index = parsed.weekday()  # 0 = Monday ... 6 = Sunday
    return calendar.day_name[weekday_index]

In [None]:
# Tokenizing the data
def tokenize_data(prompt, label, tokenizer, max_length=256):
    """Tokenize a prompt and its target text with identical settings.

    Both texts are passed to `tokenizer` with truncation enabled, padding to
    `max_length`, and "pt" tensors requested.

    Returns:
        (inputs, labels): the tokenizer output for `prompt` and for `label`.
    """
    encode_kwargs = dict(
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    encoded_prompt = tokenizer(prompt, **encode_kwargs)
    encoded_label = tokenizer(label, **encode_kwargs)
    return encoded_prompt, encoded_label

In [None]:
# Checkpoint shared by the tokenizer and the model.
model_checkpoint = "google/flan-t5-xl"

tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
# NOTE(review): the weights are loaded as float16 and are later optimized directly
# (no gradient scaler is visible in this notebook). Pure-fp16 training is commonly
# unstable for T5-family models - confirm whether bfloat16/float32 should be used.
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [None]:
class content(Dataset):
    """Dataset that turns each dataframe row into a tokenized (prompt, label) pair."""

    def __init__(self, df):
        # Rows are fetched with label-based `.loc[idx, column]` lookups.
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Build the prompt from the row's metadata and tokenize it with the target text.

        Returns the pair produced by `tokenize_data` for (prompt, tweet content).
        """
        def field(column):
            return self.df.loc[idx, column]

        img_cap = field("caption")
        user = field("username")
        company = field("inferred company")

        # The "date" cell is "<YYYY-MM-DD> <time>"; keep both halves for the prompt.
        timestamp_parts = field("date").split(" ")
        date, time = timestamp_parts[0], timestamp_parts[1]
        likes = field("likes")

        # findDay expects "day month year", so reorder the year-month-day pieces.
        date_parts = date.split("-")
        day = findDay(f"{date_parts[2]} {date_parts[1]} {date_parts[0]}")

        prompt = f"predict the tweet content for the following tweet :\n{user} posted this tweet with an image which can be desribed as {img_cap} on {day}, {date} at {time} and has got {likes} likes. The inferred company is {company}.\n\ntweet content :\n"

        label = field("content")

        encoded_prompt, encoded_label = tokenize_data(prompt, label, tokenizer)
        return encoded_prompt, encoded_label


In [None]:
# Load the captioned training data and drop rows that have no image caption.
df = pd.read_csv("/content/drive/MyDrive/Adobe/Training_Data _new_imgcap_1-52k.csv")
df_cleaned = df.dropna(subset=['caption']).reset_index(drop=True)

# Hold out 10% of the rows for evaluation (fixed seed for a repeatable split).
train_df, test_df = train_test_split(df_cleaned, test_size=0.1, random_state=42)

# The `content` dataset reads rows with `.loc[idx, ...]`, so each split needs a fresh
# 0..n-1 index. Reassigning (rather than `inplace=True`) avoids mutating frames that
# were derived from another frame, which can trigger SettingWithCopyWarning.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df.head()

In [None]:
train_dataset = content(train_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)

# Number of passes over the training data; the scheduler length is derived from it
# instead of repeating the literal 3.
num_epochs = 3

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=len(train_loader) * num_epochs,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A model loaded with `device_map=...` has already been placed on its device(s) and
# exposes `hf_device_map`; moving it again with `.to()` is unnecessary and can be
# rejected for dispatched models. Only move models that were not dispatched.
if getattr(model, "hf_device_map", None) is None:
    model.to(device)

In [None]:
num_epochs = 3
log_every = 100  # report the mean loss once every `log_every` steps

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    steps_in_window = 0  # number of losses accumulated since the last report

    for step, batch in enumerate(train_loader):
        # Each dataset item is tokenized with return_tensors="pt", so the collated
        # tensors are expected to be (batch, 1, seq_len). Drop only that middle axis:
        # a bare .squeeze() would also drop the batch axis whenever a batch holds a
        # single example (e.g. a short final batch).
        input_ids = batch[0]['input_ids'].squeeze(1).to(device)
        attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)
        labels = batch[1]['input_ids'].squeeze(1).to(device)

        # Padding positions in the target are set to -100 so the loss ignores them.
        labels[labels == tokenizer.pad_token_id] = -100

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        steps_in_window += 1

        loss.backward()
        optimizer.step()
        scheduler.step()

        if step % log_every == 0 and step != 0:
            # Divide by the number of losses actually accumulated; the first window
            # covers steps 0..log_every, i.e. one more step than later windows.
            avg_loss = total_loss / steps_in_window
            print(f"Epoch [{epoch + 1}/{num_epochs}] | Step [{step}/{len(train_loader)}] | Loss: {avg_loss}")
            total_loss = 0.0
            steps_in_window = 0

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_model")
tokenizer.save_pretrained("/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer")

In [None]:
# Persist the fine-tuned weights and the tokenizer files to Google Drive.
model_dir = "/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_model"
tokenizer_dir = "/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

('/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer/spiece.model',
 '/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer/added_tokens.json')

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Locations the fine-tuned tokenizer and model were saved to.
finetuned_tokenizer_path = "/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_tokenizer"
finetuned_model_path = "/content/drive/MyDrive/Adobe/flan_t5_fine_tuned_model"

# Reload both artifacts from disk and place the model on the chosen device.
trained_tokenizer = T5Tokenizer.from_pretrained(finetuned_tokenizer_path)
trained_model = T5ForConditionalGeneration.from_pretrained(finetuned_model_path)
trained_model.to(device)

# Reference tweets of the held-out split, plus an (initially empty) prediction list.
groundtruth_eval = test_df["content"].tolist()
preds_eval = []

# BLEU calculation function
def calculate_bleu(ground_truth, preds):
    """Corpus-level BLEU of the generated texts against their references.

    Args:
        ground_truth: reference strings, one per prediction and in the same order.
        preds: generated strings.

    Returns:
        The value returned by torchtext's `bleu_score`.
    """
    # `bleu_score` takes the candidates first and the references second. Each
    # candidate must be a list of tokens and each references entry a list of token
    # lists (several references per candidate are allowed). Passing raw strings in
    # the opposite order does not measure word overlap at all.
    candidate_corpus = [str(pred).split() for pred in preds]
    references_corpus = [[str(ref).split()] for ref in ground_truth]
    return bleu_score(candidate_corpus, references_corpus)

# Evaluation loop
def evaluate_model(model, tokenizer, eval_loader, references=None):
    """Generate a prediction for every batch in `eval_loader` and score them with BLEU.

    Args:
        model: model exposing `eval()` and `generate()`.
        tokenizer: tokenizer used to decode the generated token ids.
        eval_loader: iterable of batches; `batch[0]` holds the tokenized prompts
            under 'input_ids' and 'attention_mask'. It must not be shuffled so the
            predictions stay aligned with `references`.
        references: reference strings aligned with the loader order. Defaults to
            `test_df["content"]` (the previous, implicit behaviour).

    Returns:
        (all_preds, bleu): the decoded predictions and their BLEU score.
    """
    model.eval()
    all_preds = []

    with torch.no_grad():
        for batch in eval_loader:
            # Collated tensors are expected to be (batch, 1, seq_len); drop only the
            # middle axis so a batch holding a single example keeps its batch dimension.
            input_ids = batch[0]['input_ids'].squeeze(1).to(device)
            attention_mask = batch[0]['attention_mask'].squeeze(1).to(device)

            # The prompts are padded to a fixed length, so the mask is passed along
            # to keep the padding positions from being attended to during generation.
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

            # Converting token IDs to text
            generated_sequences = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # Store model-generated sequences for BLEU calculation
            all_preds.extend(generated_sequences)

    if references is None:
        references = test_df["content"].tolist()
    bleu = calculate_bleu(references, all_preds)
    print(f"BLEU Score on Evaluation Data: {bleu}")

    return all_preds, bleu


test_dataset = content(test_df)
# shuffle=False keeps the predictions in the same order as the rows of test_df.
eval_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)

# The score is bound to `bleu_value`: assigning it to `bleu_score` would overwrite the
# imported torchtext function of that name and break any re-run of the evaluation.
generated_outputs, bleu_value = evaluate_model(trained_model, trained_tokenizer, eval_loader)

# Show only a handful of predictions; the full list stays in `generated_outputs`.
print("Generated Outputs:", generated_outputs[:5])
print("BLEU Score:", bleu_value)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


BLEU Score on Evaluation Data: 0.0
Generated Outputs: ['mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention>', "The #MotoFone is a new #motogyma. It's the first #motogymaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "'It's not a day to go.' hyperlink> hyperlink>", 'mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> mention> m