# Installation and Imports

In [None]:
!pip install -q datasets bitsandbytes sentencepiece
!pip install -q accelerate loralib peft

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset, DatasetDict
from PIL import Image
from IPython.display import display

from transformers import (
    IdeficsForVisionText2Text,
    AutoProcessor,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    AutoTokenizer
)

import torch
import torchvision.transforms as transforms
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the IDEFICS model with QLoRA

In [None]:
!huggingface-cli login

In [None]:
## Load in IDEFICS with 4bit quantization
checkpoint = "HuggingFaceM4/idefics-9b"

# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
# Here we skip some special modules that can't be quantized properly
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)

## Load the IDEFICS processor, which encodes images and tokenizes text
# NOTE(review): newer transformers releases prefer `token=` over
# `use_auth_token=` -- confirm against the installed version.
processor = AutoProcessor.from_pretrained(checkpoint, use_auth_token=True)

## Download the model from HF
# device_map={"": 0} maps the root module (i.e. the whole model) to device 0.
model = IdeficsForVisionText2Text.from_pretrained(
    checkpoint,
    quantization_config=quant_config,
    device_map={"": 0}
)

In [None]:
## Helper that runs generation for a single prompt and returns the decoded text

def check_inference(model, processor, prompts, max_new_tokens=50):
    """Generate a completion for `prompts` and return it as a string.

    Args:
        model: Model exposing `generate(...)`.
        processor: Processor exposing `tokenizer`, `__call__` and `batch_decode`.
        prompts: Prompt passed straight to `processor` (images and text).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    tok = processor.tokenizer

    ## Token ids of the IDEFICS image tags; generation is forbidden from emitting them
    banned_ids = tok(
        ["<image>", "<fake_token_around_image>"],
        add_special_tokens=False,
    ).input_ids

    ## Generation stops at the end-of-sequence token
    stop_id = tok.convert_tokens_to_ids("</s>")

    ## Encode the prompt, generate, and decode the first sequence
    encoded = processor(prompts, return_tensors="pt").to(device)
    output_ids = model.generate(
        **encoded,
        eos_token_id=[stop_id],
        bad_words_ids=banned_ids,
        max_new_tokens=max_new_tokens,
        early_stopping=True,
    )
    return processor.batch_decode(output_ids, skip_special_tokens=True)[0]

# Download the sports classification dataset

In [None]:
## Upload your kaggle.json credentials file. You can find this with the following steps:
## 1) Create a Kaggle account
## 2) Go to your Kaggle settings page.
## 3) Scroll down to the API section, and "Create New Token". This will download a json credentials file.
## 4) Upload the .json to the Colab notebook in /content/

## Make sure the Kaggle config folder exists, creating it if necessary.
## `~` has to be expanded explicitly: os.path treats it as a literal character,
## so a check against the raw '~/.kaggle/' string can never match.
## exist_ok=True makes this cell safe to re-run.
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

In [None]:
## Copy your cred file into the config folder and configure access.
%cd /content/
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
## If the data has not been downloaded, retrieve from kaggle
!kaggle datasets download -d gpiosenka/sports-classification
!unzip -q sports-classification.zip -d sports-classification

## Rename class folders so that no sport name contains a space
## (spaces are replaced by underscores).
for folder in ('train', 'valid', 'test'):
    path = f'./sports-classification/{folder}'
    for sport in os.listdir(path):
        if ' ' in sport:
            # os.rename takes the paths as-is, so there is no shell command to
            # build and no escaping of spaces or other shell metacharacters.
            os.rename(
                os.path.join(path, sport),
                os.path.join(path, sport.replace(' ', '_')),
            )

# Format as Huggingface dataset

In [None]:
## Turn a batch of (file, label) rows into model inputs: every image is paired
## with a question/answer prompt that names the sport it shows.
def process_data(batch):
    """Build processor inputs (with labels) for a batch of image files.

    Args:
        batch: Mapping with a 'files' sequence of image paths and a 'labels'
            sequence of sport names (underscores are shown as spaces).

    Returns:
        The processor output moved to `device`, with `labels` set to `input_ids`.
    """
    prompts = []
    for path, label in zip(batch['files'], batch['labels']):
        sport = label.replace('_',' ')
        picture = Image.open(path)
        prompts.append([picture, f"Question: What sport is in this image? Answer: {sport}. "])

    encoded = processor(prompts, return_tensors="pt").to(device)
    # The training target is the prompt itself.
    encoded["labels"] = encoded["input_ids"]
    return encoded

## Routine to collect needed info about the dataset into a df.
def create_dataset(folder):
    """Build a DataFrame of image paths and labels from a class-per-folder tree.

    Args:
        folder: Directory whose immediate sub-directories are named after a
            class and contain that class's files. A trailing slash is optional.

    Returns:
        A pandas DataFrame with a 'files' column (path of each file) and a
        'labels' column (name of the sub-directory holding it). Rows are
        ordered by class name, then file name, so the result does not depend
        on the order in which the filesystem lists entries.
    """
    files, labels = [], []
    for label in sorted(os.listdir(folder)):
        class_dir = os.path.join(folder, label)
        # Skip stray files sitting next to the class folders.
        if not os.path.isdir(class_dir):
            continue
        for name in sorted(os.listdir(class_dir)):
            files.append(os.path.join(class_dir, name))
            labels.append(label)
    return pd.DataFrame({'files': files, 'labels': labels})


## Load and prepare the dataset
train_df = create_dataset('./sports-classification/train/')
valid_df = create_dataset('./sports-classification/valid/')
test_df = create_dataset('./sports-classification/test/')

## Subsample of training data, include 50 examples of each sport
all_sports = train_df.labels.drop_duplicates()
out_indices = []
for sport in all_sports:
    indices = train_df[train_df.labels==sport].sample(50,random_state=7).index.tolist()
    out_indices.extend(indices)
## `out_indices` holds index labels (taken from `.index`), so rows are selected
## with label-based .loc; the selection is then shuffled with a fixed seed.
train_df = train_df.loc[out_indices].sample(frac=1,random_state=13).copy()

## Create HF dataset and apply transformation
sports_dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'valid': Dataset.from_pandas(valid_df),
    'test': Dataset.from_pandas(test_df),
})

## process_data is applied when examples are accessed
sports_dataset['train'].set_transform(process_data)
sports_dataset['valid'].set_transform(process_data)
sports_dataset['test'].set_transform(process_data)

In [None]:
sports_dataset

# Sports classification

## Zero-shot eval

In [None]:
## Zero-shot sanity check on one cricket image from the test split
image = Image.open('./sports-classification/test/cricket/2.jpg')
display(image)
prompts = [image, "Question: What sport is in this image? Answer:"]
generated = check_inference(model, processor, prompts, max_new_tokens=10)
print(generated)

In [None]:
## Zero-shot sanity check on one tug-of-war image from the test split
image = Image.open('/content/sports-classification/test/tug_of_war/4.jpg')
display(image)
prompts = [image, "Question: What sport is in this image? Answer:"]
generated = check_inference(model, processor, prompts, max_new_tokens=10)
## Break the echoed question over two lines when printing
print(generated.replace('is in this','is in\nthis'))

In [None]:
## Run the same zero-shot prompt over every test image and keep the raw generations.
out_labels = []
for image_path, _label in tqdm(test_df.to_numpy()):
    image = Image.open(image_path)
    prompts = [image, "Question: What sport is in this image? Answer:"]
    out_labels.append(check_inference(model, processor, prompts, max_new_tokens=10))

In [None]:
test_df['predictions'] = out_labels

In [None]:
test_df.to_csv('./test_set_zero_shot_preds.csv',index=False)

## Fine-tune with QLoRA

In [None]:
## Part of the checkpoint id after the "/", reused to name the output directory
model_name = checkpoint.split("/")[1]
## LoRA adapters (rank 16, alpha 32, dropout 0.05) on the modules named
## q_proj / k_proj / v_proj; bias terms are not trained.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
## Wrap the loaded model with the adapters; `model` now refers to the PEFT model
model = get_peft_model(model, config)

model.print_trainable_parameters()

In [None]:
## Training configuration for the sports classification fine-tune.
## Checkpoints, evaluation and logging all happen every 10 steps; save and eval
## use the same schedule, which load_best_model_at_end relies on.
training_args = TrainingArguments(
    output_dir=f"{model_name}-sports",
    learning_rate=2e-4,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 examples per device x 8 accumulated steps per optimizer update
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    num_train_epochs=1,
    # process_data builds the model inputs itself, so the Trainer must not
    # drop the raw 'files' / 'labels' columns before the transform runs.
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
    # The string "none" disables all logging integrations; passing None
    # falls back to the library default instead of turning reporting off.
    report_to="none",
    optim="paged_adamw_8bit",
)

In [None]:
## Train on the subsampled 'train' split and evaluate on the 'valid' split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sports_dataset['train'],
    eval_dataset=sports_dataset['valid'],
)

In [None]:
trainer.train()

In [None]:
test_df = test_df[['files','labels']]

In [None]:
## Predict on the full test set with the fine-tuned model (trainer.model),
## using the same prompt as the zero-shot run.
ft_out_labels = []
for image_path, _label in tqdm(test_df.to_numpy()):
    image = Image.open(image_path)
    prompts = [image, "Question: What sport is in this image? Answer:"]
    ft_out_labels.append(check_inference(trainer.model, processor, prompts, max_new_tokens=15))

In [None]:
test_df['predictions'] = ft_out_labels

In [None]:
test_df.to_csv('./test_set_qlora_preds.csv',index=False)

## Grade the predictions

In [None]:
## Read in the results
df1 = pd.read_csv('./test_set_zero_shot_preds.csv')
df2 = pd.read_csv('./test_set_qlora_preds.csv')

In [None]:
## Normalize the answers and compare against the dataset label.
def norm_preds(p):
    """Extract the normalised answer from a generated prediction.

    Args:
        p: Raw generated text, expected to contain 'image? Answer: '. Values
            that are not strings (e.g. NaN from an empty CSV cell) are
            accepted and treated as "no answer".

    Returns:
        The lower-cased text between the first 'image? Answer: ' marker and
        the next marker or first '.', whichever comes first; '' when the
        marker is missing or `p` is not a string.
    """
    # Explicit checks instead of a bare `except`, so unrelated errors are
    # not silently swallowed.
    if not isinstance(p, str):
        return ''
    parts = p.split('image? Answer: ')
    if len(parts) < 2:
        return ''
    return parts[1].split('.')[0].lower()

## For both result tables: normalise the generated answers, and show the
## labels with spaces instead of underscores so the two can be compared.
for results in (df1, df2):
    results['preds_norm'] = results.predictions.apply(norm_preds)
    results['labels'] = results.labels.apply(lambda x: x.replace('_',' '))

In [None]:
## How many labels were correctly predicted with zero-shot
## (the denominator is the actual number of graded rows, not a fixed 500)
print(sum(df1.labels == df1.preds_norm), f' / {len(df1)}')

## NOTE: loading with just LoRA instead of QLoRA increases score by 8.

In [None]:
## How many labels were correctly predicted after fine-tuning
## (the denominator is the actual number of graded rows, not a fixed 500)
print(sum(df2.labels == df2.preds_norm), f' / {len(df2)}')

# Image captioning

**NOTE**: You must reload the IDEFICS model before proceeding in this section, if you have fine-tuned the loaded model in the above image classification task. Please restart the kernel, rerun the code in sections "Installation and Imports" and "Load the IDEFICS model with QLoRA", and then proceed.

## Zero-shot sports captions

In [None]:
## Zero-shot caption for a single football image
image = Image.open('/content/sports-classification/train/football/029.jpg')
display(image)
prompts = [image, "Question: What is a caption for this photo? Answer:"]
caption_text = check_inference(model, processor, prompts, max_new_tokens=40)
print(caption_text)

## In-context sports captions

In [None]:
## Few-shot prompt: four captioned example images, then the target image whose
## caption is left empty for the model to complete.
image_dict = {
    './sports-classification/train/axe_throwing/005.jpg':'A man prepares to throw an ax at a target.</s>\n',
    './sports-classification/train/bowling/002.jpg':'A woman rolls a bowling ball down a bowling alley.</s>\n',
    './sports-classification/train/hurdles/014.jpg':'Several competitors jump over hurdles during a race.</s>\n',
    './sports-classification/train/rugby/003.jpg': 'A man in a red jersey tackles a man carrying a rugby ball in a blue jersey.</s>\n',
    './sports-classification/train/football/029.jpg': ''
}

## Interleave each image with its question/answer text, in dictionary order
prompts = []
for image_path, caption in image_dict.items():
    image = Image.open(image_path)
    prompts += [image, f"Question: What is a caption for this photo? Answer: {caption}"]

print(check_inference(model, processor, prompts, max_new_tokens=15))
## Show the last image opened (the target image) as the cell output
image

## Fine-tune with QLoRA

In [None]:
## Download the sports subset of the Flickr30k dataset.

%cd /content/
!git clone https://github.com/ShinThant3010/Captioning-on-Sport-Images.git

%cd /content/Captioning-on-Sport-Images/
!unzip Training\ Images.zip
!unzip Testing\ Images.zip

In [None]:
## Format as a Huggingface dataset for fine-tuning

def process_data(batch):
    """Build processor inputs (with labels) for a batch of captioned images.

    Args:
        batch: Mapping with an 'image_name' sequence of image paths and a
            'comment' sequence holding the caption for each image.

    Returns:
        The processor output moved to `device`, with `labels` set to `input_ids`.
    """
    prompts = []
    for path, caption in zip(batch['image_name'], batch['comment']):
        picture = Image.open(path)
        prompts.append([picture, f"Question: What is a caption for this photo? Answer: {caption} "])

    encoded = processor(prompts, return_tensors="pt").to(device)
    # The training target is the prompt itself.
    encoded["labels"] = encoded["input_ids"]
    return encoded

## Read and format the data
## The captions file is pipe-delimited. Image names are turned into full paths
## under "Training Images/", then the three columns are renamed.
train_df = pd.read_csv('/content/Captioning-on-Sport-Images/Overall_Training_Captions_csv.csv',delimiter='|')
train_df['image_name'] = '/content/Captioning-on-Sport-Images/Training Images/'+train_df['image_name']
train_df.columns = ['image_name', 'comment_number', 'comment']

## Hold out 5% of the *unique images* for validation, so that every caption of
## a given image lands in the same split.
train_split, valid_split = train_test_split(train_df[['image_name']].drop_duplicates(),test_size=0.05,random_state=42)
valid_df = train_df[train_df.image_name.isin(valid_split.image_name)]
train_df = train_df[train_df.image_name.isin(train_split.image_name)]

## NOTE(review): this reads the *training* captions file again but prefixes the
## names with "Testing Images/", so the resulting paths presumably do not
## exist. A separate testing-captions CSV is probably intended -- confirm the
## file name in the cloned repository. The 'test' split is not used by the
## training cells below.
test_df = pd.read_csv('/content/Captioning-on-Sport-Images/Overall_Training_Captions_csv.csv',delimiter='|')
test_df['image_name'] = '/content/Captioning-on-Sport-Images/Testing Images/'+test_df['image_name']
test_df.columns = ['image_name', 'comment_number', 'comment']

## Create HF dataset and apply transformation
## Each split is shuffled with its own fixed seed before conversion.
captions_dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df.sample(frac=1,random_state=7)),
    'valid': Dataset.from_pandas(valid_df.sample(frac=1,random_state=13)),
    'test': Dataset.from_pandas(test_df.sample(frac=1,random_state=15)),
})

captions_dataset['train'].set_transform(process_data)
captions_dataset['valid'].set_transform(process_data)
captions_dataset['test'].set_transform(process_data)

In [None]:
captions_dataset

In [None]:
## Add the LoRA component to the model.
## NOTE: this cell does not reload IDEFICS itself; it wraps whatever `model`
## currently refers to. If `model` was already fine-tuned on the classification
## task, reload the base model first so the adapters are not stacked on it.

## Part of the checkpoint id after the "/", reused to name the output directory
model_name = checkpoint.split("/")[1]
## LoRA adapters (rank 16, alpha 32, dropout 0.05) on the modules named
## q_proj / k_proj / v_proj; bias terms are not trained.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, config)

model.print_trainable_parameters()

In [None]:
## Training configuration for the caption fine-tune.
## Checkpoints, evaluation and logging all happen every 20 steps; save and eval
## use the same schedule, which load_best_model_at_end relies on.
training_args = TrainingArguments(
    output_dir=f"{model_name}-captions_3e5",
    learning_rate=3e-5,
    fp16=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # 2 examples per device x 8 accumulated steps per optimizer update
    gradient_accumulation_steps=8,
    dataloader_pin_memory=False,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    logging_steps=20,
    # A positive max_steps takes precedence over num_train_epochs, so
    # training stops after 200 optimizer steps.
    max_steps=200,
    num_train_epochs=1,
    # process_data builds the model inputs itself, so the Trainer must not
    # drop the raw 'image_name' / 'comment' columns before the transform runs.
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
    # The string "none" disables all logging integrations; passing None
    # falls back to the library default instead of turning reporting off.
    report_to="none",
    optim="paged_adamw_8bit",
)

## Train on the caption 'train' split and evaluate on the held-out 'valid' split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=captions_dataset['train'],
    eval_dataset=captions_dataset['valid'],
)

In [None]:
trainer.train()

## Caption with fine-tuned model

In [None]:
## Caption a single football image with the fine-tuned model
image = Image.open('/content/sports-classification/train/football/029.jpg')
prompts = [image, "Question: What is a caption for this photo? Answer:"]
caption_text = check_inference(trainer.model, processor, prompts)
print(caption_text)
## Show the image as the cell output
image