In [17]:
import json
import requests
from PIL import Image
import pickle
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import AutoProcessor, AutoTokenizer
import torch

In [18]:
import os

# Turn off the tokenizers library's own parallelism for this process.
# NOTE(review): presumably set to avoid the fork-related warning once the
# DataLoader below starts worker processes - confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# 1 - GET INSTRUCT150K DATA

In [3]:
# Download the instruction dataset once; -nc (no-clobber) makes the cell safe to
# re-run: without it wget saves a second copy as llava_instruct_150k.json.1.
! wget -nc https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json

--2024-01-30 00:06:38--  https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json
Resolving huggingface.co (huggingface.co)... 108.156.211.95, 108.156.211.125, 108.156.211.90, ...
Connecting to huggingface.co (huggingface.co)|108.156.211.95|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/4d/41/4d41ea1e2709f0e68e9e361e4218192b9620c5a3f2cb8055bc625942b6cd3039/6b68bc5ca2bfd8a71119af0e8454929668ccda6a334955ccc95d114fc8d082fa?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llava_instruct_150k.json%3B+filename%3D%22llava_instruct_150k.json%22%3B&response-content-type=application%2Fjson&Expires=1706832398&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjgzMjM5OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy80ZC80MS80ZDQxZWExZTI3MDlmMGU2OGU5ZTM2MWU0MjE4MTkyYjk2MjBjNWEzZjJjYjgwNTViYzYyNTk0MmI2Y2QzMDM5L

In [19]:
# Load the instruct150k JSON file.
# The context manager closes the handle as soon as parsing is done (the bare
# open() left it open for the kernel's lifetime), and the explicit encoding
# keeps the read independent of the platform's default locale.
with open('llava_instruct_150k.json', encoding='utf-8') as f:
    # The top-level JSON value is a list of records; each record is a dict
    # with 'id', 'image' and 'conversations' keys.
    data = json.load(f)
data[0]

{'id': '000000033471',
 'image': '000000033471.jpg',
 'conversations': [{'from': 'human',
   'value': '<image>\nWhat are the colors of the bus in the image?'},
  {'from': 'gpt', 'value': 'The bus in the image is white and red.'},
  {'from': 'human',
   'value': 'What feature can be seen on the back of the bus?'},
  {'from': 'gpt', 'value': 'The back of the bus features an advertisement.'},
  {'from': 'human',
   'value': 'Is the bus driving down the street or pulled off to the side?'},
  {'from': 'gpt',
   'value': 'The bus is driving down the street, which is crowded with people and other vehicles.'}]}

# 2 - ASSEMBLE TRAINING DATA

In [20]:
# Flatten the dataset into (image_url, question, answer) tuples,
# one tuple per human -> gpt exchange.
data_instruct150_flatten = []

for a_idx, d in enumerate(data):
    image_url = 'http://images.cocodataset.org/train2017/' + d['image']
    turns = d['conversations']
    # Pair turn 0 with 1, 2 with 3, ... . zip() silently drops a trailing
    # unpaired turn; the previous next()-on-a-shared-iterator approach raised
    # StopIteration as soon as a conversation had an odd number of turns.
    for question, gpt_ans in zip(turns[::2], turns[1::2]):
        if len(gpt_ans['value']) > 200:  # filter out too long answers (length in characters)
            continue
        if question['from'] == 'human' and gpt_ans['from'] == 'gpt':
            # The '<image>' placeholder may sit before or after the question text.
            question_text = question['value'].replace('<image>\n', '').replace('\n<image>', '')
            data_instruct150_flatten.append((image_url, question_text, gpt_ans['value']))

    # Progress report every 10,000 records.
    if a_idx % 10000 == 0:
        print(f"{a_idx} processed")

0 processed
10000 processed
20000 processed
30000 processed
40000 processed
50000 processed
60000 processed
70000 processed
80000 processed
90000 processed
100000 processed
110000 processed
120000 processed
130000 processed
140000 processed
150000 processed


In [21]:
# Spot-check one flattened (image_url, question, answer) tuple.
data_instruct150_flatten[1]

('http://images.cocodataset.org/train2017/000000033471.jpg',
 'What feature can be seen on the back of the bus?',
 'The back of the bus features an advertisement.')

In [22]:
# add tokens
# Extend the phi-2 tokenizer with the two extra tokens this notebook relies on.
# NOTE: do not reorder the two add_* calls - ids are handed out in call order
# ([QA] appears as 50295 in the tokenised rows, [PAD] is reported as 50296 below).
phi_model_name  = "microsoft/phi-2"
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# checkpoint repository to run on load - confirm the source is trusted.
tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
tokenizer.add_tokens('[QA]')  # separator appended to every question
tokenizer.add_special_tokens({'pad_token':'[PAD]'}) # pad id is used by collate_fn when batching
# Display the pad / end-of-sequence tokens as a sanity check.
tokenizer.pad_token, tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('[PAD]', '<|endoftext|>')

In [23]:
# gpt like training dataset
# For every (question, answer) pair, slide a window of the question's token
# length over "question [QA] answer <eos>" and emit one (input, label) row per
# answer token, where label is input shifted right by one token.
# NOTE(review): the id arrays are written via str(); numpy abbreviates arrays
# above its print threshold, so this relies on questions staying short - confirm.
with open('train_token.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['img_url','input','label'])

    train_data_temp = []  # rows buffered in memory between writes
    r = 1                 # number of the next 100000-row chunk, used in the log line
    for image_url, question, answer in data_instruct150_flatten:
        image_q = question + ' [QA]'
        image_a = answer + tokenizer.eos_token

        # tokenise
        ques_token = tokenizer(image_q, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0)
        ans_token  = tokenizer(image_a, return_tensors="pt", return_attention_mask=False)['input_ids'].squeeze(0)

        context_length = len(ques_token)
        combo_q_a = torch.cat([ques_token, ans_token])

        for al in range(len(ans_token)):
            # input_ids / label_ids instead of `input`, which shadowed the builtin.
            input_ids = combo_q_a[al : al + context_length].numpy()
            label_ids = combo_q_a[al + 1 : al + context_length + 1].numpy()
            train_data_temp.append([image_url, input_ids, label_ids])
            if len(train_data_temp) >= 100000: # write to the file
                print(f"Writing to disk after {r * 100000} rows")
                r += 1
                writer.writerows(train_data_temp)
                train_data_temp = []

    # Flush the final partial chunk: previously every row buffered after the
    # last full 100000-row write was silently dropped.
    if train_data_temp:
        print(f"Writing final {len(train_data_temp)} rows to disk")
        writer.writerows(train_data_temp)
        train_data_temp = []

Writing to disk after 100000 rows
Writing to disk after 200000 rows
Writing to disk after 300000 rows
Writing to disk after 400000 rows
Writing to disk after 500000 rows
Writing to disk after 600000 rows
Writing to disk after 700000 rows
Writing to disk after 800000 rows
Writing to disk after 900000 rows
Writing to disk after 1000000 rows
Writing to disk after 1100000 rows
Writing to disk after 1200000 rows
Writing to disk after 1300000 rows
Writing to disk after 1400000 rows
Writing to disk after 1500000 rows
Writing to disk after 1600000 rows
Writing to disk after 1700000 rows
Writing to disk after 1800000 rows
Writing to disk after 1900000 rows
Writing to disk after 2000000 rows
Writing to disk after 2100000 rows
Writing to disk after 2200000 rows
Writing to disk after 2300000 rows
Writing to disk after 2400000 rows
Writing to disk after 2500000 rows
Writing to disk after 2600000 rows
Writing to disk after 2700000 rows
Writing to disk after 2800000 rows
Writing to disk after 2900000

# 3 - PYTORCH DATASET AND DATALOADER TESTING

In [24]:
import torch
from torch.utils.data import Dataset
from PIL import Image
from torch.utils.data import random_split, DataLoader
from transformers import AutoProcessor, AutoTokenizer
import pickle

In [25]:
# Read the generated training CSV back and preview its first ten rows.
df_data = pd.read_csv(filepath_or_buffer='train_token.csv')
df_data.head(n=10)

Unnamed: 0,img_url,input,label
0,http://images.cocodataset.org/train2017/000000...,[ 2061 389 262 7577 286 262 1323 2...,[ 389 262 7577 286 262 1323 287 2...
1,http://images.cocodataset.org/train2017/000000...,[ 389 262 7577 286 262 1323 287 2...,[ 262 7577 286 262 1323 287 262 29...
2,http://images.cocodataset.org/train2017/000000...,[ 262 7577 286 262 1323 287 262 29...,[ 7577 286 262 1323 287 262 2939 ...
3,http://images.cocodataset.org/train2017/000000...,[ 7577 286 262 1323 287 262 2939 ...,[ 286 262 1323 287 262 2939 30 2...
4,http://images.cocodataset.org/train2017/000000...,[ 286 262 1323 287 262 2939 30 2...,[ 262 1323 287 262 2939 30 220 502...
5,http://images.cocodataset.org/train2017/000000...,[ 262 1323 287 262 2939 30 220 502...,[ 1323 287 262 2939 30 220 50295 4...
6,http://images.cocodataset.org/train2017/000000...,[ 1323 287 262 2939 30 220 50295 4...,[ 287 262 2939 30 220 50295 464 13...
7,http://images.cocodataset.org/train2017/000000...,[ 287 262 2939 30 220 50295 464 13...,[ 262 2939 30 220 50295 464 1323 2...
8,http://images.cocodataset.org/train2017/000000...,[ 262 2939 30 220 50295 464 1323 2...,[ 2939 30 220 50295 464 1323 287 2...
9,http://images.cocodataset.org/train2017/000000...,[ 2939 30 220 50295 464 1323 287 2...,[ 30 220 50295 464 1323 287 262 29...


In [26]:
# Print the raw `input` and `label` cells of the first row, in that order,
# to see the full stored string (the table preview above truncates it).
for column in ('input', 'label'):
    for cell in df_data[0:1][column]:
        print(cell)

[ 2061   389   262  7577   286   262  1323   287   262  2939    30   220
 50295]
[  389   262  7577   286   262  1323   287   262  2939    30   220 50295
   464]


In [27]:
class llavadataset(Dataset):
  """Dataset yielding ``(pixel_values, input_ids, label_ids)`` per CSV row.

  Args:
    qa_dataset: DataFrame with ``img_url``, ``input`` and ``label`` columns,
      where ``input`` / ``label`` hold token ids serialised as text such as
      ``"[ 2061   389   262]"``.
    phi_model_name: not used by the dataset; kept so existing callers that
      pass four arguments keep working.
    clip_model_name: name passed to ``AutoProcessor.from_pretrained`` to build
      the image processor.
    tokenizer: not used by the dataset; kept for the same reason as
      ``phi_model_name``.
  """

  # Seconds to wait on the image server. Without a timeout a stalled
  # connection would block the calling DataLoader worker indefinitely.
  request_timeout = 30

  def __init__(self, qa_dataset, phi_model_name, clip_model_name, tokenizer):
    self.processor  = AutoProcessor.from_pretrained(clip_model_name)
    self.qa_dataset = qa_dataset

  def __len__(self):
    """Return the number of rows in the backing DataFrame."""
    return self.qa_dataset.shape[0]

  @staticmethod
  def _parse_token_ids(cell):
    """Turn one serialised id cell into a 1-D integer tensor."""
    if isinstance(cell, str):
      # Brackets and commas are only decoration; what is left is a
      # whitespace-separated list of integers (line breaks included).
      cleaned = cell.replace('[', ' ').replace(']', ' ').replace(',', ' ')
      return torch.tensor([int(tok) for tok in cleaned.split()], dtype=torch.long)
    # Already array-like (frame built in memory rather than read from CSV).
    return torch.tensor(np.asarray(cell).reshape(-1))

  def _load_image(self, img_url):
    """Download ``img_url`` and return its processed pixel values (batch dim removed)."""
    with requests.get(img_url, stream=True, timeout=self.request_timeout) as response:
      # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
      response.raise_for_status()
      # convert() decodes the image while the connection is still open.
      image_load = Image.open(response.raw).convert("RGB")
    image_processed = self.processor(images=image_load, return_tensors="pt")['pixel_values']
    return image_processed.squeeze(0)

  def __getitem__(self, idx):
    """Return ``(pixel_values, input_ids, label_ids)`` for the row at position ``idx``.

    Raises:
      requests.RequestException: when the image download fails or times out.
    """
    # .iloc is positional, so this also works when the frame was filtered or
    # split and no longer carries a 0..n-1 index (label lookup raised KeyError).
    img_url = self.qa_dataset['img_url'].iloc[idx]
    ques    = self._parse_token_ids(self.qa_dataset['input'].iloc[idx])
    ans     = self._parse_token_ids(self.qa_dataset['label'].iloc[idx])

    # image load
    image_processed = self._load_image(img_url)
    return (image_processed, ques, ans)

In [28]:
# Build the dataset from the token CSV written earlier; phi_model_name and
# tokenizer are the objects created in the tokenizer cell above.
clip_model_name = "openai/clip-vit-base-patch32"
csv_file = 'train_token.csv'
qa_dataset = pd.read_csv(csv_file)
step2_dataset = llavadataset(
    qa_dataset=qa_dataset,
    phi_model_name=phi_model_name,
    clip_model_name=clip_model_name,
    tokenizer=tokenizer,
)

In [29]:
# Fetch one sample to check the dataset end to end (this downloads the image over HTTP).
step2_dataset[0]

(tensor([[[ 1.1858,  1.1566,  1.1274,  ...,  1.3902,  1.3756,  1.1712],
          [ 1.2588,  1.2296,  1.1858,  ...,  1.4048,  1.2150,  1.2734],
          [ 1.4194,  1.3318,  1.2880,  ...,  1.3610,  1.1128,  1.3756],
          ...,
          [-0.4346, -0.3762, -0.3762,  ...,  0.8209,  0.7041,  0.9522],
          [-0.4200, -0.4054, -0.3908,  ...,  0.1493,  0.1639,  0.1055],
          [-0.4346, -0.4346, -0.3908,  ...,  0.2223,  0.1493,  0.1347]],
 
         [[ 1.4145,  1.3695,  1.3395,  ...,  1.6397,  1.6247,  1.3995],
          [ 1.4596,  1.4446,  1.3995,  ...,  1.6397,  1.4446,  1.5046],
          [ 1.5646,  1.4896,  1.4295,  ...,  1.5946,  1.3395,  1.6096],
          ...,
          [-0.4014, -0.3264, -0.3264,  ...,  0.9793,  0.7542,  1.0393],
          [-0.3564, -0.3414, -0.3264,  ...,  0.2439,  0.2289,  0.1839],
          [-0.3714, -0.3714, -0.3264,  ...,  0.2439,  0.1989,  0.1839]],
 
         [[ 1.7904,  1.7904,  1.7477,  ...,  1.9895,  1.9610,  1.7477],
          [ 1.8473,  1.8331,

In [30]:
def collate_fn(batch):
    """Merge dataset samples into one batch.

    Each sample is an (image tensor, input ids, label ids) triple. Images are
    stacked along a new leading dimension; the two id sequences are padded to
    the longest sequence in the batch with the module-level tokenizer's pad id.

    Returns:
        Tuple of (stacked images, padded inputs, padded labels).
    """
    pixel_values, question_ids, answer_ids = zip(*batch)
    stacked_images = torch.stack(pixel_values, dim=0)

    def _pad(sequences):
        # batch_first=True gives tensors shaped (batch, longest_sequence).
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=tokenizer.pad_token_id
        )

    return (stacked_images, _pad(question_ids), _pad(answer_ids))

In [31]:
# Show the ids of the padding token (used as padding_value in collate_fn) and the end-of-sequence token.
tokenizer.pad_token_id,tokenizer.eos_token_id

(50296, 50256)

In [32]:
# DataLoader over the token CSV rows, used below to pull one test batch.
# NOTE(review): despite the name, this wraps the same qa_dataset frame that was
# read from train_token.csv - confirm that is intended.
val_dataloader = DataLoader(
    dataset=llavadataset(qa_dataset, phi_model_name, clip_model_name, tokenizer),
    batch_size=2,
    shuffle=True,
    num_workers=10,
    pin_memory=True,
    collate_fn=collate_fn,
)

In [33]:
# Pull a single batch to smoke-test the DataLoader together with collate_fn.
next(iter(val_dataloader))

[tensor([[[[-0.9018, -0.9018, -0.7704,  ..., -0.0696,  0.0471,  0.1493],
           [-1.0039, -1.0039, -0.8434,  ...,  0.4997,  1.5216,  1.8573],
           [-1.0331, -0.9893, -0.9164,  ...,  0.0617,  1.6968,  1.9303],
           ...,
           [ 0.9230,  0.9522,  0.9522,  ...,  1.2004,  1.1858,  1.1712],
           [ 0.9522,  0.9522,  0.9522,  ...,  1.2004,  1.1712,  1.1566],
           [ 0.9668,  0.9960,  1.0252,  ...,  1.1858,  1.2150,  1.1566]],
 
          [[-1.1818, -1.1668, -1.0017,  ..., -0.4764, -0.3714, -0.3264],
           [-1.1818, -1.1968, -1.0167,  ...,  0.1239,  1.5346,  1.8498],
           [-1.1818, -1.1818, -1.0918,  ..., -0.2063,  1.8498,  2.0749],
           ...,
           [ 1.0243,  1.0544,  1.0544,  ...,  1.3095,  1.2945,  1.2795],
           [ 1.0544,  1.0544,  1.0544,  ...,  1.3095,  1.2795,  1.2645],
           [ 1.0544,  1.0844,  1.1294,  ...,  1.2945,  1.3245,  1.2645]],
 
          [[-1.0678, -1.0252, -0.9541,  ..., -0.3000,  0.1977,  0.3684],
           [-

# 4 - SAMPLE VALIDATION DATA

In [34]:
import random

In [35]:
# Re-load the instruct150k JSON file (expects llava_instruct_150k.json to be
# present in the working directory).
# The context manager closes the handle once parsing is done, and the explicit
# encoding keeps the read independent of the platform's default locale.
with open('llava_instruct_150k.json', encoding='utf-8') as f:
    # The top-level JSON value is a list of conversation records.
    data = json.load(f)

In [36]:
# Build the validation sample: flatten records 0..10000 into
# [image_url, question + ' [QA]', answer + eos_token] rows.
# NOTE(review): these records come from the same `data` list that feeds the
# training CSV, so the sample is not held out - confirm that is intended.
data_instruct150_sample_val_flatten = []

for a_idx, d in enumerate(data):
    image_url = 'http://images.cocodataset.org/train2017/' + d['image']
    turns = d['conversations']
    # Pair turn 0 with 1, 2 with 3, ... . zip() drops a trailing unpaired turn;
    # the previous next()-based pairing raised StopIteration on odd-length
    # conversations.
    for question, gpt_ans in zip(turns[::2], turns[1::2]):
        if len(gpt_ans['value']) > 200:  # filter out too long answers (length in characters)
            continue
        if question['from'] == 'human' and gpt_ans['from'] == 'gpt':
            image_q = question['value'].replace('<image>\n', '').replace('\n<image>', '') + ' [QA]'
            image_a = gpt_ans['value'] + tokenizer.eos_token
            data_instruct150_sample_val_flatten.append([image_url, image_q, image_a])

    # Report progress every 10,000 records and stop after the second report,
    # i.e. once record 10000 has been processed.
    if a_idx % 10000 == 0:
        print(f"{a_idx} processed")
        if a_idx >= 10000:
            break

0 processed
10000 processed


In [38]:
# Spot-check one sampled row: [image_url, question ending in ' [QA]', answer ending in the eos token].
data_instruct150_sample_val_flatten[3]

['http://images.cocodataset.org/train2017/000000052846.jpg',
 'Where is the cat positioned in the image? [QA]',
 'The cat is positioned on top of the back of the couch in the living room.<|endoftext|>']

In [39]:
# Write sample_val_data.csv in a single pass: header row first, then every
# sampled [img_url, q, a] row.
with open('sample_val_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['img_url','q','a'])
    writer.writerows(data_instruct150_sample_val_flatten)