In [None]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 8.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 38.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
import os
import numpy as np

In [None]:
# Make Google Drive available to this Colab runtime under a local mount point.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [None]:
# Folder on the mounted Drive that the notebook switches into (os.chdir) before captioning.
path = "/content/gdrive/MyDrive/CoffeeImagesRawFINAL"

In [None]:
# Work from inside `path`: relative file names used later in the notebook
# (e.g. the CSV written with to_csv) resolve against this folder.
os.chdir(path)

In [None]:
# Uses the captioning process developed by teammate Sanjay M, extended here to caption all images in our database

import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from PIL import Image

In [None]:
# The model, its image preprocessor and its tokenizer are all loaded from the
# same pretrained checkpoint, so the identifier is spelled out only once.
checkpoint = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/982M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
model.to(device)

In [None]:
max_length = 16
num_beams = 4
# Keyword arguments forwarded unchanged to model.generate() for every batch.
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  """Generate captions for a batch of image files.

  Args:
    image_paths: Iterable of paths to image files that PIL can open.

  Returns:
    A list of caption strings (surrounding whitespace stripped) decoded from
    the model output for the given images. An empty input returns an empty
    list without invoking the feature extractor or the model.
  """
  image_paths = list(image_paths)
  if not image_paths:
    # Nothing to caption: avoid handing an empty batch to the feature extractor.
    return []

  images = []
  for image_path in image_paths:
    # Use the context manager so the underlying file handle is released as
    # soon as the pixels are read, instead of staying open for every image.
    with Image.open(image_path) as i_image:
      # convert() loads the pixel data and returns a new image (a plain copy
      # when the mode is already RGB), so the result is usable after the
      # file is closed.
      images.append(i_image.convert(mode="RGB"))

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  # The inputs must live on the same device as the model.
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  return [pred.strip() for pred in preds]

In [None]:
import glob

In [None]:
# Reuse `path` (the folder the notebook already changed into) instead of
# hard-coding the Drive folder a second time with a different spelling.
# os.path.join(path, "") keeps the trailing separator that the
# `folder+'*.jpg'` pattern below relies on.
folders = glob.glob(os.path.join(path, ""))
image_files = []     # path of each captioned image, parallel to image_captions
image_captions = []  # one entry per image: the list returned by predict_step([f])
for folder in folders:
  print("Looking for images in",folder)
  # glob returns matches in arbitrary order; sort so that the caption rows
  # come out in the same, reproducible order on every run.
  for f in sorted(glob.glob(folder+'*.jpg')):
    x = predict_step([f])
    print("Image:",f)
    print("Caption:",x)
    image_files.append(f)
    image_captions.append(x)

Looking for images in /content/gdrive/My Drive/CoffeeImagesRawFINAL/
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict3.jpg
Caption: ['a living room filled with furniture and a fire place']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict2.jpg
Caption: ['a building that has graffiti on it']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict4.jpg
Caption: ['a large building with a clock on the front of it']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict5.jpg
Caption: ['a skateboard with a knife sticking out of it']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict9.jpg
Caption: ['a large swimming pool in front of a large building']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeeaddict8.jpg
Caption: ['a person holding a cup of coffee on top of a table']
Image: /content/gdrive/My Drive/CoffeeImagesRawFINAL/coffeelovers8.jpg
Caption: ['a person pouring a liquid into a blender']
Image: /content/gd

In [None]:
# Show only a short preview; the full list has one entry per captioned image.
image_captions[:10]

[['a living room filled with furniture and a fire place'],
 ['a building that has graffiti on it'],
 ['a large building with a clock on the front of it'],
 ['a skateboard with a knife sticking out of it'],
 ['a large swimming pool in front of a large building'],
 ['a person holding a cup of coffee on top of a table'],
 ['a person pouring a liquid into a blender'],
 ['a wooden table topped with a cup of coffee'],
 ['a painting of a fire hydrant with a cartoon character on it'],
 ['a large kitchen with a lot of counter space'],
 ['a person holding a wine glass in their hand'],
 ['a coffee mug sitting on top of a coffee table'],
 ['a coffee cup sitting on top of a wooden table'],
 ['a chocolate cupcake in a blender on a table'],
 ['a green and white building with a blue umbrella'],
 ['a large building with a sign on the front of it'],
 ['a person holding a cup in their hand'],
 ['a store with a sign on the side of the building'],
 ['people are standing outside of a restaurant'],
 ['a kitc

In [None]:
import pandas as pd

In [None]:
# Single-column frame with one row per entry of image_captions.
captions_df = pd.DataFrame({'ImageCaptions': image_captions})

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich (truncated head/tail) table display instead of plain printed text.
captions_df

                                          ImageCaptions
0     [a living room filled with furniture and a fir...
1                  [a building that has graffiti on it]
2     [a large building with a clock on the front of...
3        [a skateboard with a knife sticking out of it]
4     [a large swimming pool in front of a large bui...
...                                                 ...
1033  [a brick building with a clock on the side of it]
1034        [a woman holding a child on top of a couch]
1035        [a cup of coffee sitting on top of a table]
1036  [a man standing in the middle of a lake with a...
1037          [a patio area with a bench and a walkway]

[1038 rows x 1 columns]


In [None]:
# Write the captions to a CSV file; the relative name resolves against the
# current working directory.
csv_name = 'imgcaptions.csv'
captions_df.to_csv(csv_name)

In [None]:
# Copy the CSV from the current working directory into /content/gdrive/MyDrive.
!cp imgcaptions.csv "/content/gdrive/MyDrive"