In [50]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_community.chat_models import ChatOllama

chat = ChatOllama(model="llama2:7b-chat")

In [88]:
msgs = [
    SystemMessage(
        content=""" Answer ONLY the parsable token that is specified in the prompt.
        If the user DOES NOT request an image you should respond with a message that says "[NOREQ]". 
        If the user DOES request to see an image you should answer [IMGREQ].

        Here are some example requests:
        "Can you show me a cat?"
        "Find me an image of a bicycle."
        "I'd like to see a picture of a mountain."
        "I want to see a photo of a dog."
        "Find a picture of a woman riding a bike."
        """
    ),
    HumanMessage(content="Can you show me something cool?"),
]

chat(msgs)

AIMessage(content='[IMGREQ]')

In [52]:
msgs = [
    SystemMessage(
        content="""Your job is to describe the image in a way that is useful to the user.
        The image will be described to you and you will need to describe it back to the user.
        """
    ),
    HumanMessage(
        content="This image depicts a man riding a bike past a car. and is described as a positive image"
    ),
]

chat(msgs)

AIMessage(content='Thank you for providing me with the image details! Based on your description, I can tell that the image depicts a male individual riding a bicycle while traveling in the same direction as a car. The man is wearing a casual outfit and a helmet, and he appears to be enjoying the ride. The car in the background is also moving forward, but it is not the main focus of the image.\n\nI hope this description helps! Is there anything else you would like me to mention about the image?')

In [92]:
msgs = [
    SystemMessage(
        content=""" From a user's request, you should translate the request into a search query for an image.

        Here are some example requests:
        "Can you show me a cat?" -> "cat"
        "Find me an image of a bicycle." -> "bicycle"
        "I'd like to see a picture of a mountain." -> "mountain"
        "I want to see a photo of a woman riding a bike" -> "woman riding bike"
        "Get a picture of a banana split" -> "banana split"

        Answer ONLY with the search query.
        """
    ),
    HumanMessage(content="Show me a dancing monkey"),
]

chat(msgs)

AIMessage(content='"monkey dancing"')

In [1]:
import pandas as pd

sentiment_file = "../img_data/coco_ann2014/annotations/with_sentiment.csv"

sentiment_df = pd.read_csv(sentiment_file)

In [24]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [4]:
sentiment_df["sentiment_label_text"] = sentiment_df.apply(
    lambda x: "Positive" if x["sentiment_label"] else "Negative", axis=1
)

In [11]:
def combine_text(row):
    return f"This image depicts {row['caption'].lower()} and is described as a {row['sentiment_label_text'].lower()} image."


sentiment_df["description"] = sentiment_df.apply(combine_text, axis=1)

In [16]:
sentiment_df.sort_values(by="image_id").reset_index()[
    ["image_id", "description"]
].to_csv("image_descriptions.csv", index=False)

In [21]:
df = pd.read_csv("image_descriptions.csv")

In [26]:
embeddings = model.encode(df["description"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/6333 [00:00<?, ?it/s]

In [28]:
# save the embeddings to a numpy file
import numpy as np

np.save("image_descriptions.npy", embeddings)

In [29]:
msg = "Get me a picture of a happy dog"

In [30]:
msg_emb = model.encode(msg)

In [32]:
# cosine similarity


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [33]:
np.array([cosine_similarity(msg_emb, emb) for emb in embeddings]).argmax()

88323

In [38]:
df.iloc[88323]["image_id"]

253386

In [42]:
import os
import json

current_dir = os.path.dirname(os.path.realpath("./"))
senticap_data_dir = os.path.join(current_dir, "txt_data", "data")
coco_img_data_dir = os.path.join(current_dir, "img_data", "coco_val2014", "val2014")
senticap_data_json_path = os.path.join(senticap_data_dir, "senticap_dataset.json")
senticap_data_csv_path = os.path.join(senticap_data_dir, "senticap_dataset.csv")
coco_ann_data_dir = os.path.join(current_dir, "img_data", "coco_ann2014", "annotations")
coco_cap_data_path = os.path.join(coco_ann_data_dir, "captions_val2014.json")

In [43]:
# Load the captions from the coco dataset
with open(coco_cap_data_path, "r") as f:
    coco_cap_data = json.load(f)

coco_cap_data_ann = coco_cap_data["annotations"]
coco_cap_data_img = coco_cap_data["images"]

# Create a dataframe from the coco captions
coco_cap_ann_df = pd.DataFrame(coco_cap_data_ann)
coco_cap_img_df = pd.DataFrame(coco_cap_data_img)

# Rename the id column to image_id
coco_cap_img_df.rename(columns={"id": "image_id"}, inplace=True)

In [None]:
# function that takes image id 391895 and returns the image file name COCO_val2014_000000391895.jpg
# for example 522418 -> COCO_val2014_000000522418.jpg
# 554625 -> COCO_val2014_000000554625.jpg
def get_image_file_name(image_id):
    return f"COCO_val2014_{str(image_id).zfill(12)}.jpg"

In [48]:
coco_cap_img_df[["image_id", "file_name"]]

Unnamed: 0,image_id,file_name
0,391895,COCO_val2014_000000391895.jpg
1,522418,COCO_val2014_000000522418.jpg
2,184613,COCO_val2014_000000184613.jpg
3,318219,COCO_val2014_000000318219.jpg
4,554625,COCO_val2014_000000554625.jpg
...,...,...
40499,134574,COCO_val2014_000000134574.jpg
40500,572233,COCO_val2014_000000572233.jpg
40501,418825,COCO_val2014_000000418825.jpg
40502,560744,COCO_val2014_000000560744.jpg
