# summary

This notebook classifies every image in the `landscape1` dataset with CLIP by ranking a list of descriptive tag prompts against each image.

# imports

This section imports the required libraries, loads the CLIP model, and reads the earcon tags from the earcon dataset.

In [1]:
import torch
import clip
import ast
import os
import warnings
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP model together with its image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Load the earcon dataset.

earcon_csv = "../dataset/earcon_dataset/earcon_dataset.csv"

# Fail here with a clear message instead of with a NameError on the next line
# when the file is missing.
if not os.path.isfile(earcon_csv):
    raise FileNotFoundError(f"earcon dataset not found at {earcon_csv}")

# The file is a CSV, so it must be parsed with read_csv; read_excel expects a
# spreadsheet workbook and cannot read plain comma-separated text.
earcon_dataset = pd.read_csv(earcon_csv)

earcon_dataset.head()

In [None]:
# Collect the distinct tags of the dataset, lower-cased, in first-seen order.
earcon_tags = earcon_dataset['tags'].tolist()

# A dict keeps insertion order, so its keys act as an ordered set of tags.
seen_tags = {}
for raw_entry in earcon_tags:
    # Drop the bracket and quote characters, then split the entry on ", ".
    cleaned = raw_entry.replace("[", "").replace("]", "").replace("'", "")
    for piece in cleaned.split(", "):
        seen_tags.setdefault(piece.strip().lower(), None)

taglist = list(seen_tags)

print(f"There are {len(taglist)} unique tags in the dataset")
print(f"Here are the first 10 tags: {taglist[:10]}")

In [None]:
# Drop every single-character tag. Building a new list checks all positions,
# including index 0, which a reverse index loop stopping before 0 would skip.
taglist = [tag for tag in taglist if len(tag) != 1]

for tag in taglist:
    print(tag)

In [3]:
# Manual tags: this hand-written list replaces any `taglist` built earlier.
# Every tag appears exactly once; a repeated tag would produce identical
# prompts that compete with each other in the softmax over prompts.

taglist = [
    "bright", "dark", "happy", "sad", "excited", "calm", "fast", "slow", "cold", "warm", "loud", "quiet", "light",
    "dull", "sharp", "flat", "low", "high", "intense", "soft", "rough", "sparkling", "simple", "complex", "natural",
    "artificial", "clean", "horror", "scary", "mysterious", "correct", "incorrect", "accept", "reject", "agree", "disagree",
    "menu", "analog", "digital", "positive", "negative", "good", "bad", "win", "lose", "start", "stop", "yes", "no",
    "curved", "straight", "open", "closed", "up", "down", "left", "right", "long", "short", "shrill", "deep", "narrow",
    "musical", "nonmusical", "melodic", "rhythmic", "harmonic", "dissonant"
]

In [4]:
# Five prompt sets: every tag is wrapped in five different sentence templates.
taglist1 = [f"a {tag} landscape" for tag in taglist]
taglist2 = [f"a landscape picture with a {tag} scene" for tag in taglist]
taglist3 = [f"a {tag} landscape picture" for tag in taglist]
taglist4 = [f"this picture gives a {tag} feeling" for tag in taglist]
taglist5 = [f"this picture is {tag}" for tag in taglist]

# Tokenise each prompt set and move the tokens to the selected device.
(
    taglist1_tokens,
    taglist2_tokens,
    taglist3_tokens,
    taglist4_tokens,
    taglist5_tokens,
) = (
    clip.tokenize(prompts).to(device)
    for prompts in (taglist1, taglist2, taglist3, taglist4, taglist5)
)

# load paths

This section crawls the dataset folders and builds a table of image files (split, class folder, filename, and path) that the rest of the notebook uses.

In [None]:
# Crawl every split and class folder of landscape dataset 1, recording one row per image file.

sub_folders = [
    "Coast",
    "Desert",
    "Forest",
    "Glacier",
    "Mountain"
]

split = [
    "test",
    "train",
    "validation"
]

folders = [
    "../dataset/landscape1/Testing Data/",
    "../dataset/landscape1/Training Data/",
    "../dataset/landscape1/Validation Data/",
]

image_records = []

# `split` and `folders` are parallel lists: folders[i] holds the images of split[i].
for split_name, split_dir in zip(split, folders):
    for folder in sub_folders:
        # The context manager closes the directory handle promptly. Entries are
        # restricted to regular files and sorted by name because os.scandir
        # yields them in arbitrary order.
        with os.scandir(split_dir + folder) as entries:
            files = sorted(
                (entry for entry in entries if entry.is_file()),
                key=lambda entry: entry.name,
            )
        for item in files:
            image_records.append({"split": split_name, "folder": folder, "filename": item.name, "filepath": item.path})

# Explicit columns keep the frame well-formed even when no file was found.
image_paths = pd.DataFrame(image_records, columns=["split", "folder", "filename", "filepath"])

# Make sure the output directory exists before writing the CSV.
os.makedirs("../dataset/landscape1/csvs", exist_ok=True)
image_paths.to_csv("../dataset/landscape1/csvs/image_paths.csv", index=False)
image_paths.head()

In [6]:
def inference(image_df, tags, text_tokens, k=10):
    """Rank the text prompts in `tags` for every image listed in `image_df`.

    Uses the notebook-level `model`, `preprocess` and `device` objects.

    Parameters
    ----------
    image_df : pandas.DataFrame
        One row per image, with the columns "split", "folder", "filename"
        and "filepath".
    tags : list of str
        Prompt strings; `tags[j]` must be the text behind `text_tokens[j]`.
    text_tokens : torch.Tensor
        Tokenised prompts, passed to `model.encode_text`.
    k : int, default 10
        Number of best-matching prompts kept per image (clamped to
        `len(tags)`).

    Returns
    -------
    pandas.DataFrame
        Columns "split", "folder", "filename", "image_path", "top_tags" and
        "similarity_scores". The last two columns hold the `str()` form of a
        Python list, best match first.
    """
    columns = ["split", "folder", "filename", "image_path", "top_tags", "similarity_scores"]

    splits = image_df["split"].tolist()
    folders = image_df["folder"].tolist()
    filenames = image_df["filename"].tolist()
    paths = image_df["filepath"].tolist()

    # torch.topk raises when asked for more entries than there are prompts.
    k = min(k, len(tags))

    # The prompts are the same for every image: encode and normalise them once.
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    records = []
    total = len(paths)

    # Iterate over the rows of the frame that was passed in, not a global.
    for i in range(total):
        # Load and preprocess the image; the file handle is closed right after.
        with Image.open(paths[i]) as img:
            image = preprocess(img).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image)
            # Normalise so that the dot product is a cosine similarity.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Softmax over the prompts of the scaled similarities.
            similarities = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Keep the k best prompts and their scores.
        top_preds = torch.topk(similarities, k)
        top_indices = top_preds.indices.squeeze(0).tolist()
        top_scores = top_preds.values.squeeze(0).tolist()

        records.append({
            "split": splits[i],
            "folder": folders[i],
            "filename": filenames[i],
            "image_path": paths[i],
            "top_tags": [tags[tag_index] for tag_index in top_indices],
            "similarity_scores": top_scores
        })

        if (i+1) % 100 == 0:
            print(f"Processed {i+1}/{total} images")

    # Explicit columns keep the string conversion below valid for empty input.
    result = pd.DataFrame(records, columns=columns)
    # Lists are stored as strings; parse them back with ast.literal_eval.
    result["top_tags"] = result["top_tags"].apply(str)
    result["similarity_scores"] = result["similarity_scores"].apply(str)
    return result

In [7]:
# Prompt set 1 ("a <tag> landscape"): rank the prompts for every crawled image.
result1 = inference(image_df=image_paths, tags=taglist1, text_tokens=taglist1_tokens)

In [None]:
# Prompt set 2 ("a landscape picture with a <tag> scene"): rank the prompts for every crawled image.
result2 = inference(image_df=image_paths, tags=taglist2, text_tokens=taglist2_tokens)

In [None]:
# Prompt set 3 ("a <tag> landscape picture"): rank the prompts for every crawled image.
result3 = inference(image_df=image_paths, tags=taglist3, text_tokens=taglist3_tokens)

In [None]:
# Prompt set 4 ("this picture gives a <tag> feeling"): rank the prompts for every crawled image.
result4 = inference(image_df=image_paths, tags=taglist4, text_tokens=taglist4_tokens)

In [None]:
# Prompt set 5 ("this picture is <tag>"): rank the prompts for every crawled image.
result5 = inference(image_df=image_paths, tags=taglist5, text_tokens=taglist5_tokens)

# verify images and tags

In [12]:
def visualise_results(result, n=10):
    """Show a random sample of images together with their best-ranked tags.

    Parameters
    ----------
    result : pandas.DataFrame
        Needs the columns "image_path", "top_tags" and "similarity_scores";
        the last two must hold list literals stored as strings, which are
        parsed with `ast.literal_eval`.
    n : int, default 10
        Upper bound for both the number of images sampled and the number of
        tags printed per image.

    Returns
    -------
    None
        Images are drawn with matplotlib and the tag table is printed.
    """
    if len(result) == 0:
        return

    # DataFrame.sample raises when asked for more rows than exist, so clamp.
    sampled = result.sample(min(n, len(result)))

    for _, row in sampled.iterrows():
        # Display the image; the file handle is closed once it has been drawn.
        with Image.open(row['image_path']) as image:
            plt.imshow(image)
            plt.axis('off')
            plt.show()

        tags = ast.literal_eval(row["top_tags"])
        scores = ast.literal_eval(row["similarity_scores"])

        # Report the number of tags actually listed, which can be below n.
        shown = min(n, len(tags), len(scores))
        print(f"Top {shown} Tags and Similarity Scores:")
        for rank, (tag, score) in enumerate(zip(tags[:n], scores[:n]), start=1):
            print(f"{rank:>2}. {tag:>50}: {score:.2f}")
        print("\n")

In [None]:
# Spot-check a random sample of the predictions stored in result1.
visualise_results(result=result1)

In [None]:
# Spot-check a random sample of the predictions stored in result2.
visualise_results(result=result2)

In [None]:
# Spot-check a random sample of the predictions stored in result3.
visualise_results(result=result3)

In [None]:
# Spot-check a random sample of the predictions stored in result4.
visualise_results(result=result4)

In [None]:
# Spot-check a random sample of the predictions stored in result5.
visualise_results(result=result5)