<a href="https://colab.research.google.com/github/samp3209/capstone/blob/main/search_for_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Link google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
#Imports, data retrieval, data transformation 
import os
import re

def get_word_set(text):
    text = text.replace('png', ' ')
    text = re.sub(r'\W+', ' ', text.lower())
    word_set = set(text.split())
    print(word_set)
    return word_set

# path to images
folder_path = '/content/drive/MyDrive/MassOutput/Test/'

file_names = []

for filename in os.listdir(folder_path):
    if filename.endswith(('.jpg', '.jpeg', '.png')):
        file_names.append(filename)


In [24]:
# Define the query
query = 'chauncey the beaver baseball'

In [25]:
#Search function that outputs the most closely desired image based on caption
query_word_set = get_word_set(query)

scores = {}

# Loop through each file name and calculate the similarity score
for file_name in file_names:
    file_word_set = get_word_set(file_name)
    intersection = query_word_set.intersection(file_word_set)
    union = query_word_set.union(file_word_set)
    numerator = len(intersection) + len(intersection)/len(query_word_set)
    denominator = len(union)
    if denominator != 0:
        score = numerator / denominator
    else:
        score = 0.0
    scores[file_name] = score
    
    #debugging
    #print('file Info:', file_word_set)
    #print('query Info:', query_word_set)
    #print('untersection:', intersection)
    #print('numerator:', numerator)
    #print('union:', union)
    #print('denominator:', denominator)
    #print('similarity Score:', score)
    

for file_name, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
    print(file_name, ':', score)

{'chauncey', 'the', 'beaver', 'baseball'}
{'at', 'computer', 'looking', 'of', 'something', 's', 'screen', 'in', 'cartoon', 'the', 'beaver', 'he', 'chauncey', 'blue', 'like', 'front', 'holding'}
{'reading', 'in', 'cartoon', 'the', 'beaver', 'chauncey', 'park', 'green'}
{'motion', 'orange', 'in', 'and', 'cartoon', 'the', 'beaver', 'chauncey', 'playing', 'blue', 'baseball', 'on', 'field'}
chauncey the beaver,cartoon,green,reading,reading,in park.png : 0.4166666666666667
chauncey the beaver,cartoon,orange and blue,in motion,playing baseball,on field.png : 0.38461538461538464
chauncey the beaver,cartoon,blue,like he's holding something,looking at computer screen,in front of computer screen.png : 0.20833333333333334


In [26]:
#Definitely using this search function
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# File names
file_names = [
    'chauncey the beaver,cartoon,green,reading,reading,in park.png',
    'chauncey the beaver,cartoon,orange and blue,in motion,playing baseball,on field.png',
    'chauncey the beaver,cartoon,blue,like he\'s holding something,looking at computer screen,in front of computer screen.png'
]

# Define the query
query = 'chauncey the beaver baseball'

vectorizer = TfidfVectorizer()

corpus = file_names

tfidf_matrix = vectorizer.fit_transform(corpus)

query_vector = vectorizer.transform([query])

similarities = cosine_similarity(query_vector, tfidf_matrix)

results = [(similarity, file_name) for similarity, file_name in zip(similarities[0], file_names)]
results = sorted(results, key=lambda x: x[0], reverse=True)

for similarity, file_name in results:
    print(f"{file_name}: {similarity:.4f}")

chauncey the beaver,cartoon,orange and blue,in motion,playing baseball,on field.png: 0.4600
chauncey the beaver,cartoon,green,reading,reading,in park.png: 0.2571
chauncey the beaver,cartoon,blue,like he's holding something,looking at computer screen,in front of computer screen.png: 0.1693
