### Importing Libraries

In [1]:
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import BytesIO
import numpy as np
from PIL import Image        
import random
from sklearn.metrics.pairwise import cosine_similarity

### Data Loading

In [2]:
data = pd.read_csv('A2_data.csv')

### Data Exploration

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...


In [4]:
data.columns

Index(['Unnamed: 0', 'Image', 'Review Text'], dtype='object')

In [5]:
# Remove the leftover 'Unnamed: 0' column (presumably an index saved into the CSV — confirm).
# errors='ignore' makes the cell idempotent: re-running it after the column is already
# gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
data.head()

Unnamed: 0,Image,Review Text
0,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...


In [7]:
data.head()

Unnamed: 0,Image,Review Text
0,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...


### Get the image out of URL

In [8]:
def getImagefromURL(imageLink, timeout=10):
    """Download an image and return it as a NumPy pixel array.

    Parameters
    ----------
    imageLink : str
        URL of the image to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    numpy.ndarray or None
        The decoded image converted to three-channel RGB, or ``None`` when the
        download or decoding fails for any reason.
    """
    try:
        response = requests.get(imageLink, timeout=timeout)
        # Reject 4xx/5xx responses up front instead of handing an error page to PIL.
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as pil_image:
            # Grayscale / palette / RGBA files would otherwise produce arrays that do
            # not match the 3-channel input expected by the feature extractor.
            return np.array(pil_image.convert("RGB"))
    except Exception:
        # Deliberate best-effort: callers treat None as "image unavailable".
        return None

### Image Preprocessing

In [9]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import BytesIO
import numpy as np
def imagePreprocessing(image):
    """Resize, contrast-stretch, flip and rescale an image.

    Parameters
    ----------
    image : numpy.ndarray or None
        Pixel array as returned by ``getImagefromURL``.

    Returns
    -------
    numpy.ndarray or None
        128x128 image divided by 255.0, or ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None

    contrast_gain, brightness_offset = 1.5, 10

    resized = cv2.resize(image, (128, 128))
    adjusted = cv2.convertScaleAbs(resized, alpha=contrast_gain, beta=brightness_offset)

    # randint(0, 1) yields flip code 0 or 1, so the image is always flipped along one
    # axis; the choice is unseeded, hence results differ between runs.
    flip_code = random.randint(0, 1)
    flipped = cv2.flip(adjusted, flip_code)

    return flipped / 255.0

### Text/Review Preprocessing

In [10]:
import re
import nltk
# to get stopwords list
nltk.download('stopwords') 
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
import string
def queryProcessing(str_data):
    """Normalise a review/query string into space-separated, stemmed tokens.

    Pipeline: lowercase -> replace every non-letter with a space -> split on
    whitespace -> drop English stopwords -> Porter-stem -> WordNet-lemmatize.

    Parameters
    ----------
    str_data : object
        Raw text. Any value that is not a ``str`` yields an empty string.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" for non-string input).
    """
    if not isinstance(str_data, str):
        return ""

    # Keeping letters only already removes all punctuation and digits, so the former
    # str.translate() punctuation pass (whose result was discarded anyway) is dropped.
    words = re.sub('[^a-zA-Z]', ' ', str_data.lower()).split()

    # Build the stopword set once instead of once per word.
    stopword_set = set(stopwords.words('english'))

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in words
        if word not in stopword_set
    ]

    # Join, re-split and join again so blank tokens can never reach the output,
    # exactly as the original join/split/filter/join sequence did.
    return ' '.join(' '.join(tokens).split())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pulkit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pulkit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Feature Extraction using MobileNetV2

In [11]:
import tensorflow as tf
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
# MobileNetV2 backbones keyed by input shape, so the network is built (and its
# ImageNet weights loaded) once per kernel instead of once per image.
_MOBILENET_CACHE = {}


def _get_mobilenet(img_shape):
    """Return the cached headless MobileNetV2 for ``img_shape``, building it on first use."""
    model = _MOBILENET_CACHE.get(img_shape)
    if model is None:
        model = tf.keras.applications.MobileNetV2(
            input_shape=img_shape, include_top=False, weights='imagenet'
        )
        _MOBILENET_CACHE[img_shape] = model
    return model


def feature_ExtractionImage(image):
    """Extract MobileNetV2 (ImageNet, no top) features for one preprocessed image.

    Parameters
    ----------
    image : numpy.ndarray or None
        Image matching the model input shape (128, 128, 3).

    Returns
    -------
    numpy.ndarray or None
        Output of ``base_model.predict`` for a batch of one, or ``None`` when
        ``image`` is ``None``.
    """
    if image is None:
        return None

    base_model = _get_mobilenet((128, 128, 3))
    # NOTE(review): imagePreprocessing already divides pixels by 255, while Keras
    # documents mobilenet_v2.preprocess_input as expecting 0-255 input — confirm the
    # double scaling is intended. Left unchanged so new features stay comparable
    # with the pickled ones.
    imageTrained = preprocess_input(image)
    # Add the leading batch axis expected by predict().
    return base_model.predict(np.expand_dims(imageTrained, axis=0))

### Get the TF-IDF Scores

In [12]:
import math
def tf_idfScoring(str):
    """Compute log-scaled TF-IDF weights for every document in a corpus.

    For each term ``t`` in document ``d``:
    ``weight = (1 + log2(count of t in d)) * log2(N / number of docs containing t)``.

    Parameters
    ----------
    str : sequence of str
        Corpus; each document is tokenised with ``split(" ")``. (The name shadows
        the built-in ``str`` and is kept only for backward compatibility.)

    Returns
    -------
    list of dict
        One ``{term: weight}`` dict per document, in corpus order.
    """
    documents = str
    total_doc = len(documents)

    # Tokenise every document exactly once.
    tokenized = [doc.split(" ") for doc in documents]

    # Document frequency per term, computed in one pass. The previous version
    # rescanned and re-split the whole corpus for every token of every document.
    doc_freq = {}
    for tokens in tokenized:
        for term in set(tokens):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    result_all_tfidf = []
    for tokens in tokenized:
        counts = {}
        for term in tokens:
            counts[term] = counts.get(term, 0) + 1
        result_all_tfidf.append({
            term: (1 + math.log2(count)) * math.log2(total_doc / doc_freq[term])
            for term, count in counts.items()
        })
    return result_all_tfidf

### Get the Similarity Score of Images

In [13]:
def find_similar_images(input_image, image_list):
    """Score the query features against every stored image feature.

    Parameters
    ----------
    input_image : numpy.ndarray or None
        Feature array of the query image. ``None`` means the query image could
        not be loaded.
    image_list : list of list
        One sub-list per review; each element is a feature array or ``None``.

    Returns
    -------
    list of list
        Same nesting as ``image_list``. Each entry is the cosine similarity to the
        query, or the placeholder ``-1`` where the stored image (or the query
        itself) is missing.
    """
    if input_image is None:
        # No query features: use the same placeholder as for missing stored images
        # instead of failing with AttributeError on None.reshape.
        return [[-1 for _ in image_sublist] for image_sublist in image_list]

    # The query vector does not change, so flatten and normalise it once.
    query_vec = input_image.reshape(1, -1)
    query_vec = query_vec / np.linalg.norm(query_vec)

    result = []
    for image_sublist in image_list:
        sublist_scores = []
        for curr_img in image_sublist:
            if curr_img is None:
                sublist_scores.append(-1)
                continue
            candidate_vec = curr_img.reshape(1, -1)
            candidate_vec = candidate_vec / np.linalg.norm(candidate_vec)
            sublist_scores.append(cosine_similarity(query_vec, candidate_vec)[0][0])
        result.append(sublist_scores)

    return result


### Processing on the Dataset

In [14]:
# import ast
# # To store the image features
# all_image_features = []

# # To store the text features
# all_text_features = []

# # Iterate over the data
# for index, row in data.iterrows(): 
#     if(row['Image'] is not None):
#         image_links = ast.literal_eval(row['Image'])
#         temp_folder = []
#         for image in image_links:
#             # Get image from url
#             getImage = getImagefromURL(image)
    
#             # Get the Processed image
#             getPreprocessedImage = imagePreprocessing(getImage)
    
#             # get the features from image
#             getFeatures = feature_ExtractionImage(getPreprocessedImage)
#             if(getFeatures is not None):
#                 getNormalizedFeatures = getFeatures/np.linalg.norm(getFeatures)
#             else:
#                 getNormalizedFeatures = None
    
#             # Storing the image features
#             temp_folder.append(getNormalizedFeatures)
#         all_image_features.append(temp_folder)
#         # Process the Text Query 
#         getProcessedText = queryProcessing(row['Review Text'])

#         # Store the text features
#         all_text_features.append(getProcessedText)



In [15]:
# Image Features
# all_image_features

### Storing and Loading Image Features, Text Features and TF-IDF Scores using Pickle

In [16]:
import pickle

def imageFeature_store():
    """Pickle the global ``all_image_features`` to ``file_image_features.pkl``.

    The ``with`` block closes the file even if pickling raises.
    """
    with open("file_image_features.pkl", "wb") as f:
        pickle.dump(all_image_features, f)
def imageFeature_get():
    """Load and return the object stored in ``file_image_features.pkl``.

    The previous ``f.close()`` sat after ``return`` and never ran, leaking the
    file handle; the ``with`` block closes it reliably.
    """
    # NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
    with open("file_image_features.pkl", "rb") as f:
        return pickle.load(f)

def textFeature_store():
    """Pickle the global ``all_text_features`` to ``file_text_features.pkl``.

    The ``with`` block closes the file even if pickling raises.
    """
    with open("file_text_features.pkl", "wb") as f:
        pickle.dump(all_text_features, f)
def textFeature_get():
    """Load and return the object stored in ``file_text_features.pkl``.

    The previous ``f.close()`` sat after ``return`` and never ran, leaking the
    file handle; the ``with`` block closes it reliably.
    """
    # NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
    with open("file_text_features.pkl", "rb") as f:
        return pickle.load(f)

In [17]:
# Features are read back from the pickle caches. The *_store() calls stay commented
# out because the extraction loop that defines these variables is itself commented
# out above; un-comment both to rebuild the caches from scratch.
# imageFeature_store()
all_image_features = imageFeature_get()
# textFeature_store()
all_text_features = textFeature_get()

In [18]:
# Get the TF-IDF Scores
# get_alltfidf_scores = tf_idfScoring(all_text_features)

In [19]:
def tdidf_store():
    """Pickle the global ``get_alltfidf_scores`` to ``tfidf_features.pkl``.

    The ``with`` block closes the file even if pickling raises.
    """
    with open("tfidf_features.pkl", "wb") as f:
        pickle.dump(get_alltfidf_scores, f)
def tdidf_get():
    """Load and return the object stored in ``tfidf_features.pkl``.

    The previous ``f.close()`` sat after ``return`` and never ran, leaking the
    file handle; the ``with`` block closes it reliably.
    """
    # NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
    with open("tfidf_features.pkl", "rb") as f:
        return pickle.load(f)

In [20]:
# TF-IDF weights come from the pickle cache; tdidf_store() is only needed after
# recomputing them with tf_idfScoring(all_text_features).
# tdidf_store()
get_alltfidf_scores = tdidf_get()

In [21]:
# Get the images filtered
# NOTE(review): this tests each top-level entry against None. In the (commented-out)
# extraction loop every entry is a per-review list, so presumably nothing is removed
# here — confirm against the pickle. If entries ever were removed, positions in this
# list would stop lining up with `reviews` / `similarities_rm`, which are indexed by
# the same row number further down.
filtered_image_list = [img for img in all_image_features if img is not None]

### Similarity Score calculation of Text

In [22]:
def cosineScores_of_text(input_tfidf, all_text_tfidf):
    """Cosine similarity between the query TF-IDF vector and every document vector.

    Parameters
    ----------
    input_tfidf : dict
        ``{term: weight}`` for the query.
    all_text_tfidf : iterable of dict
        ``{term: weight}`` for each document.

    Returns
    -------
    list of float
        One score per document, in input order. A score of ``0.0`` is returned
        when either vector has zero length (previously a ZeroDivisionError).
    """
    query_norm = math.sqrt(sum(value ** 2 for value in input_tfidf.values()))

    getResult = []
    for curr in all_text_tfidf:
        doc_norm = math.sqrt(sum(value ** 2 for value in curr.values()))
        denominator = query_norm * doc_norm
        if denominator == 0:
            # An empty or all-zero vector has no direction; treat as "no similarity".
            getResult.append(0.0)
            continue

        # Only shared terms contribute. Iterating the query dict (not a set
        # intersection) keeps the summation order deterministic.
        dot = sum(weight * curr[word] for word, weight in input_tfidf.items() if word in curr)
        getResult.append(dot / denominator)

    return getResult

### TF-IDF for the input text

In [23]:
def tf_idfScoringforInput(input_text,str):
    """Compute TF-IDF weights for a query against an existing corpus.

    For each query term ``t``:
    ``weight = (1 + log2(count of t in query)) * log2(N / number of docs containing t)``.

    Parameters
    ----------
    input_text : str
        Preprocessed query, tokenised with ``split(" ")``.
    str : sequence of str
        Corpus documents, tokenised the same way. (The name shadows the built-in
        ``str`` and is kept only for backward compatibility.)

    Returns
    -------
    list of dict
        A single-element list holding the query's ``{term: weight}`` dict. Terms
        that occur in no document get weight ``0.0`` (previously a
        ZeroDivisionError).
    """
    documents = str
    total_doc = len(documents)

    # Raw term counts of the query.
    counts = {}
    for term in input_text.split(" "):
        counts[term] = counts.get(term, 0) + 1

    # Tokenise each document once rather than once per query term.
    doc_term_sets = [set(doc.split(" ")) for doc in documents]

    weights = {}
    for term, count in counts.items():
        doc_freq = sum(1 for terms in doc_term_sets if term in terms)
        # An unseen term cannot match any document, so it carries no weight.
        idf = math.log2(total_doc / doc_freq) if doc_freq else 0.0
        weights[term] = (1 + math.log2(count)) * idf

    return [weights]

### Get the input

In [24]:
def getInput():
    """Prompt once, then read the image URL and the review text from stdin.

    Returns
    -------
    tuple of (str, str)
        ``(image_link, review_text)`` in the order they were entered.
    """
    print("Image and Text Query Input :")
    image_link = input()
    review_text = input()
    return image_link, review_text

In [25]:
getImageLink,getTextReview = getInput()
print(getImageLink)
print(getTextReview)

Image and Text Query Input :


 https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
 Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.


https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.


### Get the preprocessing for input image and input text

In [26]:
# Query image: download -> preprocess -> MobileNetV2 features -> L2-normalise,
# then score against every cached image feature.
input_image_lr = getImageLink
getImage_lr = getImagefromURL(input_image_lr)
getPreprocessedImage_lr = imagePreprocessing(getImage_lr)
getFeatures_lr = feature_ExtractionImage(getPreprocessedImage_lr)
getNormalizedFeatures_lr = (
    None if getFeatures_lr is None
    else getFeatures_lr / np.linalg.norm(getFeatures_lr)
)
similarities_lr = find_similar_images(getNormalizedFeatures_lr, filtered_image_list)



In [27]:
getProcessedText_rm = queryProcessing(getTextReview)
get_input_scores_rm = tf_idfScoringforInput(getProcessedText_rm,all_text_features)

In [28]:
similarities_rm = cosineScores_of_text(get_input_scores_rm[0],get_alltfidf_scores)

### Get the dataset image and text in form of list

In [29]:
reviews = data['Review Text'].tolist()

In [30]:
import ast

# One list of image URLs per review. The 'Image' column stores each list as its
# string repr, so it is parsed back with literal_eval.
all_images = [
    list(ast.literal_eval(raw_links))
    for raw_links in data['Image']
    if raw_links is not None
]

### Get the Top 3 Results according to Image Similarity Score

In [31]:
# Flatten the per-review score lists into (score, (review_idx, image_idx)) pairs,
# rank by score (highest first) and keep the best three.
flattened_data_lr = []
for i, sublist in enumerate(similarities_lr):
    for j, value in enumerate(sublist):
        flattened_data_lr.append((value, (i, j)))
sorted_data_lr = sorted(flattened_data_lr, key=lambda pair: pair[0], reverse=True)
top_3_values_with_positions_lr = sorted_data_lr[:3]

In [32]:
top_3_values_with_positions_lr

[(1.0000002, (0, 0)), (0.9984887, (892, 0)), (0.99823564, (572, 1))]

In [34]:
# Report the three best image matches together with the text score of the same
# review and the mean of the two scores.
print("Top 3 Results according to Image Similarity Score:")
print()
for index, (value, position) in enumerate(top_3_values_with_positions_lr, start=1):
    i, j = position
    print(f"{index}.")
    print("Cosine similarity of images : ", all_images[i][j], sep="")
    print("Cosine similarity of text : ", reviews[i], sep="")
    print("Cosine similarity Score of images : ", value, sep="")
    print("Cosine similarity Score of text : ", similarities_rm[i], sep="")
    print("Composite similarity Score : ", (value+similarities_rm[i])/2, sep="")
    print()
    print()

Top 3 Results according to Image Similarity Score:

1.
Cosine similarity of images : https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Cosine similarity of text : Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity Score of images : 1.0000002
Cosine similarity Score of text : 1.0000000000000002
Composite similarity Score : 1.0000001192092896


2.
Cosine similarity of images : https://images-na.ssl-images-amazon.com/images/I/71iWTdFA4oL._SY88.jpg
Cosine similarity of text : Better thab the terrible string trees my guitar had,  had to drill a small hole though to mount as yoy can see in the pics
Cosine similarity Score of images : 0.9984887
Cosine similarity Score of text : 0.0
Composite similarity Score : 0.49924436211586


3.
Cosine similarity of images : https://images-na.ssl-images-amazon.com/images/I/71

### Get the Top 3 Results according to Text Similarity Score

In [57]:
n = 3
# Indices of the n highest text scores, best first.
top_indices_rm = np.argsort(similarities_rm)[-n:][::-1]

# (review_idx, [composite score per image]) for each reported review.
all_scores = []
print("Top 3 Results according to Text Similarity Score:")
print()
for index, idx in enumerate(top_indices_rm, start=1):
    # Mean of the review's text score and each of its image scores.
    composite_scores = [(similarities_rm[idx] + image_score)/2 for image_score in similarities_lr[idx]]
    all_scores.append((idx, composite_scores))

    print(f"{index}.")
    print("Cosine similarity of images : ", all_images[idx], sep="")
    print("Cosine similarity of text : ", reviews[idx], sep="")
    print("Cosine similarity Score of images : ", similarities_lr[idx], sep="")
    print("Cosine similarity Score of text : ", similarities_rm[idx], sep="")
    print("Composite similarity Score : ", composite_scores, sep="")
    print()
    print()

Top 3 Results according to Text Similarity Score:

1.
Cosine similarity of images : ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg']
Cosine similarity of text : Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity Score of images : [1.0000002]
Cosine similarity Score of text : 1.0000000000000002
Composite similarity Score : [1.0000001192092896]


2.
Cosine similarity of images : ['https://images-na.ssl-images-amazon.com/images/I/81Z1d7HaBfL._SY88.jpg']
Cosine similarity of text : Nice solid springs and defeinitely more silent. Easy installation and the black looks cool.

Pictured with some old uninstalled springs next to them.
Cosine similarity Score of images : [0.9959261]
Cosine similarity Score of text : 0.32783549222606095
Composite similarity Score : [0.6618807871801539]


3.
Cosine similarity of 

### Get the composite Scores

In [None]:
# Composite score = mean of an image's similarity and its review's text similarity,
# kept in the same nested layout as similarities_lr.
all_composite_list_similarities = [
    [(curr_score + similarities_rm[row]) / 2 for curr_score in image_scores]
    for row, image_scores in enumerate(similarities_lr)
]

### Composite Score

In [None]:
all_composite_list_similarities

### Similarity Scores using the Image

In [None]:
similarities_lr

### Similarity Scores using the Text

In [None]:
similarities_rm

### Get the Top 3 Results according to Composite Similarity Score

In [None]:
# Flatten to (composite_score, (review_idx, image_idx)), rank descending, keep three.
flattened_data_composite = []
for i, sublist in enumerate(all_composite_list_similarities):
    for j, value in enumerate(sublist):
        flattened_data_composite.append((value, (i, j)))
sorted_data_composite = sorted(flattened_data_composite, key=lambda pair: pair[0], reverse=True)
top_3_values_with_positions_composite = sorted_data_composite[:3]

In [None]:
top_3_values_with_positions_composite

In [None]:
# Report the three (review, image) pairs with the highest composite score.
print("Top 3 Results according to Composite Similarity Score: ")
print()
for index, (value, position) in enumerate(top_3_values_with_positions_composite, start=1):
    i, j = position
    print(f"{index}.")
    print("Cosine similarity of images : ", all_images[i][j], sep="")
    print("Cosine similarity of text : ", reviews[i], sep="")
    print("Composite similarity Score of images and text : ", value, sep="")
    print()
    print()

### Get the Top 3 images based on Image ranking followed by composite ranking

In [43]:
# Text similarity of the review behind each of the top-3 image matches.
text_scores = [
    similarities_rm[review_idx]
    for _score, (review_idx, _image_idx) in top_3_values_with_positions_lr
]

In [63]:
# Re-rank the top-3 image matches by composite score (mean of image and text score).
top_3_composite_after_image = [
    (image_score + text_score) / 2
    for (image_score, _), text_score in zip(top_3_values_with_positions_lr, text_scores)
]
# Pair every composite score with its (review_idx, image_idx) position, best first.
top_3_composite_after_image_total = sorted(
    (
        (composite, position)
        for composite, (_, position) in zip(top_3_composite_after_image, top_3_values_with_positions_lr)
    ),
    reverse=True,
)

In [51]:
# Report the top-3 image matches in composite-score order.
print("Top 3 Results according to Image Similarity Score:")
print()
for index, (value, position) in enumerate(top_3_composite_after_image_total, start=1):
    i, j = position
    print(f"{index}.")
    print("Cosine similarity of images : ", all_images[i][j], sep="")
    print("Cosine similarity of text : ", reviews[i], sep="")
    print("Cosine similarity Score of images : ", similarities_lr[i][j], sep="")
    print("Cosine similarity Score of text : ", similarities_rm[i], sep="")
    print("Composite similarity Score : ", value, sep="")
    print()
    print()

Top 3 Results according to Image Similarity Score:

1.
Cosine similarity of images : https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Cosine similarity of text : Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity Score of images : 1.0000002
Cosine similarity Score of text : 1.0000000000000002
Composite similarity Score : 1.0000001192092896


2.
Cosine similarity of images : https://images-na.ssl-images-amazon.com/images/I/713bq9GRJhL._SY88.jpg
Cosine similarity of text : Within a month of buying the device, the door that covers the SD card snapped off - a common problem with these recorders. Very poorly engineered.

Last week I pulled the device out of a padded camera bag and one of the plastic (painted to look like metal) XLR ejector tabs had broke off.

I needed to capture a recording off a mixing boar

### Get the Top 3 text based on Text ranking followed by composite ranking

In [58]:
all_scores

[(0, [1.0000001192092896]),
 (271, [0.6618807871801539]),
 (805, [0.6204257102491212])]

In [64]:
# Get the sorting according to composite similarities after text similarity sorting.
# Each review is ranked by its best composite score. A review with no images has an
# empty score list; default=-inf sorts it last instead of raising ValueError in max().
composite_after_text_similarity = [
    idx
    for idx, _ in sorted(
        all_scores,
        key=lambda entry: max(entry[1], default=float("-inf")),
        reverse=True,
    )
]

In [65]:
# Report the top-3 text matches in composite-score order. all_scores is rebuilt here
# for the reviews in their new order.
all_scores = []
print("Top 3 Results according to Text Similarity Score:")
print()
for index, idx in enumerate(composite_after_text_similarity, start=1):
    # Mean of the review's text score and each of its image scores.
    composite_scores = [(similarities_rm[idx] + image_score)/2 for image_score in similarities_lr[idx]]
    all_scores.append((idx, composite_scores))

    print(f"{index}.")
    print("Cosine similarity of images : ", all_images[idx], sep="")
    print("Cosine similarity of text : ", reviews[idx], sep="")
    print("Cosine similarity Score of images : ", similarities_lr[idx], sep="")
    print("Cosine similarity Score of text : ", similarities_rm[idx], sep="")
    print("Composite similarity Score : ", composite_scores, sep="")
    print()
    print()

Top 3 Results according to Text Similarity Score:

1.
Cosine similarity of images : ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg']
Cosine similarity of text : Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity Score of images : [1.0000002]
Cosine similarity Score of text : 1.0000000000000002
Composite similarity Score : [1.0000001192092896]


2.
Cosine similarity of images : ['https://images-na.ssl-images-amazon.com/images/I/81Z1d7HaBfL._SY88.jpg']
Cosine similarity of text : Nice solid springs and defeinitely more silent. Easy installation and the black looks cool.

Pictured with some old uninstalled springs next to them.
Cosine similarity Score of images : [0.9959261]
Cosine similarity Score of text : 0.32783549222606095
Composite similarity Score : [0.6618807871801539]


3.
Cosine similarity of 