In [29]:
import numpy as np
import pandas as pd
import requests
from PIL import Image
from PIL import ImageEnhance
from io import BytesIO
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
from sklearn.preprocessing import MinMaxScaler
import pickle
import cv2
import random
import gzip

# SECURITY NOTE: unpickling can execute arbitrary code. Only load pickle files
# that you produced yourself or otherwise fully trust.
with open("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/extracted_features.pkl", "rb") as f:
    data = np.array(pickle.load(f))

# Split every record into parallel lists (one entry per record).
# NOTE: `id` shadows the Python builtin; the name is kept because later cells read it.
id, image_urls, reviews = [], [], []
for item in data:
    id.append(item['id'])
    image_urls.append(item['image_url'])
    reviews.append(item['review'])

# Each record stores its feature vector as a bracketed, comma-separated string;
# parse every one of them and stack the rows into a float32 matrix.
normalized_features = np.array(
    [
        [float(value) for value in item['normalized_features'].strip('[]').split(',')]
        for item in data
    ],
    dtype=np.float32,
)

print("Shape of normalized features array:", normalized_features.shape)


# Load the TF-IDF table (from here on `data` refers to this DataFrame).
with open("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/tfidf.pkl", "rb") as f:
    data = pd.read_pickle(f)

# Columns of interest: the raw review text and its serialized TF-IDF vector.
review_column = data['Review Text']
tfidf_column = data['TF-IDF']

# Every TF-IDF cell is a comma-separated string of numbers; parse it into a row.
tfidf_matrix = np.array([np.fromstring(row, dtype=float, sep=',') for row in tfidf_column])
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

length_of_reviews = len(review_column)
print("Length of TF-IDF reviews:", length_of_reviews)

# Show the first three reviews as a quick sanity check of the loaded table.
print("\nTop 3 Reviews:")
for i, review in enumerate(review_column[:3]):
    print(f"Review {i + 1}: {review}")


Shape of normalized features array: (1640, 62720)
Shape of tfidf_matrix: (1000, 4688)
Length of TF-IDF reviews: 1000

Top 3 Reviews:
Review 1: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Review 2: Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Review 3: We use these for everything from our acoustic bass down to our ukuleles. I know there is a smaller model available for ukes, violins, etc.; we haven't yet ordered those, but these will work on smaller instruments if one doesn't extend the feet to their maximum width. They're gentle on the instruments, and the grippy material keeps them secure.

The greatest benefit has been when writing music at the computer and needing to 

In [30]:
# 3a

import numpy as np

def calculate_similarity(vector1, vector2):
    """Return the cosine similarity between two vectors.

    The value is ``np.dot(vector1, vector2)`` divided by the product of the
    two ``np.linalg.norm`` magnitudes.  If either magnitude is zero the
    similarity is undefined, and ``0.0`` is returned instead.
    """
    numerator = np.dot(vector1, vector2)
    norm_a, norm_b = np.linalg.norm(vector1), np.linalg.norm(vector2)

    # A zero-length vector has no direction; report "no similarity".
    if not (norm_a and norm_b):
        return 0.0

    return numerator / (norm_a * norm_b)

# Function to preprocess images
def preprocess_image(url, timeout=10):
    """Download an image and return it as a 224x224 RGB pixel array.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A NumPy array of shape (224, 224, 3) holding the contrast- and
        brightness-enhanced image, or ``None`` when the image cannot be
        downloaded (network error, non-200 status) or decoded.
    """
    try:
        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Malformed URL, DNS failure, timeout, ...: same contract as a bad status.
        return None

    if response.status_code != 200:
        return None

    try:
        # Force 3-channel RGB: grayscale, palette, and RGBA images would otherwise
        # yield arrays that cannot be reshaped to (224, 224, 3) by the caller.
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except OSError:
        # The payload is not a decodable image (e.g. an HTML error page).
        return None

    # increasing contrast
    image = ImageEnhance.Contrast(image).enhance(1.2)

    # increasing brightness
    image = ImageEnhance.Brightness(image).enhance(1.3)

    # Convert to numpy array
    image_array = np.array(image)

    # resizing the images to a standard size
    image_array = cv2.resize(image_array, (224, 224), interpolation=cv2.INTER_AREA)

    return image_array

def extract_image_features(image_url):
    """Compute a normalized MobileNetV2 feature vector for the image at a URL.

    Args:
        image_url: Address of the image to download and embed.

    Returns:
        A NumPy array of shape (1, n_features) with values min-max scaled to
        [0, 1] across the vector, or ``None`` when ``preprocess_image`` could
        not produce an image.
    """
    preprocessed_image = preprocess_image(image_url)
    if preprocessed_image is None:
        return None

    # Add the leading batch axis expected by the network.
    image_shape = (224, 224, 3)
    reshaped_images = preprocessed_image.reshape(image_shape)[np.newaxis, ...]

    # Build the headless MobileNetV2 once and reuse it on later calls;
    # constructing the model (and loading its weights) is expensive.
    mobilenet_model = getattr(extract_image_features, "_mobilenet_model", None)
    if mobilenet_model is None:
        mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
        extract_image_features._mobilenet_model = mobilenet_model

    # extracting features using MobileNetV2
    feat = mobilenet_model.predict(preprocess_input(reshaped_images), verbose=1)

    # flatten the features to one row per image
    flat_feat = feat.reshape(feat.shape[0], -1)

    # MinMaxScaler scales every *column* over the samples it is fitted on.  Fitting
    # it directly on this single-row matrix maps every value to 0 (min == max in
    # each column), which makes all cosine similarities 0.  Transposing makes the
    # scaler normalize across the elements of the feature vector instead.
    # NOTE(review): the stored dataset features were presumably scaled with a
    # scaler fitted on the whole dataset; if that scaler was saved, reuse it here
    # so query and dataset vectors share the same transformation - TODO confirm.
    scaler = MinMaxScaler()
    norm_feat = scaler.fit_transform(flat_feat.T).T

    return norm_feat

# ---- 3a: rank dataset entries by image similarity to the query image ----

# Number of distinct reviews reported.
TOP_K = 4

# user input for image URL and review text
image_url = input("Enter the image URL: ")
review_text = input("Enter the review text: ")

# Reuse the stored features when the URL is already part of the dataset.
url_index = image_urls.index(image_url) if image_url in image_urls else None

# fetch the image_id (None for images that are not in the dataset)
image_id = id[url_index] if url_index is not None else None

if url_index is not None:
    image_features = normalized_features[url_index]
else:
    image_features = extract_image_features(image_url)

# extract_image_features returns None when the image cannot be fetched;
# fail with a clear message instead of an AttributeError further down.
if image_features is None:
    raise ValueError(f"Could not download or process the image at: {image_url!r}")

# Work with a flat 1-D query vector so calculate_similarity always returns a
# scalar (a (1, N) query yields a mix of (1,) arrays and plain 0.0 floats).
image_features = np.asarray(image_features).reshape(-1)

# Locate the query review by *position*; pandas index labels are not guaranteed
# to match row positions of the NumPy TF-IDF matrix.
review_positions = np.flatnonzero((review_column == review_text).to_numpy())

if review_positions.size:
    # Duplicate reviews share the same text, so the first match is sufficient.
    tf_idf = tfidf_matrix[review_positions[0]]
else:
    print("Review text not found in the dataset.")
    # Assign a zero vector to tf_idf (gives a text similarity of 0.0)
    tf_idf = np.zeros_like(tfidf_matrix[0])

# calculate cosine similarity between query image and all other images
similarities = np.array(
    [float(calculate_similarity(image_features, feature_row)) for feature_row in normalized_features]
)

# create a DataFrame to store the similarities
result_df = pd.DataFrame({'Image_URL': image_urls, 'Review': reviews, 'Cosine_Similarity': similarities})

# sort the DataFrame by Cosine Similarity values in descending order
sorted_df = result_df.sort_values(by='Cosine_Similarity', ascending=False)

# Group images by review.  Dict insertion order follows the sorted rows, so the
# reviews end up ordered by their best-matching image and each review's images
# are already in descending similarity order.
sorted_dict = {}
for row in sorted_df.itertuples(index=False):
    sorted_dict.setdefault(row.Review, []).append(
        {'Image_URL': row.Image_URL, 'Cosine_Similarity': row.Cosine_Similarity}
    )

# NOTE: the name says "3" but TOP_K (4) reviews are reported; the name is kept
# so that anything referring to it keeps working.
top_3_indices = list(sorted_dict)[:TOP_K]

for matched_review in top_3_indices:
    print("Review:", matched_review)

    matches = sorted_dict[matched_review]

    # Use local names here: rebinding the global `image_urls` would corrupt the
    # dataset URL list for re-runs of this cell and for later cells.
    matched_urls = [match['Image_URL'] for match in matches]

    # Report only the best image score; clamp floating-point overshoot above 1.
    max_cosine_similarity = min(float(max(match['Cosine_Similarity'] for match in matches)), 1.0)

    print("Image URLs:", ', '.join(matched_urls))
    print("Cosine similarity of images:", max_cosine_similarity)

    # The review may be missing from the TF-IDF table (or appear several times);
    # use the first match and fall back to 0.0 instead of a shape-mismatch error.
    top_positions = np.flatnonzero((review_column == matched_review).to_numpy())
    if top_positions.size:
        text_similarity = calculate_similarity(tfidf_matrix[top_positions[0]], tf_idf)
    else:
        text_similarity = 0.0

    print("Cosine similarity of text:", text_similarity)
    print()



Enter the review text: sd
Review text not found in the dataset.
Review: Works great as a guitar bench mat. Not rugged enough for abuse but if you take care of it, it will take care of you. Makes organization of workspace much easier because screws won't roll around. Color is good too.
Image URLs: https://images-na.ssl-images-amazon.com/images/I/71domStNfIL._SY88.jpg, https://images-na.ssl-images-amazon.com/images/I/71dVsYejzTL._SY88.jpg, https://images-na.ssl-images-amazon.com/images/I/71HSx4Y-5dL._SY88.jpg
Cosine similarity of images: 1.0
Cosine similarity of text: 0.0

Review: I originally installed these on my semi-hollow body Gretch D6128 Thunderbass in an attempt to cut down on the string and fret noise I was getting with conventional strings.

These not only reduced those issues considerably, the tone is fantastic!  I have been using these on the Gretch now for over 18 months and have nothing but praise for these strings!  The red color also adds a very unique look to the instrum

In [21]:
# 3b

def calculate_similarity(vector1, vector2):
    """Cosine similarity of ``vector1`` and ``vector2``.

    Computed as the dot product over the product of the two norms; when
    either norm is zero the function returns ``0.0`` rather than dividing
    by zero.
    """
    dot = np.dot(vector1, vector2)
    norms = (np.linalg.norm(vector1), np.linalg.norm(vector2))

    # Zero-magnitude input: similarity is undefined, treat it as 0.
    if 0 in norms:
        return 0.0

    return dot / (norms[0] * norms[1])


# Read the TF-IDF table used for text-based retrieval.
with open("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_tfidf.pkl", "rb") as f:
    d = pd.read_pickle(f)

# Review text column, kept as a pandas Series for lookups by value.
review_col = d['review']

# Each 'TF-IDF' cell is a comma-separated string of numbers; parse every cell
# into a float row and stack the rows into one matrix.
tfidf_mat = np.array(
    [np.fromstring(row, dtype=float, sep=',') for row in d['TF-IDF']]
)

# Function to preprocess images
def preprocess_image(url, timeout=10):
    """Download an image and return it as a 224x224 RGB pixel array.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A NumPy array of shape (224, 224, 3) holding the contrast- and
        brightness-enhanced image, or ``None`` when the image cannot be
        downloaded (network error, non-200 status) or decoded.
    """
    try:
        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Malformed URL, DNS failure, timeout, ...: same contract as a bad status.
        return None

    if response.status_code != 200:
        return None

    try:
        # Force 3-channel RGB: grayscale, palette, and RGBA images would otherwise
        # yield arrays that cannot be reshaped to (224, 224, 3) by the caller.
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except OSError:
        # The payload is not a decodable image (e.g. an HTML error page).
        return None

    # increasing contrast
    image = ImageEnhance.Contrast(image).enhance(1.2)

    # increasing brightness
    image = ImageEnhance.Brightness(image).enhance(1.3)

    # Convert to numpy array
    image_array = np.array(image)

    # resizing the images to a standard size
    image_array = cv2.resize(image_array, (224, 224), interpolation=cv2.INTER_AREA)

    return image_array

def extract_image_features(image_url):
    """Compute a normalized MobileNetV2 feature vector for the image at a URL.

    Args:
        image_url: Address of the image to download and embed.

    Returns:
        A NumPy array of shape (1, n_features) with values min-max scaled to
        [0, 1] across the vector, or ``None`` when ``preprocess_image`` could
        not produce an image.
    """
    preprocessed_image = preprocess_image(image_url)
    if preprocessed_image is None:
        return None

    # Add the leading batch axis expected by the network.
    image_shape = (224, 224, 3)
    reshaped_images = preprocessed_image.reshape(image_shape)[np.newaxis, ...]

    # Build the headless MobileNetV2 once and reuse it on later calls;
    # constructing the model (and loading its weights) is expensive.
    mobilenet_model = getattr(extract_image_features, "_mobilenet_model", None)
    if mobilenet_model is None:
        mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
        extract_image_features._mobilenet_model = mobilenet_model

    # extracting features using MobileNetV2
    feat = mobilenet_model.predict(preprocess_input(reshaped_images), verbose=1)

    # flatten the features to one row per image
    flat_feat = feat.reshape(feat.shape[0], -1)

    # MinMaxScaler scales every *column* over the samples it is fitted on.  Fitting
    # it directly on this single-row matrix maps every value to 0 (min == max in
    # each column), which makes all cosine similarities 0.  Transposing makes the
    # scaler normalize across the elements of the feature vector instead.
    # NOTE(review): the stored dataset features were presumably scaled with a
    # scaler fitted on the whole dataset; if that scaler was saved, reuse it here
    # so query and dataset vectors share the same transformation - TODO confirm.
    scaler = MinMaxScaler()
    norm_feat = scaler.fit_transform(flat_feat.T).T

    return norm_feat

# ---- 3b: rank dataset entries by review-text (TF-IDF) similarity ----

# user input for image URL and review text
image_url = input("Enter the image URL: ")
review_text = input("Enter the review text: ")

# Reuse the stored features when the URL is already part of the dataset.
url_idx = image_urls.index(image_url) if image_url in image_urls else None

if url_idx is not None:
    image_features = normalized_features[url_idx]
else:
    image_features = extract_image_features(image_url)

# extract_image_features returns None when the image cannot be fetched;
# fail with a clear message instead of an AttributeError further down.
if image_features is None:
    raise ValueError(f"Could not download or process the image at: {image_url!r}")

# Flat 1-D query vector so calculate_similarity always returns a scalar.
image_features = np.asarray(image_features).reshape(-1)

# Locate the query review by *position*; pandas index labels are not guaranteed
# to match row positions of the NumPy TF-IDF matrix.
review_positions = np.flatnonzero((review_col == review_text).to_numpy())

# Check for a match BEFORE taking the first element: indexing an empty result
# raises IndexError and would make the "not found" branch unreachable.
if review_positions.size:
    first_index = review_positions[0]
    tfidf = tfidf_mat[first_index]
else:
    print("Review text not found in the dataset.")
    # Assign a zero vector to tfidf (every text similarity becomes 0.0)
    tfidf = np.zeros_like(tfidf_mat[0])

tfidf = np.asarray(tfidf).reshape(-1)

# compute cosine similarity between the query review and every dataset review
similarity_array = np.array(
    [float(calculate_similarity(tfidf, tfidf_row)) for tfidf_row in tfidf_mat]
)

# The three sequences are combined row-by-row below, so they must line up.
# NOTE(review): this assumes new_tfidf.pkl has one row per entry of the
# extracted-features file, in the same order - TODO confirm.
if not (len(image_urls) == len(reviews) == len(similarity_array)):
    raise ValueError(
        "Row count mismatch: "
        f"{len(image_urls)} image URLs, {len(reviews)} reviews, "
        f"{len(similarity_array)} TF-IDF rows. "
        "Re-run the data-loading cell to restore the dataset lists."
    )

similarity_df = pd.DataFrame({'Image_URL': image_urls, 'Review': reviews, 'Cosine_Similarity': similarity_array})

# sort the DataFrame by Cosine Similarity values in descending order
sorted_df = similarity_df.sort_values(by='Cosine_Similarity', ascending=False)

# Group rows by review.  Dict insertion order follows the sorted rows, so reviews
# end up ordered by text similarity.  'Row' records the dataset row of each image
# so its features can be fetched directly instead of searching the URL list.
sorted_dict = {}
for row in sorted_df.itertuples(index=True):
    sorted_dict.setdefault(row.Review, []).append(
        {'Image_URL': row.Image_URL, 'Cosine_Similarity': row.Cosine_Similarity, 'Row': row.Index}
    )

top_4_reviews = list(sorted_dict)[:4]

# Print the top 4 reviews along with their corresponding data
for review in top_4_reviews:
    print("Review:", review)

    # list of image data dictionaries for the current review
    image_data_list = sorted_dict[review]

    # print the image URLs for all images in the current review
    url_list = [item['Image_URL'] for item in image_data_list]
    print("Image URLs:", ', '.join(url_list))

    # Best image similarity among all images attached to this review.
    max_similarity = -1
    for item in image_data_list:
        cs = calculate_similarity(image_features, normalized_features[item['Row']])
        if cs > max_similarity:
            max_similarity = cs

    # Round for display and clamp floating-point overshoot above 1.
    max_similarity = min(round(float(max_similarity), 7), 1.0)
    print("Cosine similarity of images:", max_similarity)

    # cosine similarity of text (first entry holds the highest score of the group)
    cs_text = image_data_list[0]['Cosine_Similarity']
    cs_text_rounded = min(round(float(cs_text), 7), 1.0)
    print("Cosine similarity of text:", cs_text_rounded)
    print()





Enter the image URL: https://images-na.ssl-images-amazon.com/images/I/71WPnpatx4L._SY88.jpg
Enter the review text: I bought this bass to split time as my primary bass with my Dean Edge. This might be winning me over. The bass boost is outstanding. The active pickups really allow you to adjust to the sound you want. I recommend this for anyone. If you're a beginner  like I was not too long ago, it's an excellent bass to start with. If you're on tour and/or music is making you money, this bass will be beatiful on stage. The color is a bit darker than in the picture. But, all around, this is a great buy.
Review: I bought this bass to split time as my primary bass with my Dean Edge. This might be winning me over. The bass boost is outstanding. The active pickups really allow you to adjust to the sound you want. I recommend this for anyone. If you're a beginner  like I was not too long ago, it's an excellent bass to start with. If you're on tour and/or music is making you money, this bass w