In [28]:
import ast
import math
import pickle
from collections import Counter
from io import BytesIO

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy.sparse import load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image


In [29]:
# Fetch the NLTK resources this notebook relies on: preprocess_text calls
# word_tokenize and stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
# Image feature extractor: ResNet50 with ImageNet weights, re-wired so that the
# model's output is the output of its second-to-last layer (layers[-2]).
# extract_image_features reads this module-level `model`.
base_model = ResNet50(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
def extract_image_features(image_url, timeout=10):
    """Download an image and return its L2-normalised feature vector.

    The image is fetched over HTTP, resized to 224x224, passed through
    `preprocess_input` and then through the module-level `model` (ResNet50
    cut at its second-to-last layer).

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a dead host.

    Returns:
        A 1-D numpy array holding the flattened model output, scaled to unit
        L2 norm. If the model output is all zeros it is returned unscaled
        rather than as NaNs.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the image decoder below.
    response.raise_for_status()

    img = image.load_img(BytesIO(response.content), target_size=(224, 224))
    img_array = image.img_to_array(img)
    # The model expects a batch dimension, so wrap the single image in one.
    expanded_img_array = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(expanded_img_array)
    features = model.predict(preprocessed_img)

    norm = np.linalg.norm(features)
    if norm == 0:
        # Dividing by zero would fill the vector with NaNs.
        return features.flatten()
    return (features / norm).flatten()

def preprocess_text(text):
    """Normalise a review into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic or that are English stop words are dropped; the remaining
    tokens are Porter-stemmed and joined with single spaces.

    Anything that is not a `str` yields the empty string.
    """
    if not isinstance(text, str):
        return ""

    candidate_tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    kept = []
    for token in candidate_tokens:
        # isalpha() discards punctuation and anything containing digits.
        if token.isalpha() and token not in english_stop_words:
            kept.append(stemmer.stem(token))
    return ' '.join(kept)


def compute_tfidf(corpus):
    """Fit a fresh TfidfVectorizer on `corpus`.

    Returns:
        A `(tfidf_matrix, vectorizer)` pair: the result of `fit_transform`
        on the corpus, and the fitted vectorizer itself so it can be reused.
    """
    fitted = TfidfVectorizer()
    return fitted.fit_transform(corpus), fitted


def tfidf(c):
    """Compute TF-IDF weights for a corpus of whitespace-tokenised documents.

    Definitions used:
        tf(word, doc) = occurrences of word in doc / number of tokens in doc
        idf(word)     = ln(number of documents / number of documents containing word)

    With this unsmoothed idf, a word that occurs in every document gets
    weight 0.

    Args:
        c: Iterable of document strings. Each is split on whitespace.

    Returns:
        A `(tfidf_matrix, idf)` pair. `tfidf_matrix` is a list with one
        `{word: weight}` dict per document, in input order; an empty document
        gives an empty dict. `idf` is a `{word: idf}` dict over the whole
        vocabulary.
    """
    tokenized_docs = [doc.split() for doc in c]
    num_docs = len(tokenized_docs)

    # Count each document once instead of calling list.count() per word.
    term_counts = [Counter(tokens) for tokens in tokenized_docs]
    tf = [
        {word: count / len(tokens) for word, count in counts.items()}
        for tokens, counts in zip(tokenized_docs, term_counts)
    ]

    # Document frequency: in how many documents each word appears at least once.
    doc_freq = Counter()
    for counts in term_counts:
        doc_freq.update(counts.keys())

    idf = {word: math.log(num_docs / freq) for word, freq in doc_freq.items()}

    tfidf_matrix = [
        {word: tf_value * idf[word] for word, tf_value in doc.items()}
        for doc in tf
    ]
    return tfidf_matrix, idf


In [46]:
# Build the image and text features for every row of the dataset and cache
# them on disk so later cells can load them instead of recomputing.
df = pd.read_csv('A2_Data_final.csv')

image_features = []
for urls in df['Image']:
    # Each 'Image' cell is text that is parsed into an iterable of URLs
    # (presumably a Python-literal list - confirm against the CSV).
    # ast.literal_eval only accepts literals, so a crafted cell cannot execute
    # arbitrary code the way eval() would.
    url_list = ast.literal_eval(urls)
    # One vector per row: the element-wise mean over all of the row's images.
    entry_features = np.mean([extract_image_features(url) for url in url_list], axis=0)
    image_features.append(entry_features)

processed_texts = [preprocess_text(text) for text in df['Review Text']]
tfidf_matrix, tfidf_vectorizer = compute_tfidf(processed_texts)

with open('image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# The fitted vectorizer is saved too, so new text can be projected into the
# same TF-IDF space later.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)




In [32]:
def find_top_similarities(input_vector, feature_matrix, top_n=3):
    """Rank rows of `feature_matrix` by cosine similarity to `input_vector`.

    The single highest-scoring row is skipped (presumably because the query
    is itself a row of the matrix - verify against callers); the next `top_n`
    rows are returned best-first.

    Returns:
        `(top_indices, top_scores)`: row indices and their similarity scores.
    """
    query_row = input_vector.reshape(1, -1)
    similarities = cosine_similarity(query_row, feature_matrix)
    ascending_order = similarities.argsort()[0]
    # Take the top_n entries just below the best one, then flip to best-first.
    runners_up = ascending_order[-top_n - 1:-1]
    top_indices = runners_up[::-1]
    return top_indices, similarities[0][top_indices]

def retrieve_similar_items(input_index, feature_matrix, is_text=False, top_n=3):
    """Find the items most similar to the item stored at `input_index`.

    The row at `input_index` is used as the query. When `is_text` is false the
    row is first reshaped to a single-row 2-D array; when true it is passed on
    exactly as indexed.

    Returns:
        The `(indices, similarities)` pair from `find_top_similarities`.
    """
    query_vector = feature_matrix[input_index]
    if not is_text:
        query_vector = query_vector.reshape(1, -1)
    return find_top_similarities(query_vector, feature_matrix, top_n)

def combined_retrieval(image_input_index, text_input_index, image_features, tfidf_matrix, top_n=3):
    """Rank items by the mean of their image and text cosine similarities.

    The image query is row `image_input_index` of `image_features`; the text
    query is row `text_input_index` of `tfidf_matrix`. The highest-scoring
    item is skipped and the next `top_n` are returned best-first.

    Returns:
        A list of `(index, scores)` tuples where `scores` is a dict with the
        keys 'image', 'text' and 'average'.
    """
    image_matrix = np.array(image_features)
    image_query = image_matrix[image_input_index].reshape(1, -1)
    image_similarities = cosine_similarity(image_query, image_matrix).flatten()

    text_query = tfidf_matrix[text_input_index]
    text_similarities = cosine_similarity(text_query, tfidf_matrix).flatten()

    combined_scores = (image_similarities + text_similarities) / 2

    ascending_order = np.argsort(combined_scores)
    results = []
    # Slice out the top_n entries just below the best one, best-first.
    for idx in ascending_order[-top_n - 1:-1][::-1]:
        scores = {
            'image': image_similarities[idx],
            'text': text_similarities[idx],
            'average': combined_scores[idx],
        }
        results.append((idx, scores))
    return results



In [33]:
def load_data():
    """Read 'A2_Data_final.csv' from the working directory into a DataFrame."""
    return pd.read_csv('A2_Data_final.csv')

def load_features():
    """Load the three cached feature artefacts from the working directory.

    Returns:
        `(image_features, tfidf_matrix, tfidf_vectorizer)`, unpickled from
        'image_features.pkl', 'tfidf_matrix.pkl' and 'tfidf_vectorizer.pkl'
        in that order.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load pickles that were produced by a trusted run of this notebook.
    artefact_paths = ('image_features.pkl', 'tfidf_matrix.pkl', 'tfidf_vectorizer.pkl')
    loaded = []
    for path in artefact_paths:
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    image_features, tfidf_matrix, tfidf_vectorizer = loaded
    return image_features, tfidf_matrix, tfidf_vectorizer


In [50]:
def _print_ranked_results(df, ranked_indices, image_similarities, text_similarities, composite_scores):
    """Print one result entry per index, each with that item's own three scores."""
    for idx in ranked_indices:
        print(f"Image URL: {df['Image'].iloc[idx]}")
        print(f"Review: {df['Review Text'].iloc[idx]}")
        print(f"Cosine similarity of images - {image_similarities[idx]:.4f}")
        print(f"Cosine similarity of text - {text_similarities[idx]:.4f}")
        print(f"Composite similarity score: {composite_scores[idx]:.4f}")
        print()


def main(top_n=3):
    """Prompt for an (image URL, review text) query and print the best matches.

    The query image is embedded with `extract_image_features` and the query
    review with `preprocess_text` plus the cached TF-IDF vectorizer, so both
    live in the same feature spaces as the stored items. Every stored item
    gets an image similarity, a text similarity and their mean (the composite
    score). Two rankings are printed: the `top_n` items by image similarity
    and the `top_n` items by text similarity; each entry shows all three
    scores for that same item.

    If the query is itself an item of the dataset, it is expected to appear
    as the first hit of each ranking.

    Args:
        top_n: Number of results to print per ranking.
    """
    df = load_data()
    image_features, tfidf_matrix, tfidf_vectorizer = load_features()

    # Sample query for a quick manual check:
    #   image:  https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
    #   review: I have been using Fender locking tuners for about five years on
    #           various strats and teles. Definitely helps with tuning stability
    #           and way faster to restring if there is a break.
    image_url = input("Please enter the image URL: ")
    review_text = input("Please enter the review text: ")

    print("Image URL:", image_url)
    print("Review Text:", review_text)

    print("6. Sample Test Case:")
    print("a. Input:")
    print("Image and Text Query Input:")
    print("Image:")
    print(image_url)
    print("Review:", review_text)

    # Embed the query exactly the way the stored items were embedded.
    image_matrix = np.array(image_features)
    query_image_vector = extract_image_features(image_url).reshape(1, -1)
    # The vectorizer was fitted on preprocessed text, so the query must go
    # through the same preprocessing before being transformed.
    query_text_vector = tfidf_vectorizer.transform([preprocess_text(review_text)])

    image_similarities = cosine_similarity(query_image_vector, image_matrix).flatten()
    text_similarities = cosine_similarity(query_text_vector, tfidf_matrix).flatten()
    composite_scores = (image_similarities + text_similarities) / 2

    # argsort is ascending; reverse for best-first and keep the first top_n.
    top_by_image = np.argsort(image_similarities)[::-1][:top_n]
    top_by_text = np.argsort(text_similarities)[::-1][:top_n]

    print("b. Output:")
    print("—--------------------------------------------------------------------------------------------")
    print("USING IMAGE RETRIEVAL")
    _print_ranked_results(df, top_by_image, image_similarities, text_similarities, composite_scores)

    print("—----------------------------------------------------------------------------------------------")
    print("USING TEXT RETRIEVAL")
    _print_ranked_results(df, top_by_text, image_similarities, text_similarities, composite_scores)


if __name__ == "__main__":
    main()

Please enter the image URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Please enter the review text: I have been using Fender locking tuners for about five years on various \     # strats and teles. Definitely helps with tuning stability and way faster to restring if \     # there is a break
Image URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Review Text: I have been using Fender locking tuners for about five years on various \     # strats and teles. Definitely helps with tuning stability and way faster to restring if \     # there is a break
6. Sample Test Case:
a. Input:
Image and Text Query Input:
Image:
https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Review: I have been using Fender locking tuners for about five years on various \     # strats and teles. Definitely helps with tuning stability and way faster to restring if \     # there is a break
b. Output:
—-------------------------------------------