In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import edgedb
import json
import numpy as np
import os
import time
import textwrap
import ast
import warnings
warnings.filterwarnings('ignore')

from typing import List
import openai
from openai.embeddings_utils import cosine_similarity, distances_from_embeddings, indices_of_nearest_neighbors_from_distances
# Read the key once and fail with an actionable message when it is missing.
# Writing os.getenv(...) back into os.environ is a no-op when the variable is
# set, and raises an opaque "str expected, not NoneType" TypeError when it is not.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise EnvironmentError("OPENAI_API_KEY is not set; export it before running this notebook.")
openai.api_key = _openai_api_key
import tiktoken

# Inserting

In [None]:
# Load the final product table and the final reviews table from CSV, in that order
import_product_df, import_reviews_df = (
    pd.read_csv(csv_name)
    for csv_name in ('final_full_product_df.csv', 'final_reviews_df.csv')
)

In [None]:
# (rows, columns) of each loaded table: products first, then reviews
for loaded_frame in (import_product_df, import_reviews_df):
    print(loaded_frame.shape)

(908, 25)
(11750, 10)


# Create SQL db

In [None]:
import_product_df.columns.tolist()

['asin',
 'title_text',
 'category',
 'Series',
 'Brand',
 'Item model number',
 'Operating System',
 'price',
 'RAM',
 'Hard Drive',
 'Processor Brand',
 'Processor',
 'Chipset Brand',
 'Graphics Coprocessor',
 'bestseller',
 'seller_text',
 'url',
 'stars',
 'reviewsCount',
 'thumbnailImage',
 'variantAsins',
 'embedding',
 'np_embedding',
 'combined_embedding',
 'similarity']

In [None]:
# Row counts of the two loaded tables, reviews first
review_count, product_count = len(import_reviews_df), len(import_product_df)
print(f'reviews: {review_count}')
print(f'products: {product_count}')

reviews: 11750
products: 908


# Map dirty to cleaned

### numPeopleFoundHelpful

In [None]:
# Load the "dirtier" product and review spreadsheets, reporting each row count
# right after its sheet is read.
dirtier_products = pd.read_excel('full_product_data.xlsx')
print(f'dirtier_products: {dirtier_products.shape[0]}')
dirtier_reviews = pd.read_excel('final_reviews_data.xlsx')
print(f'dirtier_reviews: {dirtier_reviews.shape[0]}')

dirtier_products: 1049
dirtier_reviews: 22846


In [None]:
dirtier_products.columns.tolist()

['asin',
 'title_text',
 'category',
 'Series',
 'Brand',
 'Item model number',
 'Operating System',
 'price',
 'RAM',
 'Hard Drive',
 'Processor Brand',
 'Processor',
 'Chipset Brand',
 'Graphics Coprocessor',
 'bestseller',
 'seller_text',
 'url',
 'stars',
 'reviewsCount',
 'thumbnailImage',
 'variantAsins']

In [None]:
dirtier_reviews.columns.tolist()

['productAsin',
 'ratingScore',
 'reviewTitle',
 'reviewUrl',
 'reviewReaction',
 'date',
 'reviewDescription',
 'all_review_text',
 'wavgHelpfulness']

In [None]:
dirtier_reviews.loc[0:10,'reviewReaction']

0                               NaN
1                               NaN
2                               NaN
3                               NaN
4       5 people found this helpful
5                               NaN
6                               NaN
7       5 people found this helpful
8                               NaN
9     One person found this helpful
10                              NaN
Name: reviewReaction, dtype: object

In [None]:
# Read the reviews spreadsheet again (same file as the earlier load), replacing
# whatever dirtier_reviews currently holds with a fresh copy.
dirtier_reviews = pd.read_excel('final_reviews_data.xlsx')
print(f'dirtier_reviews: {dirtier_reviews.shape[0]}')

dirtier_reviews: 22846


In [None]:
def _helpful_votes(reaction):
    """Turn a `reviewReaction` string into an integer count of helpful votes.

    '' (no reaction)                  -> 0
    'One person found this helpful'   -> 1
    '1,234 people found this helpful' -> 1234 (thousands separators removed)
    """
    if reaction == '':
        return 0
    if reaction.startswith('One'):
        return 1
    return int(reaction.split(' ')[0].replace(',', ''))


display(dirtier_reviews[['productAsin','reviewReaction']])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form does not update the frame under
# pandas Copy-on-Write, which would leave NaN values for the parser below.
dirtier_reviews['reviewReaction'] = dirtier_reviews['reviewReaction'].fillna('')
dirtier_reviews['numPeopleFoundHelpful'] = dirtier_reviews['reviewReaction'].apply(_helpful_votes)
display(dirtier_reviews.loc[0:10,'reviewReaction'])

# totalHelpful is the per-product sum of (votes + 1): every review contributes
# at least 1, so a product whose reviews have no votes still gets a non-zero total.
smoothed_votes = dirtier_reviews['numPeopleFoundHelpful'] + 1
dirtier_reviews['totalHelpful'] = smoothed_votes.groupby(dirtier_reviews['productAsin']).transform('sum')
display(dirtier_reviews[['productAsin','numPeopleFoundHelpful', 'totalHelpful']])

Unnamed: 0,productAsin,reviewReaction
0,B0BYJWRSC1,
1,B0BYJWRSC1,
2,B0BYJWRSC1,
3,B0BT6W36HL,
4,B0BT6W36HL,5 people found this helpful
...,...,...
22841,B08MMQH98H,2 people found this helpful
22842,B08MMQH98H,
22843,B08MMQH98H,4 people found this helpful
22844,B08MMQH98H,


0                                  
1                                  
2                                  
3                                  
4       5 people found this helpful
5                                  
6                                  
7       5 people found this helpful
8                                  
9     One person found this helpful
10                                 
Name: reviewReaction, dtype: object

Unnamed: 0,productAsin,numPeopleFoundHelpful,totalHelpful
0,B0BYJWRSC1,0,3
1,B0BYJWRSC1,0,3
2,B0BYJWRSC1,0,3
3,B0BT6W36HL,0,7
4,B0BT6W36HL,5,7
...,...,...,...
22841,B08MMQH98H,2,90
22842,B08MMQH98H,0,90
22843,B08MMQH98H,4,90
22844,B08MMQH98H,0,90


In [None]:
dirtier_reviews.loc[(dirtier_reviews.productAsin=='B0BYJWRSC1'),:]

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewUrl,reviewReaction,date,reviewDescription,all_review_text,wavgHelpfulness,numPeopleFoundHelpful,totalHelpful
0,B0BYJWRSC1,4,Painful Laptop,https://www.amazon.com/gp/customer-reviews/R6N...,,2023-05-06,This is a fast and elegant looking Windows 11 ...,Painful Laptop. This is a fast and elegant loo...,0.333333,0,3
1,B0BYJWRSC1,5,Has all the features,https://www.amazon.com/gp/customer-reviews/R39...,,2023-04-20,This laptop is fast! 64 mb of ram really crank...,Has all the features. This laptop is fast! 64 ...,0.333333,0,3
2,B0BYJWRSC1,5,Great Service,https://www.amazon.com/gp/customer-reviews/R3V...,,2023-04-26,Fast delivery and product exactly as described,Great Service. Fast delivery and product exact...,0.333333,0,3


In [None]:
dirtier_products.head()

Unnamed: 0,asin,title_text,category,Series,Brand,Item model number,Operating System,price,RAM,Hard Drive,Processor Brand,Processor,Chipset Brand,Graphics Coprocessor,bestseller,seller_text,url,stars,reviewsCount,thumbnailImage,variantAsins
0,B099P4T81H,"HP Chromebase 21.5"" All-in-One Desktop, Intel ...",Desktops,22-aa0022,HP,22-aa0022,chrome os,545.0,4 GB DDR4,128 GB SSD,Intel,2.4 GHz pentium_gold_g5600,Intel,UHD Graphics 600,0,FLEXIBLE FAMILY FUN Designed to live at the h...,https://www.amazon.com/dp/B099P4T81H,4.3,264,https://m.media-amazon.com/images/I/81w3miL-DH...,[]
1,B09YVWMLBP,Dell 2022 Newest Optiplex 3090 Micro Form Fact...,Desktops,Optiplex,Dell,3090,Windows,624.13,16 GB DDR4,512 GB SSD,Intel,2.3 GHz core_i5,Intel,UHD Graphics,0,High Speed RAM And Enormous Space16GB high-ban...,https://www.amazon.com/dp/B09YVWMLBP,5.0,10,https://m.media-amazon.com/images/I/61TIHYXkb4...,"['B0B2VB5ZT1', 'B0B2V1BJYX', 'B09YVWMLBP', 'B0..."
2,B0BS2LCB1X,2018 Apple Mac Mini with 3.2GHz Intel Core i7 ...,Desktops,Apple Mac Mini,Apple,MRTT2LL/A,macOS,515.0,DDR4,128 GB SSD,Intel,3.2 GHz apple_ci7,Intel,UHD Graphics 630,0,"This pre-owned product is not Apple certified,...",https://www.amazon.com/dp/B0BS2LCB1X,4.6,29,https://m.media-amazon.com/images/I/61mujJvG+C...,[]
3,B0BWPKK7RN,Dell OptiPlex 7080 Micro Form Factor Mini Busi...,Desktops,OptiPlex,Dell,7080,Windows,599.0,32 GB DDR4,1 TB SSD,Intel,2.3 GHz core_i5,Intel,UHD Graphics,0,High Speed RAM And Enormous Space32GB high-ban...,https://www.amazon.com/dp/B0BWPKK7RN,4.1,13,https://m.media-amazon.com/images/I/51qO-k6MY1...,"['B0BWQ3F343', 'B0BX21XTPP', 'B0BWQ1PXL3', 'B0..."
4,B0BM8YLTH8,[Gaming PC] KAMRUI Mini PC AMD Ryzen 5 5600U U...,Desktops,AMR5-Ryzen 5 5600U,KAMRUI,AMR5,Windows,479.0,16 GB DDR4,"512 GB 512GB M.2 SSD Included, Support NVME/NG...",AMD,4.2 GHz ryzen_5,AMD,Radeon Vega 7,1,THE KEY TO VICTORYThe KAMRUI AMR5 mini gaming ...,https://www.amazon.com/dp/B0BM8YLTH8,4.5,166,https://m.media-amazon.com/images/I/61oUaIuI0A...,"['B0BM8YLTH8', 'B0BX82ZBMG']"


In [None]:
# Lookup table: reviewUrl -> numPeopleFoundHelpful.
# NOTE: despite the "pAsin" in the name, the values are helpful-vote counts,
# not product ASINs.
# dict() keeps only the last value for a repeated key, so rows sharing a
# reviewUrl collapse into a single entry.
url_pAsin_dict = dict(zip(dirtier_reviews['reviewUrl'], dirtier_reviews.numPeopleFoundHelpful))
url_pAsin_dict

In [None]:
len(url_pAsin_dict)

21499

In [None]:
# Count lookup URLs that do not appear in the cleaned reviews table.
# The set is built once; the previous form rebuilt a full list of cleaned URLs
# for every key and then scanned it linearly.
cleaned_review_urls = set(import_reviews_df['reviewUrl'])
len([u for u in url_pAsin_dict if u not in cleaned_review_urls])

9749

In [None]:
# Cross-check with the two counts printed above: 21499 distinct URLs in the
# lookup minus 9749 that are absent from the cleaned reviews.
21499-9749

11750

In [None]:
# Attach the helpful-vote count to each cleaned review by looking up its URL;
# Series.map leaves NaN wherever a URL has no entry in the dict.
helpful_by_url = import_reviews_df['reviewUrl'].map(url_pAsin_dict)
import_reviews_df['numPeopleFoundHelpful'] = helpful_by_url

In [None]:
import_reviews_df.head()

Unnamed: 0,productAsin,date,all_review_text,reviewTitle,reviewDescription,ratingScore,reviewUrl,wavgHelpfulness,embedding,np_embedding,numPeopleFoundHelpful
0,B09BS2LFBN,2023-04-24,"Great laptop, battery could be better. Great l...","Great laptop, battery could be better","Great laptop, battery could be better",4,https://www.amazon.com/gp/customer-reviews/R28...,0.0625,"[0.010526235, 0.015237368000000001, 0.00765077...",[ 0.01052624 0.01523737 0.00765078 ... -0.01...,0
1,B09BS2LFBN,2023-03-25,A pleasant laptop with a lot of Horse Power!. ...,A pleasant laptop with a lot of Horse Power!,"The laptop is fast, intuitive and does what i ...",5,https://www.amazon.com/gp/customer-reviews/R2X...,0.0625,"[-0.015369734000000001, 0.006135603000000001, ...",[-0.01536973 0.0061356 0.00851934 ... -0.02...,0
2,B09BS2LFBN,2020-10-27,Awesome Laptop. I purchased this for college o...,Awesome Laptop,I purchased this for college online. I will be...,5,https://www.amazon.com/gp/customer-reviews/RP6...,0.375,"[0.0061323424, 0.0032608750000000003, 0.011302...",[ 0.00613234 0.00326088 0.01130262 ... -0.02...,5
3,B09BS2LFBN,2021-07-02,New computer.. We have only had this a week. I...,New computer.,We have only had this a week. It loaded progra...,4,https://www.amazon.com/gp/customer-reviews/R38...,0.0625,"[-0.0074131703, -0.0014511476, 0.0050639263, -...",[-0.00741317 -0.00145115 0.00506393 ... -0.02...,0
4,B09BS2LFBN,2021-09-02,So far so good.. So far this has done what I h...,So far so good.,So far this has done what I have needed it to ...,5,https://www.amazon.com/gp/customer-reviews/RR0...,0.125,"[-0.016678335000000002, 0.006138554300000001, ...",[-0.01667834 0.00613855 -0.00290198 ... -0.02...,1


Download each product's image from its thumbnail URL and store it locally, so images load faster.

In [None]:
import requests
from tqdm import tqdm

# Create the target folder once; it does not depend on the product being downloaded.
os.makedirs('assets', exist_ok=True)

# Save every product thumbnail as assets/<asin>.jpg.
# The loop variable is `asin` rather than `id`, so the builtin id() is not shadowed.
for asin, url in zip(import_product_df.asin, import_product_df.thumbnailImage):
    picture_name = f'{asin}.jpg'
    # `with` releases the streamed connection even if writing fails; the timeout
    # keeps one stalled request from hanging the whole run.
    with requests.get(url, stream=True, timeout=30) as response:
        # Stop on HTTP errors instead of saving an error page under a .jpg name.
        response.raise_for_status()
        # A missing Content-Length becomes None so tqdm shows a plain byte counter.
        file_size = int(response.headers.get("Content-Length", 0)) or None
        with open(os.path.join('assets', picture_name), 'wb') as f, \
                tqdm(desc=f"Downloading {picture_name}", total=file_size, unit="B", unit_scale=True, unit_divisor=1024) as progress:
            for data in response.iter_content(1024):
                f.write(data)
                progress.update(len(data))

# Pure-Python replacement for the `!mkdir` shell escape; exist_ok=True makes
# re-running the cell harmless when the folder is already there.
os.makedirs('assets_resized', exist_ok=True)
from PIL import Image

def get_most_common_size(directory):
    """Return the (width, height) that occurs most often among images in `directory`.

    Every entry returned by ``os.listdir(directory)`` is opened with PIL, so the
    folder is expected to contain image files only.

    Args:
        directory: path of the folder to scan.

    Returns:
        The most frequent ``img.size`` tuple. On a tie, the size encountered
        first while scanning the filenames in sorted order is returned.

    Raises:
        ValueError: if the directory has no entries.
    """
    from collections import Counter  # local import keeps this cell self-contained

    size_counts = Counter()
    # Sorted so that tie-breaking does not depend on the OS's listing order.
    for filename in sorted(os.listdir(directory)):
        with Image.open(os.path.join(directory, filename)) as img:
            size_counts[img.size] += 1

    if not size_counts:
        raise ValueError(f"no images found in {directory!r}")
    # max() yields the first key (insertion order) that holds the highest count.
    return max(size_counts, key=size_counts.get)

def resize_image(input_image_path, output_image_path, size):
    """Scale an image to fit inside `size` and save it centred on a white canvas.

    The aspect ratio is preserved: the image is scaled by the largest single
    factor at which both dimensions still fit, then pasted in the middle of a
    white RGB canvas of exactly `size`.

    Args:
        input_image_path: path of the image to read.
        output_image_path: path the padded result is written to.
        size: (max_width, max_height) of the output canvas, in pixels.
    """
    max_width, max_height = size

    # The context manager closes the source file handle once resizing is done.
    with Image.open(input_image_path) as original_image:
        width, height = original_image.size
        # calculate ratio
        ratio = min(max_width / width, max_height / height)
        # Clamp to 1px so an extreme aspect ratio never produces a zero-sized resize.
        new_width = max(1, int(width * ratio))
        new_height = max(1, int(height * ratio))
        resized_image = original_image.resize((new_width, new_height))

    # create new image with white background
    new_image = Image.new("RGB", size, (255, 255, 255))
    # paste resized image into new image, centred
    upper_left = (max_width - new_width) // 2, (max_height - new_height) // 2
    if resized_image.mode in ("RGBA", "LA") or "transparency" in resized_image.info:
        # Use the alpha channel as the paste mask so transparent pixels keep the
        # white background instead of whatever colour sits under the alpha.
        resized_image = resized_image.convert("RGBA")
        new_image.paste(resized_image, upper_left, mask=resized_image)
    else:
        new_image.paste(resized_image, upper_left)

    new_image.save(output_image_path)

# Source folder with the downloads, and destination folder for the resized copies
directory1 = "assets"
directory2 = "assets_resized"

# The size shared by the largest number of downloaded images becomes the target size
most_common_size = get_most_common_size(directory1)

# Write a resized copy of every file in the source folder, keeping its filename
for filename in os.listdir(directory1):
    source_path = os.path.join(directory1, filename)
    target_path = os.path.join(directory2, filename)
    resize_image(source_path, target_path, most_common_size)


# Upsert to pinecone

In [None]:
import json
from pprint import pprint
import os
import pandas as pd
import tiktoken

# from langchain.docstore.document import Document
# from langchain.document_loaders import ApifyDatasetLoader
# from langchain.indexes import VectorstoreIndexCreator
# from langchain.embeddings.openai import OpenAIEmbeddings

# Credentials are read from the environment; each is None when its variable is unset
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
APIFY_API_TOKEN = os.environ.get("APIFY_API_TOKEN")


In [None]:
import pinecone
# from langchain.vectorstores import Pinecone
# from sentence_transformers import SentenceTransformer

# Pinecone settings are mandatory: indexing os.environ raises KeyError as soon
# as one of these variables is not set.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENVIRONMENT = os.environ['PINECONE_ENVIRONMENT']

In [None]:
import tiktoken

# encoding_for_model already returns the encoding object for the model, so there
# is no need to take its .name and look the same encoding up again.
tokenizer = tiktoken.encoding_for_model('gpt-4')

# length function: number of tokens in a piece of text
def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    The text is encoded with ``disallowed_special=()``; per tiktoken's
    documentation this turns off the check that rejects special-token strings.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

'cl100k_base'

In [None]:
import_product_df.columns.tolist()

['asin',
 'title_text',
 'category',
 'Series',
 'Brand',
 'Item model number',
 'Operating System',
 'price',
 'RAM',
 'Hard Drive',
 'Processor Brand',
 'Processor',
 'Chipset Brand',
 'Graphics Coprocessor',
 'bestseller',
 'seller_text',
 'url',
 'stars',
 'reviewsCount',
 'thumbnailImage',
 'variantAsins',
 'embedding',
 'np_embedding',
 'combined_embedding',
 'similarity']

In [None]:
import_product_df.head()

Unnamed: 0,asin,title_text,category,Series,Brand,Item model number,Operating System,price,RAM,Hard Drive,Processor Brand,Processor,Chipset Brand,Graphics Coprocessor,bestseller,seller_text,url,stars,reviewsCount,thumbnailImage,variantAsins,embedding,np_embedding,combined_embedding,similarity
0,B099P4T81H,"HP Chromebase 21.5"" All-in-One Desktop, Intel ...",Desktops,22-aa0022,HP,22-aa0022,chrome os,545.0,4 GB DDR4,128 GB SSD,Intel,2.4 GHz pentium_gold_g5600,Intel,UHD Graphics 600,0,FLEXIBLE FAMILY FUN Designed to live at the h...,https://www.amazon.com/dp/B099P4T81H,4.3,264,https://m.media-amazon.com/images/I/81w3miL-DH...,[],"[-0.0008702780469320714, 0.008141257800161839,...",[-0.00087028 0.00814126 0.00690294 ... 0.00...,"[-0.001907029206891233, 0.005891132240805329, ...",0.781002
1,B09YVWMLBP,Dell 2022 Newest Optiplex 3090 Micro Form Fact...,Desktops,Optiplex,Dell,3090,Windows,624.13,16 GB DDR4,512 GB SSD,Intel,2.3 GHz core_i5,Intel,UHD Graphics,0,High Speed RAM And Enormous Space16GB high-ban...,https://www.amazon.com/dp/B09YVWMLBP,5.0,10,https://m.media-amazon.com/images/I/61TIHYXkb4...,"['B0B2VB5ZT1', 'B0B2V1BJYX', 'B09YVWMLBP', 'B0...","[-0.0066131725907325745, 0.009173321537673473,...",[-0.00661317 0.00917332 0.00791616 ... -0.03...,"[-0.010428175670366288, 0.0038438382000867367,...",0.768828
2,B0BS2LCB1X,2018 Apple Mac Mini with 3.2GHz Intel Core i7 ...,Desktops,Apple Mac Mini,Apple,MRTT2LL/A,macOS,515.0,DDR4,128 GB SSD,Intel,3.2 GHz apple_ci7,Intel,UHD Graphics 630,0,"This pre-owned product is not Apple certified,...",https://www.amazon.com/dp/B0BS2LCB1X,4.6,29,https://m.media-amazon.com/images/I/61mujJvG+C...,[],"[0.015111690387129784, 0.0035265081096440554, ...",[ 0.01511169 0.00352651 -0.01433398 ... -0.00...,"[0.0016819539914815574, -0.0011620456422613066...",0.746716
3,B0BWPKK7RN,Dell OptiPlex 7080 Micro Form Factor Mini Busi...,Desktops,OptiPlex,Dell,7080,Windows,599.0,32 GB DDR4,1 TB SSD,Intel,2.3 GHz core_i5,Intel,UHD Graphics,0,High Speed RAM And Enormous Space32GB high-ban...,https://www.amazon.com/dp/B0BWPKK7RN,4.1,13,https://m.media-amazon.com/images/I/51qO-k6MY1...,"['B0BWQ3F343', 'B0BX21XTPP', 'B0BWQ1PXL3', 'B0...","[0.0010904214577749372, 0.006660059094429016, ...",[ 0.00109042 0.00666006 0.00955261 ... -0.01...,"[0.0031396948488874687, -0.007468305452785493,...",0.768732
4,B0BM8YLTH8,[Gaming PC] KAMRUI Mini PC AMD Ryzen 5 5600U U...,Desktops,AMR5-Ryzen 5 5600U,KAMRUI,AMR5,Windows,479.0,16 GB DDR4,"512 GB 512GB M.2 SSD Included, Support NVME/NG...",AMD,4.2 GHz ryzen_5,AMD,Radeon Vega 7,1,THE KEY TO VICTORYThe KAMRUI AMR5 mini gaming ...,https://www.amazon.com/dp/B0BM8YLTH8,4.5,166,https://m.media-amazon.com/images/I/61oUaIuI0A...,"['B0BM8YLTH8', 'B0BX82ZBMG']","[-0.010817659087479115, 0.00640911515802145, 0...",[-0.01081766 0.00640912 0.00751208 ... -0.03...,"[-0.006977158365810986, 0.002352492021153582, ...",0.788346


productAsin 0
date 0
all_review_text 0
reviewTitle 0
reviewDescription 0
ratingScore 0
reviewUrl 0
wavgHelpfulness 0
embedding 0
np_embedding 0
numPeopleFoundHelpful 0


In [None]:
['asin',
 'title_text',
 'category',
 'Series',
 'Brand',
 'Operating System',
 'price',
 'RAM',
 'Hard Drive',
 'Processor Brand',
 'Processor',
 'Chipset Brand',
 'Graphics Coprocessor',
 'bestseller',
 'seller_text',
 'url',
 'stars',
 'reviewsCount',

 'variantAsins',
 
 'embedding',
 'combined_embedding',
 ]

In [None]:
# Print the Python type of the value at index label 0 for each selected column
columns_to_inspect = [
    'asin', 'title_text', 'category', 'Series', 'Brand',
    'Operating System', 'price', 'RAM', 'Hard Drive',
    'Processor Brand', 'Processor', 'Chipset Brand', 'Graphics Coprocessor',
    'bestseller', 'seller_text', 'url', 'stars', 'reviewsCount',
    'variantAsins',
    'embedding', 'combined_embedding',
]
for column_name in columns_to_inspect:
    first_value = import_product_df[column_name][0]
    print(f'{column_name}: {type(first_value)}')

asin: <class 'str'>
title_text: <class 'str'>
category: <class 'str'>
Series: <class 'str'>
Brand: <class 'str'>
Operating System: <class 'str'>
price: <class 'numpy.float64'>
RAM: <class 'str'>
Hard Drive: <class 'str'>
Processor Brand: <class 'str'>
Processor: <class 'str'>
Chipset Brand: <class 'str'>
Graphics Coprocessor: <class 'str'>
bestseller: <class 'numpy.int64'>
seller_text: <class 'str'>
url: <class 'str'>
stars: <class 'numpy.float64'>
reviewsCount: <class 'numpy.int64'>
variantAsins: <class 'str'>
embedding: <class 'str'>
combined_embedding: <class 'str'>
