# Artificial Persuasive Intelligence

In [1]:
pip install chromadb

Collecting chromadb
  Using cached chromadb-1.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Using cached fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.23.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Using cached onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.31.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-instrumentation-fastapi>=0.41

## Libraries

In [3]:
from datasets import load_dataset, Value, Sequence, Features
import chromadb
import pandas as pd
import re
import numpy as np 
import chuck_gpt
import concurrent
import boto3
from datetime import datetime as dt
import inspect

from importlib import reload
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, Matern, DotProduct
from scipy.stats import ecdf, norm
import random
import requests
import json
import chuck_gpt

In [4]:
def extract_json(answer):
    """Parse a JSON-like object out of a model's text answer.

    The answer is cleaned (Markdown fence markers, a leading ``json`` language
    tag, newlines, right curly quotes and dollar signs are removed or
    normalised) and then evaluated as a Python expression.

    JSON ``null`` evaluates to ``np.nan``; ``true``/``false`` evaluate to
    ``True``/``False``. Text inside quoted keys and values is no longer
    rewritten (a value such as ``"nullable"`` or ``"json"`` is kept intact).

    Parameters
    ----------
    answer : str
        Raw text that is expected to contain a single JSON/Python literal,
        optionally wrapped in a Markdown code fence.

    Returns
    -------
    object
        Whatever the cleaned expression evaluates to (typically a dict or list).

    Raises
    ------
    SyntaxError, NameError
        If the cleaned text is not a valid expression.
    """
    # Strip only the fence's language tag instead of every "json" substring,
    # so keys and values that happen to contain "json" survive.
    cleaned_answer = re.sub(r'```\s*json\b', '', answer, flags=re.IGNORECASE)
    cleaned_answer = cleaned_answer.replace('`', '')
    # A bare leading "json" tag (fence already stripped upstream) is dropped too.
    cleaned_answer = re.sub(r'^\s*json\b', '', cleaned_answer, flags=re.IGNORECASE)
    cleaned_answer = cleaned_answer.replace("\n", ' ')
    cleaned_answer = cleaned_answer.replace("”", "\"")
    # Dollar signs are removed everywhere so values like $12.5 parse as numbers.
    cleaned_answer = cleaned_answer.replace("$", '')

    # JSON keywords are resolved as names at evaluation time rather than by
    # textual substitution, which used to corrupt strings containing "null".
    json_names = {'np': np, 'null': np.nan, 'true': True, 'false': False}

    # SECURITY NOTE(review): eval() executes arbitrary Python. The input here is
    # model-generated text; kept because callers may rely on Python-literal
    # parsing (single quotes, np.nan), but this should not see untrusted input.
    return eval(cleaned_answer, json_names)
    
def remove_name(name, text):
    """Strip a speaker prefix from the first line and indent the remaining lines.

    Blank lines are dropped. Every occurrence of ``"<name>: "`` in the first
    non-empty line is removed, and each following line is prefixed with five
    spaces.

    The name is matched literally, so names containing regex metacharacters
    (``.``, ``(``, ``+`` ...) are handled correctly.

    Parameters
    ----------
    name : str
        Speaker name to remove.
    text : str
        Multi-line text whose first line may start with ``"<name>: "``.

    Returns
    -------
    str
        The first line without the name, a newline, then the indented
        remainder; ``""`` when ``text`` has no non-empty lines.
    """
    lines = [line for line in text.split("\n") if line]
    if not lines:
        # Nothing to format; avoids an IndexError on empty input.
        return ""
    first_line = lines[0].replace(name + ': ', '')
    continuation = ['     ' + line for line in lines[1:]]
    return first_line + "\n" + "\n".join(continuation)

def subsample(df, n, random_state=None):
    """Format up to ``n`` reviews from ``df`` as a single text block.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title``, ``text`` and ``rating`` columns.
    n : int
        Maximum number of reviews to include. When ``df`` has more than ``n``
        rows, ``n`` rows are drawn at random; otherwise all rows are used in
        their existing order.
    random_state : int or numpy.random.Generator, optional
        Forwarded to ``DataFrame.sample`` so the draw can be made reproducible.
        The default (``None``) keeps the previous unseeded behaviour.

    Returns
    -------
    str
        One ``"Review: ...\\n- ...\\nRating: ...\\n"`` entry per review, joined
        by newlines.
    """
    sampled = df.sample(n, random_state=random_state) if len(df) > n else df
    return "\n".join(
        "Review: {}\n- {}\nRating: {}\n".format(title, body, rating)
        for title, body, rating in zip(sampled['title'], sampled['text'], sampled['rating'])
    )

## Load Data from Hugging Face

In [5]:
# Product metadata for the Appliances category of Amazon-Reviews-2023.
# (To switch category, use the "raw_meta_All_Beauty" config instead.)
metadata = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Appliances",
    split="full",
)


Reusing dataset amazon_review2023 (/home/sagemaker-user/.cache/huggingface/datasets/McAuley-Lab___amazon_review2023/raw_meta_Appliances/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8)


In [6]:
# Explicit schema passed to load_dataset() when reading the review records.
features_example = Features(
    {
        'rating': Value(dtype='float'),
        'title': Value(dtype='string'),
        'text': Value(dtype='string'),
        # The only non-scalar field: a sequence of strings.
        'images': Sequence(feature=Value(dtype='string'), id=None),
        'asin': Value(dtype='string'),
        'parent_asin': Value(dtype='string'),
        'user_id': Value(dtype='string'),
        'timestamp': Value(dtype='int64'),
        'helpful_vote': Value(dtype='int32'),
        'verified_purchase': Value(dtype='bool'),
    }
)

### Just use products with descriptions

In [7]:
# Review records for the Appliances category, read with the explicit schema
# defined in `features_example`.
# (To switch category, use the "raw_review_All_Beauty" config instead.)
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Appliances",
    features=features_example,
)


Using custom data configuration raw_review_Appliances-8e916b816ac3c3e0
Reusing dataset amazon_review2023 (/home/sagemaker-user/.cache/huggingface/datasets/McAuley-Lab___amazon_review2023/raw_review_Appliances-8e916b816ac3c3e0/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Work with at most the first 300,000 rows of each table.
reviews_df = pd.DataFrame(dataset['full'][0:300000])
products_df = pd.DataFrame(metadata[:300000])

# Join reviews to their product record on the shared parent ASIN; columns
# present in both tables get pandas' default _x / _y suffixes.
df = pd.merge(reviews_df, products_df, on='parent_asin')
df.shape

(300000, 25)

In [15]:
# Export just the review fields needed downstream; the index is not written.
reviews_df.to_csv(
    'appliances_reviews.csv',
    columns=['parent_asin', 'title', 'text', 'rating'],
    index=False,
)

In [29]:
# Persistent Chroma clients, each backed by its own on-disk directory.
# NOTE(review): these are absolute paths under one user's home directory;
# consider deriving them from a configurable base directory so the notebook
# runs elsewhere.
#client = chromadb.PersistentClient(path="/home/sagemaker-user/user-default-efs/projects/persuasio/chroma_dbs")
client = chromadb.PersistentClient(path="/home/sagemaker-user/user-default-efs/projects/persuasio/persuasio/chroma_appliances")
client2 = chromadb.PersistentClient(path="/home/sagemaker-user/user-default-efs/projects/persuasio/persuasio/chroma_appliances2")
client3 = chromadb.PersistentClient(path="/home/sagemaker-user/user-default-efs/projects/persuasio/persuasio/chroma_appliances3")


In [17]:
# Keep only rows whose product description is non-empty.
has_description = [len(description) > 0 for description in df['description']]
with_description_df = df[has_description]

# One row per product (first occurrence of each parent ASIN), restricted to the
# product-level columns; the *_y columns come from the product side of the merge.
description_columns = [
    'asin', 'parent_asin', 'description', 'average_rating',
    'title_y', 'price', 'rating_number', 'images_y',
]
descriptions = with_description_df[description_columns].drop_duplicates(subset='parent_asin')
descriptions.shape

(25005, 8)

### Vector DB

In [30]:
# Handles to the description collections, one per persistent client.
description_db = client.get_collection(name="amazon_appliances_descriptions")
description_db2 = client2.get_collection(name="amazon_appliances_descriptions2")
# get_or_create_collection keeps this cell re-runnable: create_collection fails
# once the collection already exists on disk (e.g. after a kernel restart).
description_db3 = client3.get_or_create_collection(name="amazon_appliances_descriptions3")
print(description_db.count(), description_db2.count(), description_db3.count())


5000 5000 0


In [31]:
start = 10000
end = 15000

# Slice the batch once and read every field from the same window.
batch_df = descriptions.iloc[start:end]

description_db3.add(
    # Only the first entry of each product's description list is embedded.
    documents=[parts[0] for parts in batch_df['description']],
    # parent_asin is used as the document id (makes lookups easier for now).
    ids=list(map(str, batch_df['parent_asin'])),
    # Image URL lists are flattened to newline-joined strings; the 'hi_res'
    # image list is intentionally not stored. (Metadata could be pulled later.)
    metadatas=[
        {
            'average_rating': avg_rating,
            'price': price,
            'rating_number': n_ratings,
            'large': "\n".join(images['large']),
            'thumb': "\n".join(images['thumb']),
            'title': title,
        }
        for avg_rating, price, n_ratings, images, title in zip(
            batch_df['average_rating'],
            batch_df['price'],
            batch_df['rating_number'],
            batch_df['images_y'],
            batch_df['title_y'],
        )
    ],
)