<a href="https://colab.research.google.com/github/rabbitmetrics/cx-analytics/blob/main/notebooks/cx-analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Link to the data https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from dotenv import load_dotenv,find_dotenv

# Find a .env file and load its variables (e.g. the keys read with os.getenv further down)
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [None]:
# Extract data from files

def _read_json_lines(path):
    """Parse a gzip-compressed file that holds one JSON document per line.

    Parameters
    ----------
    path : str
        Path of the ``.json.gz`` file to read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.
    """
    records = []
    with gzip.open(path) as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            # A blank (e.g. trailing) line is not valid JSON and would make json.loads raise
            if raw_line:
                records.append(json.loads(raw_line))
    return records


# Review records
data = _read_json_lines('AMAZON_FASHION.json.gz')

# Product metadata records
metadata = _read_json_lines('meta_AMAZON_FASHION.json.gz')

In [None]:
# Load the data to dataframes

# Reviews: keep only the rows that have a review text. The explicit .copy() makes
# df an independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = pd.DataFrame(data)
df = df[df['reviewText'].notna()].copy()

# Product metadata, one row per record read from the metadata file
df_meta = pd.DataFrame(metadata)

In [None]:
# Truncate the reviewText

max_text_length = 400


def truncate_review(text, max_length=None):
    """Return the leading characters of a review text.

    Parameters
    ----------
    text : str
        Review text to shorten.
    max_length : int, optional
        Maximum number of characters to keep. When omitted, the notebook-level
        ``max_text_length`` is used.

    Returns
    -------
    str
        ``text`` cut down to at most the chosen number of characters.
    """
    limit = max_text_length if max_length is None else max_length
    return text[:limit]


# Map over the single column instead of df.apply(..., axis=1): no per-row Series
# has to be built, and the result is a Series even when df has no rows.
df['truncated'] = df['reviewText'].map(truncate_review)

In [None]:
# Look for productIds with enough reviews:
# count the non-null entries of every column per product id ('asin') and
# order the products by their 'overall' count, smallest first.

counts_by_product = df.groupby('asin').count()
counts_by_product.sort_values(by='overall')

In [None]:
# Restrict the dataframe to the reviews of a single product id

product_mask = df['asin'].eq('B000KPIHQ4')
df = df[product_mask].copy()

In [None]:
# Import and apply embeddings from HuggingFace
# Warning! Be careful when/if applying embeddings from OpenAI like this - the full review dataframe is more than 800k rows.

from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

# Embed every truncated review with one batched embed_documents call instead of
# one embed_query call per row through df.apply(..., axis=1).
review_vectors = embeddings.embed_documents(df['truncated'].tolist())

# Wrap in a Series aligned on df's index so each cell holds one embedding list
df['embeddings'] = pd.Series(review_vectors, index=df.index, dtype=object)

In [None]:
# Prepare training and test sets for training Random Forest Regressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Features are the embedding vectors, the target is the 'overall' column;
# 20% of the rows are held out, with a fixed seed for the shuffle.
features = df['embeddings'].tolist()
target = df['overall']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1
)

In [None]:
# Train and calculate mean absolute error

from sklearn.ensemble import RandomForestRegressor

# Seed the forest (same seed value as the train/test split) so the reported
# error is reproducible from one run of the notebook to the next.
model = RandomForestRegressor(n_estimators=150, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Last expression of the cell: mean absolute error on the held-out set
mean_absolute_error(y_test, y_pred)

In [None]:
# Import Pinecone client

import pinecone
from langchain.vectorstores import Pinecone

# Initialize Pinecone with the API key and environment name read from environment variables
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENV'),
)

In [None]:
# Collect the truncated review texts in a plain Python list

texts = list(df['truncated'])

In [None]:
# Embed the texts and store the vectors in the 'cxanalytics' Pinecone index via Langchain

vstore = Pinecone.from_texts(
    texts=texts,
    embedding=embeddings,
    index_name='cxanalytics',
)

In [None]:
# Basic vector similarity search: fetch the stored reviews closest to the query text

query = "The quality is good"
result = vstore.similarity_search(query=query)
print(result)

In [None]:
# Import RetrievalQA and ChatOpenAI and define review_chain so that GPT-4 can access the review data

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

chat = ChatOpenAI(model_name="gpt-4", temperature=0.0)

# The chain answers questions using documents fetched through the vector store's retriever
retriever = vstore.as_retriever()
review_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# Define the task for GPT-4 and run the chain

# Prompt spelled out line by line; the explicit "\n" escapes (and the space after
# "bullets.") reproduce the original multi-line text exactly.
q = (
    "\n"
    "The reviews you see are for a product called 'Powerstep Pinnacle Orthotic Shoe Insoles'.\n"
    "What is the overall impression of these reviews? Give most prevalent examples in bullets. \n"
    "What do you suggest we focus on improving?\n"
)

result = review_chain.run(q)
print(result)

In [None]:
# Rename columns in dataframe and create metadata field in order to do upserts with Pinecone's Python client directly

# NOTE(review): reviewerID becomes the vector id. If one reviewer has several
# reviews of this product the ids are not unique, and records sharing an id
# would replace each other downstream - confirm uniqueness.
df = df.rename(columns={'embeddings': 'values', 'reviewerID': 'id'})

# Build the per-row metadata dicts straight from the 'overall' column rather than
# with df.apply(..., axis=1); the Series is aligned on df's index.
df['metadata'] = pd.Series(
    [{'rating': rating} for rating in df['overall'].tolist()],
    index=df.index,
    dtype=object,
)

In [None]:
# Create two record lists from the dataframe: one for the upsert, and one that also
# carries reviewText so the ids returned by the filtered similarity search can be
# mapped back to their review text

data = df.loc[:, ['metadata', 'values', 'id']].to_dict(orient='records')
data_local = df.loc[:, ['metadata', 'values', 'reviewText', 'id']].to_dict(orient='records')

In [None]:
# Create the Pinecone index

filtered_index_name = 'filtered'

# Creating an index that already exists fails, so only create it when it is
# missing; this keeps the cell re-runnable.
if filtered_index_name not in pinecone.list_indexes():
    # NOTE(review): dimension must equal the length of the embedding vectors
    # being upserted - confirm 768 against the embedding model in use.
    pinecone.create_index(name=filtered_index_name, metric='euclidean', dimension=768)

index = pinecone.Index(filtered_index_name)

In [None]:
# Upload the data in batches of 50

from tqdm.auto import tqdm

batch_size = 50
for start in tqdm(range(0, len(data), batch_size)):
    # Slicing past the end of the list simply yields the shorter final batch
    index.upsert(vectors=data[start:start + batch_size])

In [None]:
# Run a filtered similarity search: up to 100 matches for the query text,
# restricted to vectors whose 'rating' metadata equals 4.0

query = embeddings.embed_query("will buy again")
rating_filter = {'rating': {'$eq': 4.0}}
results = index.query(queries=[query], top_k=100, filter=rating_filter)
print(results)

In [None]:
# Lookup table from record id to that record's rating and full review text

get_rating_from_id = {}
for record in data_local:
    get_rating_from_id[record['id']] = {
        'rating': record['metadata']['rating'],
        'review': record['reviewText'],
    }

In [None]:
# Python function that retrieves reviews matching query and specific rate

def review_and_rating(query, rating, top_k=100):
    """Return the reviews most similar to ``query`` among those with a given rating.

    Parameters
    ----------
    query : str
        Free text that is embedded and used for the similarity search.
    rating : float
        Only vectors whose ``rating`` metadata equals this value are searched.
    top_k : int, optional
        Maximum number of matches requested from the index (default 100).

    Returns
    -------
    pandas.DataFrame
        One row per match, in the order returned by the index, with the
        ``rating`` and ``review`` entries stored in ``get_rating_from_id``.

    Raises
    ------
    KeyError
        If a matched id has no entry in ``get_rating_from_id``.
    """
    # Keep the embedded vector under its own name instead of overwriting the text argument
    query_vector = embeddings.embed_query(query)
    results = index.query(
        queries=[query_vector],
        top_k=top_k,
        filter={'rating': {'$eq': rating}},
    )
    # A single query was sent, so its matches are in the first result entry
    matches = results['results'][0]['matches']
    return pd.DataFrame([get_rating_from_id[match['id']] for match in matches])

In [None]:
# Repurchase list and winback list

# Reviews rated 5.0 that are closest to "will purchase again"
repurchase_list = review_and_rating(query='will purchase again', rating=5.0)

# Reviews rated 1.0 that are closest to "disappointed"
winback = review_and_rating(query='disappointed', rating=1.0)
