In [9]:
# !pip install python-dotenv
# !pip install langchain_openai
# !pip install langchain
# !pip install langchain_community
# !pip install jq
# !pip install gdown
# !pip install docarray
!pip install datasets
# %load_ext cudf.pandas

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-3.5.0 multiprocess-0.70.16 xxhash-3.5.0


In [3]:
import pandas as pd
import dask.dataframe as dd
import gdown
import re
import numpy as np


In [4]:
amazonhkdatasetfileid = '14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o'
amazonhkdatasetfilename = 'AmazonHomeKitchenReviews.csv'

url = f"https://drive.google.com/uc?export=download&id={amazonhkdatasetfileid}"

gdown.download(url,amazonhkdatasetfilename, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o
From (redirected): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o&confirm=t&uuid=53bfbac2-7dec-4f96-88f7-9381a2e5b0dd
To: c:\Sowmya\Code\CapstoneProject-AI-Class\AmazonHomeKitchenReviews.csv
100%|██████████| 692M/692M [02:48<00:00, 4.11MB/s] 


'AmazonHomeKitchenReviews.csv'

In [5]:
df_data = pd.read_csv(amazonhkdatasetfilename)
#df_data = df_data.head(10000)
df_filtered = df_data[df_data['categories'] == "['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Sheet & Pillowcase Sets']"]
len(df_filtered)

91691

In [6]:
df_renamed = df_filtered.rename(columns={'title_y' : 'product_title','title_x':'review_title','text':'review_text'})
df_renamed.groupby('product_title').size().sort_values(ascending=False).head(5)
print(df_renamed.columns)

Index(['Unnamed: 0', 'rating', 'review_title', 'review_text', 'images', 'asin',
       'parent_asin', 'user_id', 'timestamp', 'helpful_vote',
       'verified_purchase', 'product_title', 'description', 'price', 'Brand',
       'Material', 'Color', 'categories'],
      dtype='object')


In [7]:
qa_pairs = []
for index , row in df_renamed.iterrows():
  context = str(row['review_title']) + ". " + str(row["review_text"]) + " " + "$" + str(row['price']) + ". " + str(row['rating']) +" " +  str(row['product_title'])
  #print(context

    # Generate questions about user rating only if rating is not null
  if not pd.isna(row['rating']):
        qa_pairs.append({
            'context': context,
            'question': f"What was the average rating for product{str(row['product_title'])}?",
          #  'answer' : str(row['rating'])
            'answer': {'text': str(row['rating']), 'answer_start': context.find(str(row['rating'])) if str(row['rating']) in context else 0}

        })

        # Question about why they gave that rating
        qa_pairs.append({
            'context': context,
            'question': f"Why did the customer rate this product {str(row['rating'])} stars?",
             'answer': {'text': context[:150], 'answer_start': 0}
        })



In [10]:
from datasets import Dataset

dataset = Dataset.from_list(qa_pairs)
dataset = dataset.train_test_split(test_size=0.2)

print(dataset.keys())
print(dataset['test'][0])

dict_keys(['train', 'test'])
{'context': "Comfort. The sheet set was just what I wanted! They're great! Thanks $38.99. 5 Full Size Sheet Set - 6 Piece Set - Hotel Luxury Bed Sheets - Extra Soft - Deep Pockets - Easy Fit - Breathable & Cooling Sheets - Wrinkle Free - Comfy - Beige Tan Bed Sheets - Fulls Sheets - 6 PC", 'question': 'What was the average rating for productFull Size Sheet Set - 6 Piece Set - Hotel Luxury Bed Sheets - Extra Soft - Deep Pockets - Easy Fit - Breathable & Cooling Sheets - Wrinkle Free - Comfy - Beige Tan Bed Sheets - Fulls Sheets - 6 PC?', 'answer': {'answer_start': 77, 'text': '5'}}


In [11]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# Load your fine-tuned model and tokenizer
model = AutoModelForQuestionAnswering.from_pretrained("Resources/finetunedmodel")
tokenizer = AutoTokenizer.from_pretrained("Resources/finetunedmodel")
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cpu


In [24]:
evaluatetest = dataset['test'].select(range(100))
print(evaluatetest)

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 100
})


In [25]:
from sklearn.metrics import f1_score
# Function to calculate F1 score based on word overlap
def compute_f1_score(predicted_answer, true_answer):
    # Tokenize the answers by splitting them into words
    pred_tokens = set(predicted_answer.split())
    true_tokens = set(true_answer.split())
    
    # Calculate precision, recall, and F1 score using word overlap
    intersection = len(pred_tokens.intersection(true_tokens))
    if intersection == 0:
        return 0
    precision = intersection / len(pred_tokens)
    recall = intersection / len(true_tokens)
    
    # Calculate F1 as the harmonic mean of precision and recall
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1



f1_scores = []
em_scores = []

# Evaluate on the test dataset
for example in evaluatetest:
    question = example['question']
    context = example['context']
    true_answer = example['answer']

    # Get the predicted answer
    result = qa_pipeline(question=question, context=context)
    predicted_answer = result['answer']

    # Debugging: Print the structure of predicted_answer and true_answer
    print(f"Predicted Answer (raw): {result}")
    print(f"True Answer (raw): {true_answer}")
    
    # Ensure both answers are strings
    if isinstance(predicted_answer, dict):
        predicted_answer = predicted_answer.get('answer', '')  # Extract the answer from the dictionary
    if isinstance(true_answer, dict):
        true_answer = true_answer.get('text', '')  # If true_answer is also a dictionary, extract the answer
    

    # Calculate F1 and EM
    f1 = compute_f1_score(predicted_answer,true_answer )
    f1_scores.append(f1)
        
    em = 1 if predicted_answer.strip().lower() == true_answer.strip().lower() else 0
    em_scores.append(em)

    # Print or calculate metrics (e.g., F1, Exact Match)
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"F1 Score: {f1}, EM Score: {em}")
    print("----")


# Calculate average F1 and EM scores
avg_f1 = sum(f1_scores) / len(f1_scores)
avg_em = sum(em_scores) / len(em_scores)
print(f"Average F1 Score: {avg_f1}")
print(f"Average Exact Match Score: {avg_em}")

Predicted Answer (raw): {'score': 0.9991750121116638, 'start': 77, 'end': 78, 'answer': '5'}
True Answer (raw): {'answer_start': 77, 'text': '5'}
Question: What was the average rating for productFull Size Sheet Set - 6 Piece Set - Hotel Luxury Bed Sheets - Extra Soft - Deep Pockets - Easy Fit - Breathable & Cooling Sheets - Wrinkle Free - Comfy - Beige Tan Bed Sheets - Fulls Sheets - 6 PC?
True Answer: 5
Predicted Answer: 5
F1 Score: 1.0, EM Score: 1
----
Predicted Answer (raw): {'score': 0.0003041128220502287, 'start': 0, 'end': 4, 'answer': 'Five'}
True Answer (raw): {'answer_start': 0, 'text': 'Five Stars. Sheets very soft, deep pockets will help keep fitted on! $24.99. 5 1500 Supreme Collection Twin XL Sheet Sets Lilac - 3 Piece Bed Sheets a'}
Question: Why did the customer rate this product 5 stars?
True Answer: Five Stars. Sheets very soft, deep pockets will help keep fitted on! $24.99. 5 1500 Supreme Collection Twin XL Sheet Sets Lilac - 3 Piece Bed Sheets a
Predicted Answer: Fi