In [1]:
!pip install python-dotenv
!pip install langchain_openai
!pip install langchain
!pip install langchain_community
!pip install jq
!pip install gdown
!pip install docarray
!pip install datasets

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Collecting langchain_openai
  Downloading langchain_openai-0.3.11-py3-none-any.whl.metadata (2.3 kB)
Collecting tiktoken<1,>=0.7 (from langchain_openai)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading langchain_openai-0.3.11-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken, langchain_openai
Successfully installed langchain_openai-0.

In [2]:
import pandas as pd
import dask.dataframe as dd
import gdown
import re
import numpy as np
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from bertopic import BERTopic
# from sentence_transformers import SentenceTransformer

In [3]:
# Mount Google Drive (For Colab Users)
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
amazonhkdatasetfileid = '14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o'
amazonhkdatasetfilename = 'AmazonHomeKitchenReviews.csv'

url = f"https://drive.google.com/uc?export=download&id={amazonhkdatasetfileid}"

gdown.download(url,amazonhkdatasetfilename, quiet=False)


Downloading...
From (original): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o
From (redirected): https://drive.google.com/uc?export=download&id=14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o&confirm=t&uuid=090e47f6-4a5a-4f9e-aff9-5cd422cb425d
To: /content/AmazonHomeKitchenReviews.csv
100%|██████████| 692M/692M [00:09<00:00, 73.3MB/s]


'AmazonHomeKitchenReviews.csv'

* Read the dataset csv  into dataframes

In [5]:
df_data = pd.read_csv(amazonhkdatasetfilename)


In [6]:
print(len(df_data))

754079


In [7]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754079 entries, 0 to 754078
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         754079 non-null  int64  
 1   rating             754079 non-null  int64  
 2   title_x            753916 non-null  object 
 3   text               753888 non-null  object 
 4   images             754079 non-null  object 
 5   asin               754079 non-null  object 
 6   parent_asin        754079 non-null  object 
 7   user_id            754079 non-null  object 
 8   timestamp          754079 non-null  object 
 9   helpful_vote       754079 non-null  int64  
 10  verified_purchase  754079 non-null  bool   
 11  title_y            754079 non-null  object 
 12  description        754079 non-null  object 
 13  price              754079 non-null  float64
 14  Brand              754077 non-null  object 
 15  Material           754079 non-null  object 
 16  Co

In [8]:
#reduce the dataset to 10000 records
# df_data = df_data.head(10000)



*   Analyse the datasets
*  Check total number of unique products and the review counts




In [9]:
df_renamed = df_data.rename(columns={'title_y' : 'product_title','title_x':'review_title','text':'review_text'})
df_renamed.groupby('product_title').size().sort_values(ascending=False).head(5)
print(df_renamed.columns)

Index(['Unnamed: 0', 'rating', 'review_title', 'review_text', 'images', 'asin',
       'parent_asin', 'user_id', 'timestamp', 'helpful_vote',
       'verified_purchase', 'product_title', 'description', 'price', 'Brand',
       'Material', 'Color', 'categories'],
      dtype='object')


* Pre-processing
* X = review_title,review_text
* y = rating

In [10]:
print(df_renamed.value_counts('price'))




price
19.99      25494
14.99      23353
29.99      21455
24.99      21141
34.97      20431
           ...  
651.68         1
2665.48        1
1015.16        1
799.00         1
795.00         1
Name: count, Length: 3013, dtype: int64


In [11]:
qa_pairs = []
for index , row in df_renamed.iterrows():
  context = str(row['review_title']) + str(row["review_text"])

    # Generate questions about user rating only if rating is not null
  if not pd.isna(row['rating']):
        qa_pairs.append({
            'context': context,
            'question': f"What was the average rating for product{str(row['product_title'])}?",
          #  'answer' : str(row['rating'])
            'answer': {'text': str(row['rating']), 'answer_start': context.find(str(row['rating'])) if str(row['rating']) in context else 0}

        })

        # Question about why they gave that rating
        qa_pairs.append({
            'context': context,
            'question': f"Why did the customer rate this product {str(row['rating'])} stars?",
             'answer': {'text': context[:150], 'answer_start': 0}
        })



In [12]:
print(qa_pairs[5:12])

[{'context': 'Pretty colors availableNice thin placemats of good size. Can be used also as table doilies.', 'question': 'Why did the customer rate this product 5 stars?', 'answer': {'text': 'Pretty colors availableNice thin placemats of good size. Can be used also as table doilies.', 'answer_start': 0}}, {'context': 'Nice materialVery pretty, wish they came bigger', 'question': 'What was the average rating for productPaperLanternStore.com 7 Inch Bloom Shaped Handmade Cotton Crochet Doilies - Beige (2 PACK)?', 'answer': {'text': '4', 'answer_start': 0}}, {'context': 'Nice materialVery pretty, wish they came bigger', 'question': 'Why did the customer rate this product 4 stars?', 'answer': {'text': 'Nice materialVery pretty, wish they came bigger', 'answer_start': 0}}, {'context': 'Love the zipper!The red is a deeper red rather than a bright red. Not as shiny as the polyester satin ones. Love the zipper enclosure since it’s a slippery material, the pillows won’t come out.', 'question': 'W

In [13]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


In [14]:
result = question_answerer(question="What is the average rating for dinner mugs", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'these will probably be fine', score: 0.091, start: 127, end: 154


In [15]:
result = question_answerer(question="What is the average rating for ", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'these will probably be fine', score: 0.1157, start: 127, end: 154




* Load the QA dataset
* Split the dataset for train test




In [16]:
from datasets import Dataset

dataset = Dataset.from_list(qa_pairs)
dataset = dataset.train_test_split(test_size=0.2)

print(dataset.keys())
print(dataset['train'][0])


dict_keys(['train', 'test'])
{'context': 'Material shedsThe sheets were very comfortable.  Needed to wash them a couple of times to remove the lint from the soft polyester.  They fit the king size bed perfectly', 'question': 'Why did the customer rate this product 4 stars?', 'answer': {'answer_start': 0, 'text': 'Material shedsThe sheets were very comfortable.  Needed to wash them a couple of times to remove the lint from the soft polyester.  They fit the king '}}


* Tokenize the dataset

In [17]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased-distilled-squad")

# Function to tokenize and compute positions
def tokenize_and_align(batch):
    tokenized_inputs = tokenizer(
        batch["question"],
        batch["context"],
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,  # To map tokens to character positions
        padding="max_length"  # Optional: Ensures all inputs are the same length
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        answer = batch["answer"][i]
        answer_start = answer["answer_start"]
        answer_text = answer["text"]

        # Find the start and end token indices
        start_token_idx, end_token_idx = None, None
        for idx, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                start_token_idx = idx
            if start < answer_start + len(answer_text) <= end:
                end_token_idx = idx
                break

        # Default to 0 if answer is not found (e.g., truncated context)
        if start_token_idx is None or end_token_idx is None:
            start_token_idx, end_token_idx = 0, 0

        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)

    # Add the positions to the tokenized inputs
    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    tokenized_inputs.pop("offset_mapping")  # Remove offsets if not needed

    return tokenized_inputs


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

In [None]:
# Apply the tokenization function
tokenized_dataset = dataset.map(tokenize_and_align, batched=True)


Map:   0%|          | 0/1206526 [00:00<?, ? examples/s]

Map:   0%|          | 0/301632 [00:00<?, ? examples/s]

In [None]:
print(tokenized_dataset)
print(tokenized_dataset["train"].column_names)

* Load the pretrained model

In [None]:
# Save the tokenized dataset to disk in Arrow format
tokenized_dataset.save_to_disk("path_to_save_tokenized_dataset")


In [None]:
#  model_checkpoint = "distilbert-base-uncased-distilled-squad"

In [None]:
from transformers import DistilBertForQuestionAnswering

# model = DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)



*   Define Training Arguments




In [None]:
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./amazonhkqa_model",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     push_to_hub=False

# )

* Define the trainer and train the model with our dataset

In [None]:
# from transformers import Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer
# )



In [None]:
# trainer.train()