In [None]:
!pip install python-dotenv
!pip install langchain_openai
!pip install langchain
!pip install langchain_community
!pip install jq
!pip install gdown
!pip install docarray
!pip install datasets
%load_ext cudf.pandas


In [None]:
import pandas as pd
import dask.dataframe as dd
import gdown
import re
import numpy as np
import cudf
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from bertopic import BERTopic
# from sentence_transformers import SentenceTransformer

In [None]:
# Mount Google Drive at /content/drive (Colab only: relies on the `google.colab` package).
# NOTE(review): no other visible cell reads from the mounted drive -- confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Google Drive file id of the reviews dataset, and the local filename it is saved under.
amazonhkdatasetfileid = '14GcJAzyN2PFg2JuyzF0pRmxlMmimrz9o'
amazonhkdatasetfilename = 'AmazonHomeKitchenReviews.csv'

# Direct-download URL derived from the file id.
url = f"https://drive.google.com/uc?export=download&id={amazonhkdatasetfileid}"

# Fetch the file with progress output enabled.
gdown.download(url=url, output=amazonhkdatasetfilename, quiet=False)


* Read the dataset CSV into a DataFrame

In [None]:
# Load the downloaded reviews CSV into a DataFrame.
df_data = pd.read_csv(amazonhkdatasetfilename)


In [None]:
# Create a cuDF DataFrame from the loaded frame.
df_cuda_data = cudf.from_pandas(df_data)

In [None]:
# Row count of the frame returned by read_csv.
print(len(df_data))

In [None]:
# Row count of the cuDF copy; expected to match the count printed for df_data.
print(len(df_cuda_data))

In [None]:
# Column names, dtypes and non-null counts of the loaded data.
df_data.info()

In [None]:
# From here on `df_data` refers to the cuDF frame; the object returned by
# read_csv is no longer bound to this name, so re-running earlier cells that
# use `df_data` will see the cuDF frame instead.
df_data = df_cuda_data
len(df_data)

In [None]:
# Shrink the dataset by keeping only rows whose `categories` value is exactly
# this category path (compared as a string).
target_categories = "['Home & Kitchen', 'Bedding', 'Sheets & Pillowcases', 'Sheet & Pillowcase Sets']"
in_target_category = df_data['categories'] == target_categories
df_filtered = df_data[in_target_category]
len(df_filtered)



* Analyse the dataset
* Check the total number of unique products and their review counts




In [None]:
# Give the title/text columns names that say whether they belong to the product or the review.
df_renamed = df_filtered.rename(columns={'title_y' : 'product_title','title_x':'review_title','text':'review_text'})

# Five products with the most reviews. Only the last expression of a cell is
# displayed automatically, so this result has to be printed explicitly.
top_reviewed_products = df_renamed.groupby('product_title').size().sort_values(ascending=False).head(5)
print(top_reviewed_products)
print(df_renamed.columns)

* Pre-processing
* X = review_title,review_text
* y = rating

In [None]:
# Frequency of each distinct `price` value among the filtered reviews.
print(df_renamed.value_counts('price'))




In [None]:
# Build SQuAD-style QA examples (context / question / answer) from every review row.
# cuDF DataFrames do not support row iteration via iterrows(), so iterate over a
# pandas copy when the frame exposes `to_pandas`; a plain pandas frame is used as-is.
reviews_for_iteration = df_renamed.to_pandas() if hasattr(df_renamed, "to_pandas") else df_renamed

qa_pairs = []
for index, row in reviews_for_iteration.iterrows():
    rating_text = str(row['rating'])
    product_title = str(row['product_title'])

    # Context layout: "<review title>. <review text> $<price>. <rating> <product title>"
    context_prefix = str(row['review_title']) + ". " + str(row["review_text"]) + " " + "$" + str(row['price']) + ". "
    context = context_prefix + rating_text + " " + product_title

    # Generate questions about the user rating only if the rating is not null
    if not pd.isna(row['rating']):
        qa_pairs.append({
            'context': context,
            'question': f"What was the average rating for product {product_title}?",
            # The rating starts right after the prefix. Searching with str.find
            # could instead match an earlier occurrence of the same characters
            # (e.g. "5.0" inside the price "$25.0" or inside the review text).
            'answer': {'text': rating_text, 'answer_start': len(context_prefix)}
        })

        # Question about why they gave that rating; the answer is the opening
        # 150 characters of the context.
        qa_pairs.append({
            'context': context,
            'question': f"Why did the customer rate this product {rating_text} stars?",
            'answer': {'text': context[:150], 'answer_start': 0}
        })



In [None]:
# Spot-check a few generated QA examples (list indices 5 through 11).
print(qa_pairs[5:12])

In [None]:
from transformers import pipeline

# Question-answering pipeline built from the pretrained checkpoint, created before any fine-tuning cell runs.
question_answerer = pipeline("question-answering", model='distilbert-base-uncased-distilled-squad')

In [None]:
# NOTE(review): `context` is the variable left over from the last iteration of
# the QA-pair loop, i.e. a single review's context -- confirm that is the
# intended input for this question.
result = question_answerer(question="What is the average rating for dinner mugs", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
# Second query against the same context.
# NOTE(review): `context` is the variable left over from the last iteration of
# the QA-pair loop -- confirm that is the intended input for this question.
result = question_answerer(question="What are customer saying about dinner mug", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")



* Load the QA pairs into a `Dataset`
* Split the dataset into train and test sets




In [None]:
from datasets import Dataset

# Wrap the generated QA examples in a Hugging Face Dataset.
dataset = Dataset.from_list(qa_pairs)
# Hold out 20% for evaluation. The fixed seed keeps the split identical across
# kernel restarts, so training runs are comparable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

print(dataset.keys())
print(dataset['train'][0])


* Tokenize the dataset

In [None]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer that belongs to the checkpoint used for the model in this
# notebook ("distilbert-base-uncased-distilled-squad"). The cased tokenizer has
# a different vocabulary, so its token ids would not match the uncased model's
# embedding table.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased-distilled-squad")

# Function to tokenize and compute positions
def tokenize_and_align(batch):
    """Tokenize question/context pairs and attach answer token positions.

    Args:
        batch: Mapping with parallel lists under "question", "context" and
            "answer". Each answer is a dict with "text" and "answer_start",
            where "answer_start" is a character offset into the context.

    Returns:
        The tokenizer output for the batch, with "start_positions" and
        "end_positions" added and "offset_mapping" removed. An example whose
        answer is empty or not fully covered by the (possibly truncated)
        context tokens is labelled (0, 0).
    """
    tokenized_inputs = tokenizer(
        batch["question"],
        batch["context"],
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,  # To map tokens to character positions
        padding="max_length"  # Ensures all inputs are the same length
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        answer = batch["answer"][i]
        answer_text = answer["text"]

        # Tokens never cover surrounding whitespace, so align on the stripped answer.
        leading_ws = len(answer_text) - len(answer_text.lstrip())
        char_start = answer["answer_start"] + leading_ws
        char_end = char_start + len(answer_text.strip())

        # Offsets of question tokens are relative to the question string and
        # offsets of context tokens are relative to the context string, so the
        # answer may only be matched against context tokens (sequence id 1).
        span = _answer_token_span(offsets, tokenized_inputs.sequence_ids(i), char_start, char_end)

        # Default to 0 if the answer is not found (e.g., truncated context)
        start_token_idx, end_token_idx = span if span is not None else (0, 0)

        start_positions.append(start_token_idx)
        end_positions.append(end_token_idx)

    # Add the positions to the tokenized inputs
    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    tokenized_inputs.pop("offset_mapping")  # Offsets are only needed for the alignment above

    return tokenized_inputs


def _answer_token_span(offsets, sequence_ids, char_start, char_end):
    """Return (first_token, last_token) covering context chars [char_start, char_end), or None."""
    context_token_idxs = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_token_idxs or char_end <= char_start:
        return None

    first_ctx, last_ctx = context_token_idxs[0], context_token_idxs[-1]
    # The answer must lie entirely inside the context tokens that survived truncation.
    if offsets[first_ctx][0] > char_start or offsets[last_ctx][1] < char_end:
        return None

    # First context token that ends after the answer begins ...
    start_token = next(idx for idx in context_token_idxs if offsets[idx][1] > char_start)
    # ... and last context token that begins before the answer ends.
    end_token = next(idx for idx in reversed(context_token_idxs) if offsets[idx][0] < char_end)
    return start_token, end_token


In [None]:
# Apply the tokenization function to every split in batches, dropping the raw
# text columns so only the tokenizer outputs and answer positions remain.
tokenized_dataset = dataset.map(tokenize_and_align, batched=True, remove_columns=['context', 'question', 'answer'])


In [None]:
# Inspect the splits and the columns produced by tokenization.
print(tokenized_dataset)
print(tokenized_dataset["train"].column_names)

* Save the tokenized dataset and load the pretrained model

In [None]:
# Save the tokenized dataset to disk in Arrow format (directory is relative to
# the current working directory).
tokenized_dataset.save_to_disk("saved_tokenized_dataset")


In [None]:
# Checkpoint name passed to from_pretrained when the model is loaded for fine-tuning.
model_checkpoint = "distilbert-base-uncased-distilled-squad"

In [None]:
from transformers import DistilBertForQuestionAnswering

# Load the question-answering model from `model_checkpoint`; this is the model handed to the Trainer.
model = DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)



*   Define Training Arguments




In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./amazonhkqa_model",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` name. It is
    # available in every transformers release that accepts the
    # `processing_class` argument this notebook passes to Trainer.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False
)

* Define the trainer and train the model with our dataset

In [None]:
from transformers import Trainer
# Wire the model, training arguments and tokenized splits together; the "test"
# split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer
)



In [None]:
# Check CUDA availability before training.
# NOTE(review): this goes through `cudf`; the Trainer does not use cudf, so
# confirm this is the check that reflects the training backend.
cudf.cuda.is_available()

In [None]:
# Run fine-tuning with the configuration in `training_args`.
trainer.train()


In [None]:
# Save the fine-tuned model to an absolute path.
# NOTE(review): TrainingArguments uses the relative output_dir "./amazonhkqa_model";
# whether both point at the same directory depends on the working directory.
trainer.save_model("/content/amazonhkqa_model")