# 3. Feature Engineering

## 3.1 Load Data
Loads the primary review dataset (filtered_reviews.csv) from Google Drive into a DataFrame for feature engineering.

In [None]:
from google.colab import drive
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Mount Google Drive so the project folder is reachable under /content/drive.
# force_remount=True requests a fresh mount even when the cell is re-run.
drive.mount('/content/drive', force_remount=True)

# Project paths. data_dir is derived from project_dir so the two locations
# cannot drift apart if the project folder is ever moved or renamed.
project_dir = '/content/drive/MyDrive/bt4222_group_6/bt4222_group_6_amazon'
data_dir = os.path.join(project_dir, 'data')

# Primary review dataset: the input to every feature-engineering step below.
df_reviews = pd.read_csv(os.path.join(data_dir, 'filtered_reviews.csv'))

Mounted at /content/drive


## 3.2 Temporal Feature Engineering
Adds several temporal features to the dataset:

- **temporal_purchase_sequence**: list of previously purchased products per user

- **previous_purchase** and **subsequent_purchase**: previous and next products purchased by the user

- **time_since_last_purchase**: days since the user's last purchase

- **purchases_last_4_years**: total number of purchases made by a user (approximated by the number of reviews written by that user in the dataset)

- **monthly_purchase_frequency**: average number of purchases per month over the 4-year window (purchases_last_4_years divided by 48 months)

In [None]:
def engineer_features(df, window_months=48):
    """Add per-customer temporal purchase features to a review DataFrame.

    Every row (review) is treated as one purchase. The result is ordered by
    ``customer_id`` and then ``review_date`` and gains these columns:

    - ``temporal_purchase_sequence``: list of the customer's earlier
      ``product_id`` values in date order (empty list for the first row).
    - ``previous_purchase`` / ``subsequent_purchase``: the customer's
      previous / next ``product_id``, or ``''`` when there is none.
    - ``time_since_last_purchase``: whole days since the customer's previous
      row, ``0`` for the customer's first row.
    - ``purchases_last_4_years``: number of rows with a non-null
      ``product_id`` for the customer in ``df``. No date filtering is done
      here; the name assumes ``df`` already covers a four-year window.
    - ``monthly_purchase_frequency``: that count divided by ``window_months``.

    Args:
        df: DataFrame with ``customer_id``, ``product_id`` and ``review_date``
            columns. It is not modified.
        window_months: Number of months the data spans, used as the divisor
            for ``monthly_purchase_frequency``. Defaults to 48 (four years).

    Returns:
        A new DataFrame, sorted by customer and date (original index labels
        are kept), with the feature columns appended.

    Raises:
        ValueError: If ``window_months`` is not positive.
    """
    if window_months <= 0:
        raise ValueError("window_months must be positive")

    # assign() returns a new frame, so the datetime conversion never leaks
    # into the caller's DataFrame.
    df = df.assign(review_date=pd.to_datetime(df['review_date']))
    df = df.sort_values(by=['customer_id', 'review_date'])

    def get_temporal_seq(series):
        # Row k receives the k products that precede it within the group.
        items = list(series)
        prefixes = [items[:k] for k in range(len(items))]
        return pd.Series(prefixes, index=series.index)

    by_customer = df.groupby('customer_id')
    products = by_customer['product_id']

    df['temporal_purchase_sequence'] = products.transform(get_temporal_seq)

    # First / last purchase of a customer has no neighbour: use '' not NaN.
    df['previous_purchase'] = products.shift(1).fillna('')
    df['subsequent_purchase'] = products.shift(-1).fillna('')

    # diff() is NaT on each customer's first row; report that as 0 days.
    days_between = by_customer['review_date'].diff().dt.days
    df['time_since_last_purchase'] = days_between.fillna(0).astype(int)

    df['purchases_last_4_years'] = products.transform('count')
    df['monthly_purchase_frequency'] = df['purchases_last_4_years'] / float(window_months)

    return df


# Run the temporal feature pipeline on the loaded reviews, then list the
# resulting column names as a quick sanity check that the features were added.
df_reviews = engineer_features(df_reviews)
print(list(df_reviews.columns))

['customer_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date', 'temporal_purchase_sequence', 'previous_purchase', 'subsequent_purchase', 'time_since_last_purchase', 'purchases_last_4_years', 'monthly_purchase_frequency']


## 3.3 Product Feature Engineering

Adds product-specific and user-specific features:

- **parent_product_average_rating**: average rating grouped by product_parent

- **product_id_average_rating**: average rating grouped by product_id

- **sum_helpfulvotes** and **sum_totalvotes**: total helpful votes and total votes received across all of a customer's reviews

In [None]:
# Per-group aggregates. All four are computed up front: each depends only on
# the original review columns, which the left merges below leave untouched.

# Mean star rating per parent product and per individual product
parent_avg_rating = (
    df_reviews.groupby('product_parent')['star_rating']
    .agg('mean')
    .rename('parent_product_average_rating')
)
product_id_avg_rating = (
    df_reviews.groupby('product_id')['star_rating']
    .agg('mean')
    .rename('product_id_average_rating')
)

# Vote totals per customer (helpful votes and all votes)
user_helpful_votes = (
    df_reviews.groupby('customer_id')['helpful_votes']
    .agg('sum')
    .rename('sum_helpfulvotes')
)
user_total_votes = (
    df_reviews.groupby('customer_id')['total_votes']
    .agg('sum')
    .rename('sum_totalvotes')
)

# Attach each aggregate to every matching review row; left joins keep all rows
df_reviews = (
    df_reviews
    .merge(parent_avg_rating, on='product_parent', how='left')
    .merge(product_id_avg_rating, on='product_id', how='left')
    .merge(user_helpful_votes, on='customer_id', how='left')
    .merge(user_total_votes, on='customer_id', how='left')
)

# Preview a few of the new columns next to their grouping keys
preview_columns = [
    'product_parent',
    'parent_product_average_rating',
    'customer_id',
    'sum_helpfulvotes',
    'sum_totalvotes',
]
print(df_reviews[preview_columns].head())

   product_parent  parent_product_average_rating  customer_id  \
0       219600481                       4.500000        11960   
1       682436048                       4.384615        11960   
2        32170248                       4.500000        11960   
3       614364353                       4.800000        11960   
4       928204157                       4.000000        11960   

   sum_helpfulvotes  sum_totalvotes  
0                 2               3  
1                 2               3  
2                 2               3  
3                 2               3  
4                 2               3  


## 3.4 Sentiment Analysis
We use a fine-tuned DistilBERT model for sentiment analysis on each review's headline and body. The model was obtained from the Hugging Face link below and classifies each review as either "positive" or "negative". The dataset is processed in 3 batches, which are saved as:

1. filtered_reviews_with_features_batch_1.csv

2. filtered_reviews_with_features_batch_2.csv

3. filtered_reviews_with_features_batch_3.csv

https://huggingface.co/sohan-ai/sentiment-analysis-model-amazon-reviews

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from tqdm import tqdm


# Fine-tuned sentiment checkpoint hosted on the Hugging Face Hub
model_name = "sohan-ai/sentiment-analysis-model-amazon-reviews"

# The tokenizer comes from the base DistilBERT checkpoint, while the
# classification weights come from the fine-tuned checkpoint named above.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# .eval() returns the module itself, so loading the weights and switching to
# evaluation mode can be done in one expression.
model = DistilBertForSequenceClassification.from_pretrained(model_name).eval()

# Define the sentiment prediction function
def get_sentiment(text):
    """Classify ``text`` as "positive" or "negative" with the loaded model.

    The text is tokenized with truncation enabled and passed through the
    module-level ``model`` without gradient tracking. The position of the
    largest logit decides the label: position 1 is reported as "positive",
    anything else as "negative".
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        predicted_class = model(**encoded).logits.argmax().item()
    if predicted_class == 1:
        return "positive"
    return "negative"

# Combine headline and body into one text per review (missing parts become '')
df_reviews['full_review'] = df_reviews['review_headline'].fillna('') + '. ' + df_reviews['review_body'].fillna('')

# Register progress_apply on pandas objects so each batch shows a progress bar
tqdm.pandas()

# Number of output files; the export step expects batch_1 .. batch_3
n_batches = 3

# Split row *positions* rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. Chunk sizes are the same as splitting the frame directly.
row_chunks = np.array_split(np.arange(len(df_reviews)), n_batches)

for batch_no, rows in enumerate(row_chunks, start=1):
    # Take an explicit copy so adding the 'sentiments' column never writes
    # into a view of df_reviews (avoids SettingWithCopyWarning).
    batch = df_reviews.iloc[rows].copy()
    batch['sentiments'] = batch['full_review'].progress_apply(get_sentiment)
    # Persist each batch as soon as it finishes so completed work is on disk
    batch.to_csv(os.path.join(data_dir, f"filtered_reviews_with_features_batch_{batch_no}.csv"), index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

  return bound(*args, **kwds)
  0%|          | 2/112343 [00:00<2:50:09, 11.00it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 112343/112343 [2:59:53<00:00, 10.41it/s]


### Export Data to CSV

Combines the 3 batch CSV files into a single file filtered_reviews_with_features.csv. Also checks for duplicate rows across batches before saving.

In [None]:
# Per-batch sentiment outputs, in the order they were produced
batch_files = (
    "filtered_reviews_with_features_batch_1.csv",
    "filtered_reviews_with_features_batch_2.csv",
    "filtered_reviews_with_features_batch_3.csv",
)
batch1, batch2, batch3 = (
    pd.read_csv(os.path.join(data_dir, file_name)) for file_name in batch_files
)

# Stack the batches vertically under a fresh 0..n-1 index
combined = pd.concat([batch1, batch2, batch3], ignore_index=True)

# Flag every row that has an exact duplicate (keep=False marks all copies,
# not just the repeats)
is_duplicated = combined.duplicated(keep=False)
duplicates = combined[is_duplicated]

# Report the outcome of the duplicate check
if duplicates.empty:
    print("No duplicate rows found across the batches.")
else:
    print(f"Found {len(duplicates)} duplicate rows across batches.")
    display(duplicates.head())

# Write the merged dataset next to the batch files
combined.to_csv(os.path.join(data_dir, "filtered_reviews_with_features.csv"), index=False)

No duplicate rows found across the batches.


### Inspecting filtered_reviews_with_features dataset

Loads the final merged dataset (filtered_reviews_with_features.csv) for inspection. Displays the shape, column names, and distribution of sentiment labels.

In [None]:
# Reload the merged file from disk so the checks reflect what was written
filtered_reviews_with_features = pd.read_csv(os.path.join(data_dir, "filtered_reviews_with_features.csv"))

# Printed in order: first rows, (rows, columns) shape, column names, and the
# count of each sentiment label
inspection_views = (
    filtered_reviews_with_features.head(),
    filtered_reviews_with_features.shape,
    filtered_reviews_with_features.columns.tolist(),
    filtered_reviews_with_features['sentiments'].value_counts(),
)
for view in inspection_views:
    print(view)

   customer_id  product_id  product_parent  \
0        11960  B00LCJAW06       219600481   
1        11960  B008OTSEXY       682436048   
2        11960  B00KJ15KGY        32170248   
3        11960  B008ZL49WQ       614364353   
4        11960  B002WRGE5O       928204157   

                                       product_title product_category  \
0  Persian-Rugs T1007 Abstract Modern Area Rug Ca...        Furniture   
1  Flash Furniture High Back Black Ribbed Upholst...        Furniture   
2  Jackson Pollock Inspired Coffee Glass Table w/...        Furniture   
3                                  Eaze Lounge Chair        Furniture   
4         Walker Edison L-Shaped Glass Computer Desk        Furniture   

   star_rating  helpful_votes  total_votes vine verified_purchase  ...  \
0            4              1            1    N                 Y  ...   
1            4              0            0    N                 Y  ...   
2            4              1            1    N               