# Task 1: Feature Engineering with Modin on Ray
In this task, we'll perform some feature engineering for the Amazon Reviews dataset. We'll use Modin so that you can conveniently write pandas code and have your workload scale to a cluster.

In [1]:
# Ray is (re)started before Modin is imported: any runtime left over from an
# earlier execution of this cell is shut down first, then a fresh instance is started.
import ray
ray.shutdown()
ray.init()
import os 
import json
# Modin's pandas-compatible API is bound to the usual `pd` alias for the rest of the notebook.
import modin.pandas as pd
import numpy as np
from ast import literal_eval
from pathlib import Path

def compare_dicts(d1, d2, err=1e-5):
    """Return True when two result dictionaries agree within a tolerance.

    Both dictionaries must have exactly the same keys. For every key:

    * if both values are lists, they must have the same length and the sum of
      the element-wise absolute differences must not exceed ``err``;
    * otherwise both values are converted to ``float`` and their absolute
      difference must not exceed ``err``.

    Two NaN values are treated as equal; a NaN paired with a number is a
    mismatch. A list paired with a non-list is a mismatch.

    Args:
        d1: First mapping of metric name to a number or a list of numbers.
        d2: Second mapping, same layout as ``d1``.
        err: Maximum allowed absolute difference (summed for lists).

    Returns:
        bool: True if the dictionaries match, False otherwise.
    """
    import math  # local import keeps this helper usable on its own

    def _abs_diff(a, b):
        """Return |a - b|, 0.0 for identical/both-NaN values, None if only one is NaN."""
        a, b = float(a), float(b)
        if a == b:
            # Also covers equal infinities, where a - b would be NaN.
            return 0.0
        a_nan, b_nan = math.isnan(a), math.isnan(b)
        if a_nan or b_nan:
            return 0.0 if (a_nan and b_nan) else None
        return abs(a - b)

    if set(d1.keys()) != set(d2.keys()):
        return False

    for k in d1:
        v1, v2 = d1[k], d2[k]
        v1_is_list = isinstance(v1, list)
        v2_is_list = isinstance(v2, list)

        if v1_is_list != v2_is_list:
            # One side is a list and the other a scalar: cannot be equal.
            return False

        if v1_is_list:
            # zip() would silently drop the tail of the longer list.
            if len(v1) != len(v2):
                return False
            total = 0.0
            for x, y in zip(v1, v2):
                diff = _abs_diff(x, y)
                if diff is None:
                    return False
                total += diff
            if total > err:
                return False
        else:
            diff = _abs_diff(v1, v2)
            if diff is None or diff > err:
                return False

    return True

2025-11-16 23:21:39,152	INFO worker.py:2003 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
[36m(_remote_exec_single_chain pid=3935)[0m . fn=<function Map.register.<locals>.caller.<locals>.<lambda> at 0x7fd581afa2a0>, obj=                                                 related
[36m(_remote_exec_single_chain pid=3935)[0m 0      {'also_bought': ['9729375011', 'B004FN1AE8', '...
[36m(_remote_exec_single_chain pid=3935)[0m 1      {'also_bought': ['B000V5KPZ4', 'B001F8TLLU', '...
[36m(_remote_exec_single_chain pid=3935)[0m 2      {'also_bought': ['B0042D69W4', 'B00428LIZM', '...
[36m(_remote_exec_single_chain pid=3935)[0m 3      {'also_bought': ['B00000J3LC', 'B0043G4JOA', '...
[36m(_remote_exec_single_chain pid=3935)[0m 4      {'also_bought': ['B0013FCBJO', 'B0019QCGVK', '...
[36m(_remote_exec_single_chain pid=3935)[0m ...                                                  ...
[36m(_remote_exec_single_chain pid=3935)[0m 76330 

In [2]:
# Base directory that is joined in front of the dataset path when the data is loaded.
ROOT = ""
# Replace a leading "~" in ROOT (if any) with the current user's home directory.
ROOT = os.path.expanduser(ROOT)

In [3]:
# Read the product metadata CSV; ROOT is joined in front of the dataset location.
metadata_path = os.path.join(ROOT, "~/public/pa3/metadata_header.csv")
product_data = pd.read_csv(metadata_path)

# The nested columns are parsed with literal_eval cell by cell; null cells become None.
for column in ("salesRankDict", "categories", "related"):
    parsed = product_data[column].apply(
        lambda cell: None if pd.isna(cell) else literal_eval(cell)
    )
    product_data[column] = parsed

# Task 1.1: flatten `categories` and `salesRankDict`

* From `categories` (a list of lists), take the first element of the first list (i.e., `categories[0][0]`) and place it in a new column `category`. If missing/empty, set `null`.
* `salesRankDict` contains at most one pair of (key, value). From `salesRankDict`, extract the key and the value into new columns `salesCategory` and `salesRank`. If missing/empty, set `null`.

In [4]:
def part1(product_data):
    """Flatten ``categories`` and ``salesRankDict`` and summarise the new columns.

    Three columns are derived on a copy of ``product_data``:

    * ``category``      - ``categories[0][0]``, or None when missing/empty;
    * ``salesCategory`` - the first key of ``salesRankDict``, or None;
    * ``salesRank``     - the first value of ``salesRankDict``, or None.

    Args:
        product_data: DataFrame with ``categories`` and ``salesRankDict`` columns.

    Returns:
        dict: ``count_total``, ``mean_salesRank``, ``variance_salesRank``,
        ``numNulls_category``, ``countDistinct_category``,
        ``numNulls_salesCategory`` and ``countDistinct_salesCategory``.
    """
    # Input column names
    categories_column = 'categories'
    salesRankDict_column = 'salesRankDict'

    # Output column names
    category_column = 'category'
    salesCategory_column = 'salesCategory'
    salesRank_column = 'salesRank'

    def preprocess_cat(nested):
        # Guard clauses: nothing to extract from a missing or empty nesting.
        if nested is None or len(nested) == 0 or len(nested[0]) == 0:
            return None
        return nested[0][0]

    def get_sales_cat(rank_dict):
        # The category name is the first key of the dictionary.
        if rank_dict is None or len(rank_dict) == 0:
            return None
        return next(iter(rank_dict.keys()))

    def get_sales_rank(rank_dict):
        # The rank number is the first value of the dictionary.
        if rank_dict is None or len(rank_dict) == 0:
            return None
        return next(iter(rank_dict.values()))

    # The caller's frame must not gain the derived columns.
    frame = product_data.copy()

    frame[category_column] = frame[categories_column].apply(preprocess_cat)
    rank_dicts = frame[salesRankDict_column]
    frame[salesCategory_column] = rank_dicts.apply(get_sales_cat)
    frame[salesRank_column] = frame[salesRankDict_column].apply(get_sales_rank)

    ranks = frame[salesRank_column]
    categories = frame[category_column]
    sales_categories = frame[salesCategory_column]

    return {
        'count_total': int(len(frame)),
        'mean_salesRank': float(ranks.mean()),
        # .var() uses its default (sample variance, ddof=1).
        'variance_salesRank': float(ranks.var()),
        'numNulls_category': int(categories.isna().sum()),
        'countDistinct_category': int(categories.nunique()),
        'numNulls_salesCategory': int(sales_categories.isna().sum()),
        'countDistinct_salesCategory': int(sales_categories.nunique()),
    }


In [5]:
# Compute the Task 1.1 statistics; `res` is checked against the expected values in the next cell.
res = part1(product_data)

In [6]:
import json

# Read the reference statistics for Task 1.1 and check `res` against them.
expected_path = Path("~/public/pa3/expected_1_1.json").expanduser()
expected = json.loads(expected_path.read_text())

assert compare_dicts(expected, res), "Task 1.1 result mismatch"
print("✅ Task 1.1 output matches expected.")

✅ Task 1.1 output matches expected.


# Task 1.2: flatten `related`

* For each row, compute the mean price of the products referenced by `related["also_viewed"]` and store it in a new column `meanPriceAlsoViewed`. Ignore ASINs that are not present in `product_data` or whose `price` is `null`. Do not impute missing values (e.g., do not use `fillna(0)` or similar).

In [11]:
def part2(product_data):
    """Summarise the mean price of the products listed under ``related['also_viewed']``.

    For every row, the ASINs in ``related['also_viewed']`` are looked up in a
    price table built from this same frame. ASINs that are absent from the
    frame or whose price is null are ignored. Rows with no ``related`` value,
    no ``also_viewed`` entry, or no priced ASIN get a null mean (nothing is
    imputed).

    The work is done on a copy, so ``product_data`` is returned to the caller
    without the helper columns.

    Args:
        product_data: DataFrame with ``asin``, ``price`` and ``related`` columns.

    Returns:
        dict: ``count_total``, ``mean_meanPriceAlsoViewed``,
        ``variance_meanPriceAlsoViewed`` and ``numNulls_meanPriceAlsoViewed``.
    """
    # Column names
    asin_column = 'asin'
    price_column = 'price'
    related_column = 'related'

    attribute = 'also_viewed'

    # Outputs
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'

    # Copy first: assigning columns below must not alter the caller's frame.
    df = product_data.copy()

    def get_also_viewed(related):
        # Anything that is not a dict (None, NaN, ...) has no also_viewed list.
        if not isinstance(related, dict):
            return None
        return related.get(attribute)

    df[attribute] = df[related_column].apply(get_also_viewed)

    # asin -> price lookup restricted to rows that actually have a price.
    # A plain dict keeps the per-row lookups inside apply cheap.
    priced = df[[asin_column, price_column]].dropna(subset=[price_column])
    price_map = priced.set_index(asin_column)[price_column].to_dict()

    def mean_price_for_asins(asins):
        # Missing also_viewed (None/NaN) -> no mean.
        if not isinstance(asins, (list, tuple)):
            return None
        vals = [price_map[a] for a in asins if a in price_map]
        if not vals:
            # No referenced product has a usable price.
            return None
        return float(np.mean(vals))

    df[meanPriceAlsoViewed_column] = df[attribute].apply(mean_price_for_asins)

    # mean/var skip nulls by default; var is the sample variance (ddof=1).
    mean_prices = df[meanPriceAlsoViewed_column]
    res = {
        'count_total': int(len(df)),
        'mean_meanPriceAlsoViewed': float(mean_prices.mean()),
        'variance_meanPriceAlsoViewed': float(mean_prices.var()),
        'numNulls_meanPriceAlsoViewed': int(mean_prices.isna().sum())
    }
    return res

In [12]:
# Compute the Task 1.2 statistics; `res` is checked against the expected values in the next cell.
res = part2(product_data)



In [13]:
import json

# Read the reference statistics for Task 1.2 and check `res` against them.
expected_path = Path("~/public/pa3/expected_1_2.json").expanduser()
expected = json.loads(expected_path.read_text())

assert compare_dicts(expected, res), "Task 1.2 result mismatch"
print("✅ Task 1.2 output matches expected.")

✅ Task 1.2 output matches expected.


# Task 1.3: Impute `price`

- Impute `price` with the *mean* and write to `meanImputedPrice`.
- Impute `price` with the *median* and write to `medianImputedPrice`.
- For `title`, replace `null` with the string `"unknown"` and write to `unknownImputedTitle`.


In [14]:
from sklearn.impute import SimpleImputer

def part3(product_data):
    """Impute ``price`` and ``title`` and summarise the imputed columns.

    On a copy of ``product_data``:

    * ``meanImputedPrice``    - ``price`` with nulls replaced by the mean price;
    * ``medianImputedPrice``  - ``price`` with nulls replaced by the median price;
    * ``unknownImputedTitle`` - ``title`` with nulls replaced by ``"unknown"``.

    The imputation is done with the frame's own ``fillna`` so the price column
    is not converted to a local NumPy array first. If no price is observed at
    all, the imputed price columns stay null and the price statistics are NaN.

    Args:
        product_data: DataFrame with ``price`` and ``title`` columns.

    Returns:
        dict: ``count_total``; mean, variance and null count for each imputed
        price column; and ``numUnknowns_unknownImputedTitle``.
    """
    price_column = 'price'
    title_column = 'title'

    # Outputs
    meanImputedPrice_column = 'meanImputedPrice'
    medianImputedPrice_column = 'medianImputedPrice'
    unknownImputedTitle_column = 'unknownImputedTitle'

    # Work on a copy to avoid modifying the caller's frame.
    df = product_data.copy()

    # Float dtype so that None/NaN are uniformly treated as missing.
    prices = df[price_column].astype(float)

    # mean()/median() skip nulls, so the fill values come from observed prices only.
    df[meanImputedPrice_column] = prices.fillna(prices.mean())
    df[medianImputedPrice_column] = prices.fillna(prices.median())

    # For title, replace nulls with "unknown".
    df[unknownImputedTitle_column] = df[title_column].fillna("unknown")

    mean_imputed = df[meanImputedPrice_column]
    median_imputed = df[medianImputedPrice_column]

    # Counts every title equal to "unknown" after imputation: the filled nulls
    # plus any title that already was the literal string "unknown".
    num_unknown_titles = df[unknownImputedTitle_column].eq("unknown").sum()

    # .var() is the sample variance (ddof=1).
    res = {
        'count_total': int(len(df)),
        'mean_meanImputedPrice': float(mean_imputed.mean()),
        'variance_meanImputedPrice': float(mean_imputed.var()),
        'numNulls_meanImputedPrice': int(mean_imputed.isna().sum()),
        'mean_medianImputedPrice': float(median_imputed.mean()),
        'variance_medianImputedPrice': float(median_imputed.var()),
        'numNulls_medianImputedPrice': int(median_imputed.isna().sum()),
        'numUnknowns_unknownImputedTitle': int(num_unknown_titles),
    }
    return res

In [15]:
# Compute the Task 1.3 statistics; `res` is checked against the expected values in the next cell.
res = part3(product_data)

In [16]:
import json

# Read the reference statistics for Task 1.3 and check `res` against them.
expected_path = Path("~/public/pa3/expected_1_3.json").expanduser()
expected = json.loads(expected_path.read_text())

assert compare_dicts(expected, res), "Task 1.3 result mismatch"
print("✅ Task 1.3 output matches expected.")

✅ Task 1.3 output matches expected.


# Is that it?

To make this dataset ready for ML tasks, there are many more feature engineering steps to be done, which we haven't covered here. For example, textual data can be fed into a Word2Vec model to obtain meaningful embeddings. We've stuck with a few simple operations here. Next, in Task 2, you'll use preprocessed train and test datasets to train XGBoost models in a multi-node fashion!

In [17]:
# Shut down the Ray instance that was started in the first cell.
ray.shutdown()