# Task 1: Feature Engineering with Modin on Ray
In this task, we'll perform some feature engineering for the Amazon Reviews dataset. We'll use Modin so that you can conveniently write pandas code and have your workload scale to a cluster.

In [1]:
import ray
# Shut down any Ray runtime left over from an earlier run before starting a new one
# (presumably so this cell can be re-executed in a live kernel -- TODO confirm).
ray.shutdown()
# Start a local Ray instance.
ray.init()
import os 
import json
import modin.pandas as pd
import numpy as np
from ast import literal_eval
from pathlib import Path

def compare_dicts(d1, d2, err=1e-5):
    """Return True when two result dictionaries agree within a tolerance.

    Both dictionaries must have exactly the same keys. For every key:

    * if both values are lists, they must have the same length and the sum of
      the element-wise absolute differences must not exceed ``err``;
    * otherwise both values are converted with ``float()`` and their absolute
      difference must not exceed ``err``.

    Two NaNs (or two equal infinities) in the same position count as equal,
    but a NaN paired with a real number is always a mismatch.

    Args:
        d1: First dictionary (e.g. the expected statistics).
        d2: Second dictionary (e.g. the computed statistics).
        err: Maximum tolerated absolute difference.

    Returns:
        bool: True if the dictionaries match, False otherwise.

    Raises:
        TypeError, ValueError: propagated from ``float()`` when a non-list
            value cannot be converted to a number.
    """

    def _gap(a, b):
        # Identical values (including equal infinities) and paired NaNs have
        # no gap; computing a - b for them would yield NaN instead of 0.
        if a == b or (a != a and b != b):
            return 0.0
        return abs(a - b)

    if set(d1.keys()) != set(d2.keys()):
        return False

    for key in d1:
        left, right = d1[key], d2[key]
        if isinstance(left, list) and isinstance(right, list):
            # zip() stops at the shorter list, so lengths are checked first.
            if len(left) != len(right):
                return False
            total_gap = sum(_gap(x, y) for x, y in zip(left, right))
            # "not <=" rather than ">" so that a NaN gap counts as a mismatch.
            if not total_gap <= err:
                return False
        else:
            if not _gap(float(left), float(right)) <= err:
                return False
    return True

2025-11-16 14:51:43,030	INFO worker.py:2004 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


[36m(_remote_exec_multi_chain pid=10551)[0m . fn=<function Map.register.<locals>.caller.<locals>.<lambda> at 0x11d5a39a0>, obj=       categories
[36m(_remote_exec_multi_chain pid=10551)[0m 56630   [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 56631   [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 56632   [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 56633   [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 56634   [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m ...           ...
[36m(_remote_exec_multi_chain pid=10551)[0m 139235  [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 139236  [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 139237  [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 139238  [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 139239  [[Books]]
[36m(_remote_exec_multi_chain pid=10551)[0m 
[36m(_remote_exec_multi_chain pid=10551)[0m [82610 rows x 1 columns], args=[], kwargs={}
[36m(_remo

In [2]:
# Folder that holds the input data; an empty string means paths are resolved
# against the current working directory. A leading "~" is expanded.
ROOT = os.path.expanduser("")

In [3]:
# Load the product metadata table from ROOT.
metadata_path = os.path.join(ROOT, "metadata_header.csv")
product_data = pd.read_csv(metadata_path)


def _parse_literal(cell):
    """Evaluate a cell as a Python literal; missing cells become None."""
    if pd.isna(cell):
        return None
    return literal_eval(cell)


# make sure columns are of the right dtypes: these three columns are parsed
# from their text form back into Python objects
for column in ["salesRankDict", "categories", "related"]:
    product_data[column] = product_data[column].apply(_parse_literal)

# Task 1.1: flatten `categories` and `salesRankDict`

* From `categories` (a list of lists), take the first element of the first list (i.e., `categories[0][0]`) and place it in a new column `category`. If missing/empty, set `null`.
* `salesRankDict` contains at most one pair of (key, value). From `salesRankDict`, extract the key and the value into new columns `salesCategory` and `salesRank`. If missing/empty, set `null`.

In [None]:
def part1(product_data):
    """Flatten ``categories`` and ``salesRankDict`` and summarise the result.

    Adds three columns to ``product_data`` (the frame is modified in place):

    * ``category``      -- ``categories[0][0]``, or null when it is missing/empty;
    * ``salesCategory`` -- the key of the single ``salesRankDict`` entry, or null;
    * ``salesRank``     -- the value of the single ``salesRankDict`` entry, or null.

    Args:
        product_data: DataFrame with a ``categories`` column (list of lists or
            null) and a ``salesRankDict`` column (dict with at most one entry,
            or null).

    Returns:
        dict: ``count_total``, ``mean_salesRank``, ``variance_salesRank``,
        ``numNulls_category``, ``countDistinct_category``,
        ``numNulls_salesCategory`` and ``countDistinct_salesCategory``.
    """
    # Column names
    salesRankDict_column = 'salesRankDict'
    categories_column = 'categories'

    # Outputs
    category_column = 'category'
    salesCategory_column = 'salesCategory'
    salesRank_column = 'salesRank'

    # Functions for preprocessing
    def preprocess_cat(cats):
        """Return categories[0][0], or None when any level is missing/empty."""
        if not isinstance(cats, (list, tuple)) or len(cats) == 0:
            return None
        first_group = cats[0]
        if not isinstance(first_group, (list, tuple)) or len(first_group) == 0:
            return None
        first = first_group[0]
        # An empty string is treated as "empty" too.
        # NOTE(review): confirm this matches the expected output.
        if first is None or first == '':
            return None
        return first

    def first_sales_entry(salesRank):
        """Return the (key, value) pair of a non-empty dict, else None."""
        if not isinstance(salesRank, dict) or len(salesRank) == 0:
            return None
        return next(iter(salesRank.items()))

    def get_sales_cat(salesRank):
        """Return the sales category (the dict key), or None."""
        entry = first_sales_entry(salesRank)
        return None if entry is None else entry[0]

    def get_sales_rank(salesRank):
        """Return the sales rank (the dict value), or None."""
        entry = first_sales_entry(salesRank)
        return None if entry is None else entry[1]

    # Apply preprocessing functions
    product_data[category_column] = product_data[categories_column].apply(preprocess_cat)
    product_data[salesCategory_column] = product_data[salesRankDict_column].apply(get_sales_cat)
    product_data[salesRank_column] = product_data[salesRankDict_column].apply(get_sales_rank)

    # Calculate statistics
    category = product_data[category_column]
    sales_category = product_data[salesCategory_column]
    # Cast to float so that nulls become NaN and are skipped by mean()/var().
    sales_rank = product_data[salesRank_column].astype('float64')

    count_total = len(product_data)
    mean_salesRank = sales_rank.mean()
    # var() defaults to the sample variance (ddof=1).
    # NOTE(review): confirm the expected values use the sample variance.
    variance_salesRank = sales_rank.var()
    numNulls_category = category.isna().sum()
    countDistinct_category = category.nunique()  # nulls are not counted
    numNulls_salesCategory = sales_category.isna().sum()
    countDistinct_salesCategory = sales_category.nunique()

    # Put results in a dictionary
    res = {
        'count_total': int(count_total),
        'mean_salesRank': float(mean_salesRank),
        'variance_salesRank': float(variance_salesRank),
        'numNulls_category': int(numNulls_category),
        'countDistinct_category': int(countDistinct_category),
        'numNulls_salesCategory': int(numNulls_salesCategory),
        'countDistinct_salesCategory': int(countDistinct_salesCategory)
    }

    return res


In [7]:
# Run Task 1.1 on the loaded table; `res` is what the next cell compares with the expected values.
res = part1(product_data)

In [8]:
import json

# Reference statistics for Task 1.1 (path is relative to the working directory).
expected_path = Path("expected_1_1.json").expanduser()
expected = json.loads(expected_path.read_text())

task_matches = compare_dicts(expected, res)
assert task_matches, "Task 1.1 result mismatch"
print("✅ Task 1.1 output matches expected.")

AssertionError: Task 1.1 result mismatch

# Task 1.2: flatten `related`

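* For each row, compute the mean price of the products referenced by `related["also_viewed"]` and store it in a new column `meanPriceAlsoViewed`. Ignore ASINs that are not present in `product_data` or whose `price` is null. Do not impute missing values (e.g., do not use `fillna(0)` or similar); a row with no usable prices stays null.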

In [None]:
def part2(product_data):
    """Compute the mean price of each product's ``also_viewed`` products.

    Adds two columns to ``product_data`` (the frame is modified in place):

    * ``also_viewed``         -- the ASIN list taken from ``related``, or null;
    * ``meanPriceAlsoViewed`` -- mean ``price`` of the referenced products.
      ASINs that are absent from ``product_data`` or have a null price are
      ignored; rows left with no usable price stay null (nothing is imputed).

    The mean follows join semantics: an ASIN listed twice in ``also_viewed``
    is counted twice.

    Args:
        product_data: DataFrame with ``asin``, ``price`` and ``related``
            columns. Assumes the row index is unique (as produced by
            ``read_csv``) -- TODO confirm if the frame is re-indexed upstream.

    Returns:
        dict: ``count_total``, ``mean_meanPriceAlsoViewed``,
        ``variance_meanPriceAlsoViewed`` and ``numNulls_meanPriceAlsoViewed``.
    """
    # Column names
    asin_column = 'asin'
    price_column = 'price'
    related_column = 'related'

    attribute = 'also_viewed'

    # Outputs
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'

    # Temporary column names used only inside the join below
    row_id_column = '_row_id'
    viewed_price_column = '_viewed_price'

    # Get also_viewed (a list of asins) from related_column and make it a new column
    def get_also_viewed(related):
        """Return the non-empty ``also_viewed`` list of ``related``, else None."""
        if not isinstance(related, dict):
            return None
        viewed = related.get(attribute)
        return viewed if viewed else None

    product_data[attribute] = product_data[related_column].apply(get_also_viewed)

    # One row per (source row, referenced ASIN); rows without references drop out.
    references = (
        product_data[[attribute]]
        .explode(attribute)
        .dropna(subset=[attribute])
        .rename_axis(row_id_column)
        .reset_index()
    )

    # Lookup table ASIN -> price, restricted to products with a known price.
    known_prices = (
        product_data[[asin_column, price_column]]
        .dropna(subset=[price_column])
        .rename(columns={asin_column: attribute, price_column: viewed_price_column})
    )

    # The inner join discards references to unknown ASINs and to null prices.
    matched = references.merge(known_prices, on=attribute, how='inner')
    mean_by_row = matched.groupby(row_id_column)[viewed_price_column].mean()

    # Rows with no matched reference are absent from mean_by_row and become null.
    product_data[meanPriceAlsoViewed_column] = mean_by_row.reindex(product_data.index)

    # Calculate statistics
    mean_price = product_data[meanPriceAlsoViewed_column]
    count_total = len(product_data)
    mean_meanPriceAlsoViewed = mean_price.mean()
    # var() defaults to the sample variance (ddof=1).
    # NOTE(review): confirm the expected values use the sample variance.
    variance_meanPriceAlsoViewed = mean_price.var()
    numNulls_meanPriceAlsoViewed = mean_price.isna().sum()

    # Put results in res
    res = {
        'count_total': int(count_total),
        'mean_meanPriceAlsoViewed': float(mean_meanPriceAlsoViewed),
        'variance_meanPriceAlsoViewed': float(variance_meanPriceAlsoViewed),
        'numNulls_meanPriceAlsoViewed': int(numNulls_meanPriceAlsoViewed)
    }
    return res

In [None]:
# Run Task 1.2 on the loaded table; `res` is what the next cell compares with the expected values.
res = part2(product_data)

In [None]:
import json

# Reference statistics for Task 1.2 (path is relative to the working directory).
expected_path = Path("expected_1_2.json").expanduser()
expected = json.loads(expected_path.read_text())

task_matches = compare_dicts(expected, res)
assert task_matches, "Task 1.2 result mismatch"
print("✅ Task 1.2 output matches expected.")

# Task 1.3: Impute `price`

- Impute `price` with the *mean* and write to `meanImputedPrice`.
- Impute `price` with the *median* and write to `medianImputedPrice`.
- For `title`, replace `null` with the string `"unknown"` and write to `unknownImputedTitle`.


In [None]:
from sklearn.impute import SimpleImputer

def part3(product_data):
    """Impute missing prices and titles and summarise the imputed columns.

    Adds three columns to ``product_data`` (the frame is modified in place):

    * ``meanImputedPrice``    -- ``price`` with nulls replaced by the mean price;
    * ``medianImputedPrice``  -- ``price`` with nulls replaced by the median price;
    * ``unknownImputedTitle`` -- ``title`` with nulls replaced by ``"unknown"``.

    Args:
        product_data: DataFrame with a numeric ``price`` column and a
            ``title`` column.

    Returns:
        dict: ``count_total``; mean, variance and null count of both imputed
        price columns; and ``numUnknowns_unknownImputedTitle``, the number of
        titles equal to ``"unknown"`` after imputation.
    """
    price_column = 'price'
    title_column = 'title'

    # Outputs
    meanImputedPrice_column = 'meanImputedPrice'
    medianImputedPrice_column = 'medianImputedPrice'
    unknownImputedTitle_column = 'unknownImputedTitle'

    # Define Imputers
    mean_imputer = SimpleImputer(strategy='mean')
    median_imputer = SimpleImputer(strategy='median')

    # The imputers are given a 2-D (n_samples, n_features) array, so the price is
    # kept as a one-column matrix; the float cast turns None into NaN.
    price_matrix = product_data[[price_column]].astype('float64').to_numpy()

    # Impute the price with the mean and with the median; ravel() flattens the
    # (n, 1) result back into a single column.
    product_data[meanImputedPrice_column] = mean_imputer.fit_transform(price_matrix).ravel()
    product_data[medianImputedPrice_column] = median_imputer.fit_transform(price_matrix).ravel()

    # Fill missing titles with the literal string "unknown".
    product_data[unknownImputedTitle_column] = product_data[title_column].fillna('unknown')

    # Calculate statistics
    mean_imputed = product_data[meanImputedPrice_column]
    median_imputed = product_data[medianImputedPrice_column]
    imputed_title = product_data[unknownImputedTitle_column]

    count_total = len(product_data)
    # var() defaults to the sample variance (ddof=1).
    # NOTE(review): confirm the expected values use the sample variance.
    mean_meanImputedPrice = mean_imputed.mean()
    variance_meanImputedPrice = mean_imputed.var()
    numNulls_meanImputedPrice = mean_imputed.isna().sum()
    mean_medianImputedPrice = median_imputed.mean()
    variance_medianImputedPrice = median_imputed.var()
    numNulls_medianImputedPrice = median_imputed.isna().sum()
    # Counts filled-in titles as well as titles that were already "unknown".
    numUnknowns_unknownImputedTitle = (imputed_title == 'unknown').sum()

    # Put results in res
    res = {
        'count_total': int(count_total),
        'mean_meanImputedPrice': float(mean_meanImputedPrice),
        'variance_meanImputedPrice': float(variance_meanImputedPrice),
        'numNulls_meanImputedPrice': int(numNulls_meanImputedPrice),
        'mean_medianImputedPrice': float(mean_medianImputedPrice),
        'variance_medianImputedPrice': float(variance_medianImputedPrice),
        'numNulls_medianImputedPrice': int(numNulls_medianImputedPrice),
        'numUnknowns_unknownImputedTitle': int(numUnknowns_unknownImputedTitle),
    }
    return res

In [None]:
# Run Task 1.3 on the loaded table; `res` is what the next cell compares with the expected values.
res = part3(product_data)

In [None]:
import json

# Reference statistics for Task 1.3 (path is relative to the working directory).
expected_path = Path("expected_1_3.json").expanduser()
expected = json.loads(expected_path.read_text())

task_matches = compare_dicts(expected, res)
assert task_matches, "Task 1.3 result mismatch"
print("✅ Task 1.3 output matches expected.")

# Is that it?

To make this dataset ready for ML tasks, there are many more feature engineering steps to be done, which we haven't covered here. For example, textual data can be fed into a Word2Vec model to obtain meaningful embeddings. We've stuck with a few simple operations here. Next, in Task 2, you'll use preprocessed train and test datasets to train XGBoost models in a multi-node fashion!

In [None]:
# Shut down the Ray runtime that was started by ray.init() in the first cell.
ray.shutdown()