# Data Cleaning and Preprocessing

The preprocessing begins by loading the raw acquired datasets (review and meta) for each category from disk and storing them in respective parquet files for easy storage and access. Each review dataset is then read from its parquet file into a Spark DataFrame, where it is first merged with its corresponding meta dataset and then cleaned.

In [1]:
from functools import reduce

import findspark
import pyspark
from datasets import load_from_disk
from pyspark import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, trim, when, lit, from_json, split, size, from_unixtime, year, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
# Let findspark locate the Spark installation for this kernel.
# NOTE(review): the pyspark imports in the first cell run before this call — confirm that ordering is intended.
findspark.init()

In [3]:
# One Spark session shared by every cell below: local master with 2 worker threads.
# NOTE(review): with a local[...] master the work runs inside the driver JVM, so
# spark.executor.memory likely has no effect here — confirm.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("AmazonReviews")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

In [4]:
# Starts empty; presumably meant to collect each category's cleaned DataFrame,
# but no cell visible in this part of the notebook appends to it — TODO confirm.
all_categories_cleaned = []

In [5]:
# Category names used as the suffix of the raw/parquet dataset paths below
# (e.g. "raw_review_<category>"). Order is kept exactly as originally listed.
categories = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
    "Unknown",
]

### Function: Assigning the Brand

This function parses the 'details' column for a brand. If no brand is found there, it falls back to the non-empty value in the 'store' column; if that is also missing, the brand is set to "Unknown".

In [4]:
def set_brand(df):
    """Add a ``brand`` column to ``df``.

    The brand is resolved in this order:
      1. the trimmed ``Brand`` field of the JSON held in the ``details`` column,
         when it is non-null and not empty after trimming;
      2. otherwise the trimmed ``store`` column, under the same condition;
      3. otherwise the literal ``"Unknown"``.

    Args:
        df: Spark DataFrame with ``details`` (JSON string) and ``store`` columns.

    Returns:
        The DataFrame with ``brand`` added. The temporary ``details_parsed``
        column used for the JSON lookup is dropped before returning.
    """
    # Only the "Brand" key is needed, so the schema declares just that field.
    brand_only_schema = StructType([StructField("Brand", StringType(), True)])

    json_brand = trim(col("details_parsed.Brand"))
    store_name = trim(col("store"))

    has_json_brand = col("details_parsed.Brand").isNotNull() & (json_brand != "")
    has_store_name = col("store").isNotNull() & (store_name != "")

    resolved_brand = (
        when(has_json_brand, json_brand)
        .when(has_store_name, store_name)
        .otherwise(lit("Unknown"))
    )

    return (
        df.withColumn("details_parsed", from_json(col("details"), brand_only_schema))
        .withColumn("brand", resolved_brand)
        .drop("details_parsed")
    )

### Function: Cleaning the Dataset

This function combines all the necessary cleaning tasks, including extracting the brand, which is handled by its own function call. After the review and meta datasets are merged for each category, this function is called on the merged dataset.

In [5]:
def clean_dataset(df):
    """Apply the shared cleaning steps to a merged review + meta DataFrame.

    Steps, in order:
      1. keep rows whose ``rating`` is one of 1, 2, 3, 4, 5;
      2. drop rows whose ``text`` is null or contains only whitespace;
      3. add the ``brand`` column via ``set_brand``;
      4. drop duplicate rows on (``user_id``, ``asin``, ``text``);
      5. add ``review_length``: the number of whitespace-separated tokens in
         ``text``;
      6. add ``year``: the calendar year derived from ``timestamp`` (treated as
         epoch milliseconds), or null when ``timestamp`` is null.

    Args:
        df: Spark DataFrame with the columns ``rating``, ``text``, ``user_id``,
            ``asin`` and ``timestamp``, plus ``details`` and ``store`` which
            ``set_brand`` reads.

    Returns:
        A new DataFrame with the rows filtered/deduplicated and the ``brand``,
        ``review_length`` and ``year`` columns added. The ``text`` column
        itself is left untouched.
    """
    # Strip *all* leading/trailing whitespace (tabs and newlines too; `trim`
    # only removes spaces). Used for both the blank check and the word count.
    stripped_text = regexp_replace(col("text"), r"^\s+|\s+$", "")

    df = df.filter(col("rating").isin([1, 2, 3, 4, 5]))
    df = df.filter(col("text").isNotNull() & (length(stripped_text) > 0))
    df = set_brand(df)
    df = df.dropDuplicates(["user_id", "asin", "text"])

    # Split the stripped text: splitting the raw text yields an empty token for
    # leading/trailing whitespace, which inflated the count.
    df = df.withColumn(
        "review_length",
        size(split(stripped_text, r"\s+"))
    )

    # timestamp / 1000 is a double; cast to whole seconds explicitly instead of
    # relying on an implicit double -> bigint coercion inside from_unixtime.
    epoch_seconds = (col("timestamp") / 1000).cast("long")
    df = df.withColumn(
        "year",
        when(
            col("timestamp").isNotNull(),
            year(from_unixtime(epoch_seconds))
        ).otherwise(None)
    )
    return df

### All Beauty

In [7]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_All_Beauty")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_All_Beauty")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_All_Beauty")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_All_Beauty")

Creating parquet from Arrow format:   0%|          | 0/702 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/113 [00:00<?, ?ba/s]

172622243

In [9]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_All_Beauty")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_All_Beauty")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)


### Amazon Fashion

In [None]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Amazon_Fashion")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Amazon_Fashion")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Amazon_Fashion")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Amazon_Fashion")

Creating parquet from Arrow format:   0%|          | 0/2501 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/827 [00:00<?, ?ba/s]

1144976848

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Amazon_Fashion")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Amazon_Fashion")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Appliances

In [11]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Appliances")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Appliances")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Appliances")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Appliances")

Creating parquet from Arrow format:   0%|          | 0/2129 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

249325037

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Appliances")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Appliances")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Arts Crafts and Sewing

In [13]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Arts_Crafts_and_Sewing")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Arts_Crafts_and_Sewing")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Arts_Crafts_and_Sewing")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Arts_Crafts_and_Sewing")

Creating parquet from Arrow format:   0%|          | 0/8967 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/802 [00:00<?, ?ba/s]

1893261407

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Arts_Crafts_and_Sewing")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Arts_Crafts_and_Sewing")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

### Automotive

In [15]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Automotive")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Automotive")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Automotive")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Automotive")

Creating parquet from Arrow format:   0%|          | 0/19956 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2004 [00:00<?, ?ba/s]

4700533152

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Automotive")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Automotive")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Baby Products

In [17]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Baby_Products")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Baby_Products")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Baby_Products")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Baby_Products")

Creating parquet from Arrow format:   0%|          | 0/6029 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/218 [00:00<?, ?ba/s]

603639554

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Baby_Products")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Baby_Products")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Beauty and Personal Care

In [None]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Beauty_and_Personal_Care")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Beauty_and_Personal_Care")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Beauty_and_Personal_Care")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Beauty_and_Personal_Care")

Creating parquet from Arrow format:   0%|          | 0/23912 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1029 [00:00<?, ?ba/s]

2448035609

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Beauty_and_Personal_Care")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Beauty_and_Personal_Care")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Books

In [22]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Books")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Books")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Books")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Books")

Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29476 [00:00<?, ?ba/s]

Loading dataset from disk:   0%|          | 0/28 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4449 [00:00<?, ?ba/s]

13734276777

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Books")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Books")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### CDs and Vinyl

In [24]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_CDs_and_Vinyl")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_CDs_and_Vinyl")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_CDs_and_Vinyl")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_CDs_and_Vinyl")

Creating parquet from Arrow format:   0%|          | 0/4828 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/702 [00:00<?, ?ba/s]

804107486

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_CDs_and_Vinyl")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_CDs_and_Vinyl")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Cell Phones and Accessories

In [26]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Cell_Phones_and_Accessories")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Cell_Phones_and_Accessories")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Cell_Phones_and_Accessories")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Cell_Phones_and_Accessories")

Creating parquet from Arrow format:   0%|          | 0/20813 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1289 [00:00<?, ?ba/s]

3497601110

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Cell_Phones_and_Accessories")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Cell_Phones_and_Accessories")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Clothing Shoes and Jewelry

In [28]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Clothing_Shoes_and_Jewelry")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Clothing_Shoes_and_Jewelry")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Clothing_Shoes_and_Jewelry")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Clothing_Shoes_and_Jewelry")

Loading dataset from disk:   0%|          | 0/38 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/66034 [00:00<?, ?ba/s]

Loading dataset from disk:   0%|          | 0/31 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7219 [00:00<?, ?ba/s]

15430352328

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Clothing_Shoes_and_Jewelry")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Clothing_Shoes_and_Jewelry")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Digital Music

In [30]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Digital_Music")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Digital_Music")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Digital_Music")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Digital_Music")

Creating parquet from Arrow format:   0%|          | 0/131 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/71 [00:00<?, ?ba/s]

52063488

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Digital_Music")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Digital_Music")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Electronics

In [32]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Electronics")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Electronics")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Electronics")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Electronics")

Loading dataset from disk:   0%|          | 0/34 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/43887 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1611 [00:00<?, ?ba/s]

4603611085

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Electronics")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Electronics")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Gift Cards

In [34]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Gift_Cards")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Gift_Cards")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Gift_Cards")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Gift_Cards")

Creating parquet from Arrow format:   0%|          | 0/153 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1740761

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Gift_Cards")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Gift_Cards")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Grocery and Gourmet Food

In [36]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Grocery_and_Gourmet_Food")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Grocery_and_Gourmet_Food")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Grocery_and_Gourmet_Food")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Grocery_and_Gourmet_Food")

Creating parquet from Arrow format:   0%|          | 0/14319 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/604 [00:00<?, ?ba/s]

1178355442

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Grocery_and_Gourmet_Food")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Grocery_and_Gourmet_Food")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Handmade Products

In [38]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Handmade_Products")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Handmade_Products")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Handmade_Products")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Handmade_Products")

Creating parquet from Arrow format:   0%|          | 0/665 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/165 [00:00<?, ?ba/s]

340772183

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Handmade_Products")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Handmade_Products")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Health and Household

In [40]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Health_and_Household")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Health_and_Household")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Health_and_Household")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Health_and_Household")

Creating parquet from Arrow format:   0%|          | 0/25632 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/798 [00:00<?, ?ba/s]

2152348034

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Health_and_Household")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Health_and_Household")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Health and Personal Care

In [42]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Health_and_Personal_Care")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Health_and_Personal_Care")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Health_and_Personal_Care")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Health_and_Personal_Care")

Creating parquet from Arrow format:   0%|          | 0/495 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

97345359

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Health_and_Personal_Care")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Health_and_Personal_Care")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Home and Kitchen

In [44]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Home_and_Kitchen")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Home_and_Kitchen")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Home_and_Kitchen")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Home_and_Kitchen")

Loading dataset from disk:   0%|          | 0/45 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/67410 [00:00<?, ?ba/s]

Loading dataset from disk:   0%|          | 0/21 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3736 [00:00<?, ?ba/s]

10319696112

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Home_and_Kitchen")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Home_and_Kitchen")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Industrial and Scientific

In [46]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Industrial_and_Scientific")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Industrial_and_Scientific")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Industrial_and_Scientific")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Industrial_and_Scientific")

Creating parquet from Arrow format:   0%|          | 0/5184 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/428 [00:00<?, ?ba/s]

986634833

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Industrial_and_Scientific")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Industrial_and_Scientific")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Kindle Store

In [48]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Kindle_Store")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Kindle_Store")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Kindle_Store")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Kindle_Store")

Loading dataset from disk:   0%|          | 0/25 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25578 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1592 [00:00<?, ?ba/s]

6494217841

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Kindle_Store")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Kindle_Store")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Magazine Subscriptions

In [50]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Magazine_Subscriptions")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Magazine_Subscriptions")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Magazine_Subscriptions")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Magazine_Subscriptions")

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

3247117

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Magazine_Subscriptions")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Magazine_Subscriptions")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Movies and TV

In [52]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Movies_and_TV")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Movies_and_TV")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Movies_and_TV")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Movies_and_TV")

Creating parquet from Arrow format:   0%|          | 0/17329 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/749 [00:00<?, ?ba/s]

834700736

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Movies_and_TV")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Movies_and_TV")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Musical Instruments

In [54]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Musical_Instruments")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Musical_Instruments")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Musical_Instruments")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Musical_Instruments")

Creating parquet from Arrow format:   0%|          | 0/3018 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/214 [00:00<?, ?ba/s]

553296902

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Musical_Instruments")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Musical_Instruments")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Office Products

In [56]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Office_Products")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Office_Products")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Office_Products")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Office_Products")

Creating parquet from Arrow format:   0%|          | 0/12846 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/711 [00:00<?, ?ba/s]

1878241877

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Office_Products")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Office_Products")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Patio Lawn and Garden

In [58]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Patio_Lawn_and_Garden")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Patio_Lawn_and_Garden")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Patio_Lawn_and_Garden")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Patio_Lawn_and_Garden")

Creating parquet from Arrow format:   0%|          | 0/16491 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/852 [00:00<?, ?ba/s]

2373221087

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Patio_Lawn_and_Garden")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Patio_Lawn_and_Garden")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Pet Supplies

In [60]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Pet_Supplies")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Pet_Supplies")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Pet_Supplies")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Pet_Supplies")

Creating parquet from Arrow format:   0%|          | 0/16828 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/493 [00:00<?, ?ba/s]

1362181702

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Pet_Supplies")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Pet_Supplies")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Software

In [62]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Software")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Software")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Software")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Software")

Creating parquet from Arrow format:   0%|          | 0/4881 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

232470037

In [None]:
# Read the review parquet; rename its generic `images`/`title` columns so they
# stay distinguishable from the meta columns after the join.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Software")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)
# Same treatment for the meta parquet.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Software")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join: only reviews with a matching meta row survive; then clean.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")
cleaned_df = clean_dataset(merged_df)

### Sports and Outdoors

In [64]:
# Load this category's raw review dataset from disk and keep its "full" entry.
reviews = load_from_disk("D:/BigData/raw_review_Sports_and_Outdoors")["full"]
# Persist the reviews as parquet so Spark can read them in the next cell.
reviews.to_parquet("D:/BigData/review_parquet_Sports_and_Outdoors")

# Same two steps for the category's meta dataset.
meta = load_from_disk("D:/BigData/raw_meta_Sports_and_Outdoors")["full"]
# Final expression of the cell, so the value returned by to_parquet is shown as the cell output.
meta.to_parquet("D:/BigData/meta_parquet_Sports_and_Outdoors")

Creating parquet from Arrow format:   0%|          | 0/19596 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1588 [00:00<?, ?ba/s]

3618269396

In [None]:
# Read the Sports_and_Outdoors review parquet and rename its "images"/"title"
# columns with a review_ prefix so they do not share a name with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Sports_and_Outdoors")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Sports_and_Outdoors meta parquet and apply the matching meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Sports_and_Outdoors")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)

### Subscription Boxes

In [66]:
# Convert the raw Subscription_Boxes review data to parquet: load the saved
# dataset from disk, take its "full" entry, and write it to the review parquet path.
reviews = load_from_disk("D:/BigData/raw_review_Subscription_Boxes")["full"]
reviews.to_parquet("D:/BigData/review_parquet_Subscription_Boxes")

# Same conversion for the Subscription_Boxes meta data. As the cell's last
# expression, the value returned by to_parquet is what the notebook displays
# (presumably the number of bytes written -- confirm against the datasets docs).
meta = load_from_disk("D:/BigData/raw_meta_Subscription_Boxes")["full"]
meta.to_parquet("D:/BigData/meta_parquet_Subscription_Boxes")

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1169913

In [None]:
# Read the Subscription_Boxes review parquet and rename its "images"/"title"
# columns with a review_ prefix so they do not share a name with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Subscription_Boxes")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Subscription_Boxes meta parquet and apply the matching meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Subscription_Boxes")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)

### Tools and Home Improvement

In [68]:
# Convert the raw Tools_and_Home_Improvement review data to parquet: load the
# saved dataset from disk, take its "full" entry, and write it to the review
# parquet path.
reviews = load_from_disk("D:/BigData/raw_review_Tools_and_Home_Improvement")["full"]
reviews.to_parquet("D:/BigData/review_parquet_Tools_and_Home_Improvement")

# Same conversion for the Tools_and_Home_Improvement meta data. As the cell's
# last expression, the value returned by to_parquet is what the notebook displays
# (presumably the number of bytes written -- confirm against the datasets docs).
meta = load_from_disk("D:/BigData/raw_meta_Tools_and_Home_Improvement")["full"]
meta.to_parquet("D:/BigData/meta_parquet_Tools_and_Home_Improvement")

Loading dataset from disk:   0%|          | 0/19 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/26983 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1474 [00:00<?, ?ba/s]

4276745510

In [None]:
# Read the Tools_and_Home_Improvement review parquet and rename its
# "images"/"title" columns with a review_ prefix so they do not share a name
# with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Tools_and_Home_Improvement")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Tools_and_Home_Improvement meta parquet and apply the matching
# meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Tools_and_Home_Improvement")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)

### Toys and Games

In [70]:
# Convert the raw Toys_and_Games review data to parquet: load the saved dataset
# from disk, take its "full" entry, and write it to the review parquet path.
reviews = load_from_disk("D:/BigData/raw_review_Toys_and_Games")["full"]
reviews.to_parquet("D:/BigData/review_parquet_Toys_and_Games")

# Same conversion for the Toys_and_Games meta data. As the cell's last
# expression, the value returned by to_parquet is what the notebook displays
# (presumably the number of bytes written -- confirm against the datasets docs).
meta = load_from_disk("D:/BigData/raw_meta_Toys_and_Games")["full"]
meta.to_parquet("D:/BigData/meta_parquet_Toys_and_Games")

Creating parquet from Arrow format:   0%|          | 0/16261 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/891 [00:00<?, ?ba/s]

2291740523

In [None]:
# Read the Toys_and_Games review parquet and rename its "images"/"title"
# columns with a review_ prefix so they do not share a name with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Toys_and_Games")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Toys_and_Games meta parquet and apply the matching meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Toys_and_Games")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)

### Unknown

In [73]:
# Convert the raw Unknown-category review data to parquet: load the saved
# dataset from disk, take its "full" entry, and write it to the review parquet path.
reviews = load_from_disk("D:/BigData/raw_review_Unknown")["full"]
reviews.to_parquet("D:/BigData/review_parquet_Unknown")

# Same conversion for the Unknown-category meta data. As the cell's last
# expression, the value returned by to_parquet is what the notebook displays
# (presumably the number of bytes written -- confirm against the datasets docs).
meta = load_from_disk("D:/BigData/raw_meta_Unknown")["full"]
meta.to_parquet("D:/BigData/meta_parquet_Unknown")

Loading dataset from disk:   0%|          | 0/43 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/63815 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/391 [00:00<?, ?ba/s]

548067424

In [None]:
# Read the Unknown-category review parquet and rename its "images"/"title"
# columns with a review_ prefix so they do not share a name with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Unknown")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Unknown-category meta parquet and apply the matching meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Unknown")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)

### Video Games

In [75]:
# Convert the raw Video_Games review data to parquet: load the saved dataset
# from disk, take its "full" entry, and write it to the review parquet path.
reviews = load_from_disk("D:/BigData/raw_review_Video_Games")["full"]
reviews.to_parquet("D:/BigData/review_parquet_Video_Games")

# Same conversion for the Video_Games meta data. As the cell's last expression,
# the value returned by to_parquet is what the notebook displays for this cell
# (presumably the number of bytes written -- confirm against the datasets docs).
meta = load_from_disk("D:/BigData/raw_meta_Video_Games")["full"]
meta.to_parquet("D:/BigData/meta_parquet_Video_Games")

Creating parquet from Arrow format:   0%|          | 0/4625 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/138 [00:00<?, ?ba/s]

383799939

In [None]:
# Read the Video_Games review parquet and rename its "images"/"title" columns
# with a review_ prefix so they do not share a name with the meta columns.
review_df = (
    spark.read.parquet("D:/BigData/review_parquet_Video_Games")
    .withColumnRenamed("images", "review_images")
    .withColumnRenamed("title", "review_title")
)

# Read the Video_Games meta parquet and apply the matching meta_ prefix.
meta_df = (
    spark.read.parquet("D:/BigData/meta_parquet_Video_Games")
    .withColumnRenamed("images", "meta_images")
    .withColumnRenamed("title", "meta_title")
)

# Inner join on parent_asin: only rows whose key appears in both frames are kept.
merged_df = review_df.join(meta_df, on="parent_asin", how="inner")

# Hand the merged frame to the notebook's cleaning routine.
cleaned_df = clean_dataset(merged_df)