In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Path of the cleaned, merged dataset; later cells write to and re-read from it.
OUTPUT_FILE_CLEAN = '../dataset/master_data_cleaned.csv'

In [None]:
# Load the first source CSV into df_1 and preview its first rows.
df_1 = pd.read_csv("../dataset/amazon_products.csv")
df_1.head()

In [None]:
# Load the second source CSV into df_2 and preview its first rows.
df_2 = pd.read_csv("../dataset/amz_uk_processed_data.csv")
df_2.head()

In [None]:
# List the column names of the first source table.
df_1.columns

In [None]:
# List the column names of the second source table.
df_2.columns

In [None]:
def clean_reviews(review_str):
    """Convert a raw review-count value into a non-negative integer.

    Parameters
    ----------
    review_str : scalar
        Raw value of a review-count cell: a string such as ``"1,234"``,
        an integer, a float, or a missing value.

    Returns
    -------
    int
        The count as an integer. Missing values, non-finite floats and
        inputs that contain no digits yield ``0``. Any sign is discarded.
    """
    if pd.isna(review_str):
        return 0

    # Floats must not take the string path below: str(1234.0) is '1234.0',
    # and stripping the dot would turn it into 12340 (a ten-fold inflation).
    if isinstance(review_str, (float, np.floating)):
        if not np.isfinite(review_str):
            return 0
        return abs(int(review_str))

    # Everything else: keep only the digit characters (drops thousands
    # separators, surrounding text, signs, ...).
    cleaned = re.sub(r'[^\d]', '', str(review_str))
    return int(cleaned) if cleaned else 0

In [None]:
def create_and_clean_master_dataset(df1=None, df2=None, output_path=None):
    """Combine two product tables, clean the result and save it as CSV.

    Steps, in order:
      1. Concatenate both frames row-wise and drop rows whose ``asin``
         repeats (the first occurrence is kept).
      2. Lower-case and strip the text columns that are present
         (``title``, ``categoryName``, ``category_id``).
      3. Coerce ``reviews``, ``stars``, ``price``, ``listPrice`` and
         ``boughtInLastMonth`` to numbers; missing or unparseable values
         become 0, and ``stars`` is clipped to the range 0-5.
      4. Drop rows without ``asin`` or ``title`` and rows whose price is
         not strictly positive.
      5. Write the result to ``output_path`` without the index column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame, optional
        Tables to merge. When omitted, the notebook-level ``df_1`` and
        ``df_2`` are used.
    output_path : str, optional
        Destination CSV path. When omitted, ``OUTPUT_FILE_CLEAN`` is used.

    Returns
    -------
    None
        The cleaned data is written to disk; progress is printed.
    """
    # Fall back to the notebook-level objects so a bare call keeps working.
    if df1 is None:
        df1 = df_1
    if df2 is None:
        df2 = df_2
    if output_path is None:
        output_path = OUTPUT_FILE_CLEAN

    print(f"df1: {len(df1)} row | df2: {len(df2)} row")

    df_master = pd.concat([df1, df2], ignore_index=True)
    print(f"Master Dataset has been join. Total datasest: {len(df_master)}")

    df_master.drop_duplicates(subset=['asin'], keep='first', inplace=True)
    print(f"Total baris unik setelah penghapusan duplikat: {len(df_master)}")

    text_cols = ['title', 'categoryName', 'category_id']
    for col in text_cols:
        if col in df_master.columns:
            raw = df_master[col]
            normalized = raw.astype(str).str.lower().str.strip()
            # Restore missing values from the original null positions rather
            # than by matching the literal text 'nan', which would also wipe
            # out genuine values that happen to read "nan".
            df_master[col] = normalized.where(raw.notna())

    df_master['reviews'] = df_master['reviews'].apply(clean_reviews)
    df_master['stars'] = pd.to_numeric(df_master['stars'], errors='coerce').fillna(0.0).clip(0.0, 5.0)
    df_master['price'] = pd.to_numeric(df_master['price'], errors='coerce').fillna(0.0)
    df_master['listPrice'] = pd.to_numeric(df_master['listPrice'], errors='coerce').fillna(0.0)

    df_master['boughtInLastMonth'] = pd.to_numeric(df_master['boughtInLastMonth'], errors='coerce').fillna(0).astype(int)
    df_master.dropna(subset=['asin', 'title'], inplace=True)
    # Prices that were missing/unparseable were set to 0 above, so this also
    # removes rows without a usable price.
    df_master = df_master[df_master['price'] > 0]
    df_master.to_csv(output_path, index=False)
    print(f"Master Data yang sudah dibersihkan disimpan di: {output_path}")

In [None]:
# Run the merge-and-clean step; it writes its result to OUTPUT_FILE_CLEAN.
create_and_clean_master_dataset()

In [None]:
# Read the cleaned CSV back from disk and preview its first rows.
data = pd.read_csv(OUTPUT_FILE_CLEAN)
data.head()

In [None]:
# Remove both category columns in a single call. errors='ignore' keeps the
# cell safe to re-run (or to run on a file lacking one of the columns)
# instead of raising KeyError once the columns are already gone.
data = data.drop(columns=['category_id', 'categoryName'], errors='ignore')
data.head()

In [None]:
# Overwrite the cleaned CSV with the current contents of `data` (index not written).
data.to_csv(OUTPUT_FILE_CLEAN, index=False)