In [4]:
import pandas as pd
import json
import os
import re

# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [5]:
# customer repeat orders -customer_id

# Read the saved iSeller API response from disk and parse it into `data`.
with open("iseller_api_response.json", "r") as file:
    data = json.loads(file.read())

In [6]:
def normalize_list_dictionary(dictionary):
    """Flatten a sequence of per-order record lists into one flat list.

    Args:
        dictionary: Iterable whose elements are lists (or tuples) of records,
            one inner list per order. Elements that are not lists/tuples
            (``None``, or the float ``NaN`` pandas uses for a missing value)
            are skipped rather than raising ``TypeError``.

    Returns:
        list: Every record from every inner list, in input order, with
        ``None`` records dropped.
    """
    flattened = []
    for element in dictionary:
        # len()/indexing on None or NaN would raise, so only list-like
        # entries are expanded.
        if not isinstance(element, (list, tuple)):
            continue
        flattened.extend(order for order in element if order is not None)
    return flattened

In [7]:
def proccess_iseller_data(data):
    """Turn one iSeller API response into a frame with one row per order line.

    Args:
        data: Parsed response (mapping) whose ``'orders'`` entry holds the
            order records. The code reads ``order_id``, ``order_date``,
            ``closed_date`` and ``order_details`` from each order, and renames
            the order-level ``total_order_amount``.

    Returns:
        pandas.DataFrame: Orders left-joined with their order-detail lines on
        ``order_id``. The order-level total is exposed as
        ``grand_total_amount``; ``total_order_amount`` is the per-line total
        taken from the details. ``order_date`` and ``closed_date`` are
        converted with ``pd.to_datetime``.
    """
    orders_df = pd.DataFrame(data['orders'])

    # Flatten the nested per-order detail lists into one detail frame.
    order_details = pd.DataFrame(
        normalize_list_dictionary(orders_df["order_details"].to_list())
    )

    # Rename the order-level total so it does not collide with the line-level
    # total_order_amount coming from the details. (The target name previously
    # carried a stray leading space, ' grand_total_amount'.)
    orders_df = orders_df.rename(columns={'total_order_amount': 'grand_total_amount'})

    # Keep only the detail columns needed downstream, then attach them.
    relevant_order_details = order_details[["order_id", "product_id", "product_name", "product_type", "fulfillment_status", "quantity", "base_price", "total_order_amount"]]
    merged_orders = orders_df.merge(relevant_order_details, how="left", on="order_id")

    # Convert to correct dtypes.
    merged_orders['order_date'] = pd.to_datetime(merged_orders['order_date'])
    merged_orders['closed_date'] = pd.to_datetime(merged_orders['closed_date'])

    return merged_orders


def get_list_json(input_folder):
    """Load every ``.json`` file in a folder into one flat list.

    Args:
        input_folder: Path of the directory to scan (not recursive).

    Returns:
        list: For each ``.json`` file, taken in sorted filename order: the
        items of the file if it holds a JSON array, or the object itself if
        it holds a JSON object. Files holding any other JSON value are
        ignored.
    """
    all_data = []

    # os.listdir() gives names in arbitrary order; sorting makes the result
    # (and any frame concatenated from it) reproducible across runs.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        if isinstance(data, list):
            all_data.extend(data)
        elif isinstance(data, dict):
            all_data.append(data)

    return all_data


In [8]:
def extract_volume(product_name):
    """Return the first volume token found in a product name.

    Args:
        product_name: Product name; coerced with ``str()`` so ``None``/``NaN``
            are accepted. Matching is case-sensitive (lower-case ``ml``/``l``).

    Returns:
        str: The matched text exactly as it appears (e.g. ``'330ml'``,
        ``'1.5 l'``), or ``'unspecified'`` when no volume is present.
    """
    # The trailing \b stops the unit from matching the first letter of a
    # following word ("6 lemon" must not yield "6 l"); clean_product_name
    # uses the same boundary.
    volume_regex = r'(\d+(\.\d+)?\s*(ml|l))\b'
    match = re.search(volume_regex, str(product_name))
    return match.group(1) if match else 'unspecified'
    
def clean_product_name(product_name):
    """Remove volume tokens and channel/packaging markers from a product name.

    The input is coerced with ``str()`` before substitution, so non-string
    values come back as their string form. Whitespace left behind by a removed
    token is not trimmed here.
    """
    # Volume (word-bounded) plus the literal prefixes/suffixes to drop.
    noise = re.compile(
        r'\b\d+(\.\d+)?\s*(ml|l)\b|bli - |gof - |- resell ecer| - resell dus|-resell dus|- dus|cabang - '
    )
    text = str(product_name)
    return noise.sub('', text)


In [10]:
# Load every raw API response found under data/raw_data.
list_json_data = get_list_json('data/raw_data')
test_json = list_json_data[:5]  # first five responses; not used further in this cell

# Build one processed frame per response, then stack them into a single frame.
list_df = [proccess_iseller_data(response) for response in list_json_data]
full_iseller_data = pd.concat(list_df)

In [11]:
# Derive calendar, location and product columns in one pass. `.assign`
# evaluates its entries in order, so later entries can read columns that
# earlier entries created or replaced.
full_iseller_data = full_iseller_data.assign(
    # calendar parts of the order timestamp
    order_date=lambda frame: pd.to_datetime(frame['order_date']),
    hour=lambda frame: frame['order_date'].dt.hour,
    day=lambda frame: frame['order_date'].dt.day,
    month=lambda frame: frame['order_date'].dt.month,
    week=lambda frame: frame['order_date'].dt.isocalendar().week,
    Quarter=lambda frame: frame['order_date'].dt.quarter,
    # location = text before the first ' - ' in outlet_name (get coordinates ?)
    location=lambda frame: frame['outlet_name'].str.split(' - ').str[0],
    # standardize naming
    product_name=lambda frame: frame['product_name'].str.lower(),
    Volume=lambda frame: frame['product_name'].apply(extract_volume),
    clean_product_name=lambda frame: frame['product_name'].apply(clean_product_name).str.strip(),
)


In [12]:
# Persist the combined frame as CSV; the relative path resolves against the
# current working directory.
# NOTE(review): to_csv writes the index by default, and the frame comes from
# pd.concat without an index reset -- confirm the index column is wanted.
full_iseller_data.to_csv('full_iseller_data.csv')

In [13]:
# Report the date span covered by the combined data set.
order_dates = full_iseller_data["order_date"]
print("First Order: ", order_dates.min())
print("Latest Order: ", order_dates.max())

First Order:  2024-01-01 00:03:37
Latest Order:  2024-05-09 12:12:41


In [154]:
# full_iseller_data.sort_values(by="order_date", ascending=True).tail()

In [None]:
# TODO (open question): "anggur merah gold cap orang tua anggur merah premium" / "anggur kolesom 17.5% cap orang tua" -- are these all the same "amer cap orang tua" product?