In [7]:
import pandas as pd
import os

In [31]:
class CustomerDataExtractor:
    """Flatten nested customer -> orders -> items records into one row per item.

    Typical use is ``read_data()`` followed by ``transform_data()``.
    ``read_data`` fills ``customer_orders`` (from a pickle file) and
    ``vip_customer_ids`` (from a text file with one integer id per line);
    ``transform_data`` turns those into a typed, sorted ``pandas.DataFrame``.

    Attributes:
        customer_orders_path: Path of the pickled customer records, or None.
        vip_customers_path: Path of the VIP id text file, or None.
        customer_orders: Iterable of customer dicts; empty until loaded.
        vip_customer_ids: Set of integer customer ids flagged as VIP.
    """

    # Sentinel stored in ``order_id`` / ``product_id`` when the raw id is not
    # a genuine integer, so both columns can stay plain ``int64``.
    MISSING_ID = -1

    # Numeric category code -> label; any other code is reported as "Misc".
    CATEGORY_LABELS = {
        1: "Electronics",
        2: "Apparel",
        3: "Books",
        4: "Home Goods",
    }

    # Output columns, in order, with the dtype each one is cast to.
    OUTPUT_DTYPES = {
        'customer_id': 'int64',
        'customer_name': 'string',
        'registration_date': 'datetime64[ns]',
        'is_vip': 'bool',
        'order_id': 'int64',
        'order_date': 'datetime64[ns]',
        'product_id': 'int64',
        'product_name': 'string',
        'category': 'string',
        'unit_price': 'float64',
        'item_quantity': 'int64',
        'total_item_price': 'float64',
        'total_order_value_percentage': 'float64',
    }

    def __init__(self, customer_orders=None, vip_customers=None):
        """Remember the two input paths; no file is touched until ``read_data``.

        Args:
            customer_orders: Path to the pickled customer/order records.
            vip_customers: Path to the text file listing VIP customer ids.
        """
        self.customer_orders_path = customer_orders
        self.vip_customers_path = vip_customers
        self.customer_orders = []
        self.vip_customer_ids = set()

    def read_data(self):
        """Load the customer records and the VIP id set from disk.

        A missing (or unset) orders file is tolerated and leaves
        ``customer_orders`` unchanged. The VIP file is mandatory.

        Raises:
            FileNotFoundError: If the VIP path is unset or does not exist.
            ValueError: If a non-blank line of the VIP file is not an integer.
        """
        orders_path = self.customer_orders_path
        # ``os.path.exists(None)`` raises TypeError, so test for None first.
        if orders_path is not None and os.path.exists(orders_path):
            # NOTE(review): unpickling can execute arbitrary code; only load
            # pickle files that come from a trusted source.
            self.customer_orders = pd.read_pickle(orders_path)

        vip_path = self.vip_customers_path
        if vip_path is None or not os.path.exists(vip_path):
            raise FileNotFoundError(f"VIP customers file not found: {vip_path}")
        with open(vip_path, 'r') as f:
            # Blank lines are skipped; surrounding whitespace is ignored.
            self.vip_customer_ids = {int(line.strip()) for line in f if line.strip()}

    @staticmethod
    def _coerce_id(raw_id):
        """Return ``raw_id`` as an int, or ``MISSING_ID`` if it is not a real int."""
        # bool is a subclass of int; True/False are never valid identifiers.
        if isinstance(raw_id, int) and not isinstance(raw_id, bool):
            return int(raw_id)
        return CustomerDataExtractor.MISSING_ID

    @staticmethod
    def _to_float(value):
        """Convert ``value`` to float, falling back to 0.0 when it is unusable."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    @staticmethod
    def _to_int(value):
        """Convert ``value`` to int, falling back to 0 when it is unusable."""
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers float('inf'); ValueError covers NaN and text.
            return 0

    def transform_data(self):
        """Build the flat per-item DataFrame from ``customer_orders``.

        Each item of each order of each customer becomes one row. Ids that
        are not integers are replaced by ``MISSING_ID``; unparsable prices
        and quantities become 0; unparsable dates become ``NaT``. The
        percentage column is the item's line total as a share of the order
        total, or 0.0 when the order total is zero or unparsable.

        Returns:
            pandas.DataFrame: Columns and dtypes as in ``OUTPUT_DTYPES``,
            sorted by customer, order and product id with a fresh index.
            The frame is empty (but fully typed) when there is nothing to
            flatten.
        """
        rows = []

        for customer in self.customer_orders:
            customer_id = int(customer['id'])
            customer_name = customer['name']
            registration_date = pd.to_datetime(customer['registration_date'], errors='coerce')
            is_vip = customer_id in self.vip_customer_ids

            orders = customer.get('orders')
            if orders is None:
                # A missing or null order list means "no orders".
                orders = []

            for order in orders:
                order_id = self._coerce_id(order['order_id'])
                order_date = pd.to_datetime(order['order_date'], errors='coerce')
                order_total_value = self._to_float(order.get('order_total_value', 0))

                items = order.get('items')
                if items is None:
                    items = []

                for item in items:
                    unit_price = self._to_float(item['price'])
                    quantity = self._to_int(item['quantity'])
                    line_total = unit_price * quantity

                    # Guard against division by a zero order total.
                    if order_total_value:
                        share_of_order = (line_total / order_total_value) * 100
                    else:
                        share_of_order = 0.0

                    rows.append({
                        'customer_id': customer_id,
                        'customer_name': customer_name,
                        'registration_date': registration_date,
                        'is_vip': is_vip,
                        'order_id': order_id,
                        'order_date': order_date,
                        'product_id': self._coerce_id(item['item_id']),
                        'product_name': item['product_name'],
                        'category': self.CATEGORY_LABELS.get(item['category'], "Misc"),
                        'unit_price': unit_price,
                        'item_quantity': quantity,
                        'total_item_price': line_total,
                        'total_order_value_percentage': share_of_order,
                    })

        # Passing ``columns`` keeps the schema even when ``rows`` is empty;
        # without it an empty frame has no columns and the casts below fail.
        dataframe = pd.DataFrame(rows, columns=list(self.OUTPUT_DTYPES))
        dataframe = dataframe.astype(self.OUTPUT_DTYPES)

        return (
            dataframe
            .sort_values(by=['customer_id', 'order_id', 'product_id'])
            .reset_index(drop=True)
        )


In [32]:
# Run the pipeline end to end: load both input files, then flatten the
# nested records into one row per purchased item.
extractor = CustomerDataExtractor(
    customer_orders='customer_orders.pkl',
    vip_customers='vip_customers.txt',
)
extractor.read_data()
final_df = extractor.transform_data()

# Print the column dtypes first, then a preview of the leading rows.
for summary in (final_df.dtypes, final_df.head()):
    print(summary)

customer_id                              int64
customer_name                           string
registration_date               datetime64[ns]
is_vip                                    bool
order_id                                 int64
order_date                      datetime64[ns]
product_id                               int64
product_name                            string
category                                string
unit_price                             float64
item_quantity                            int64
total_item_price                       float64
total_order_value_percentage           float64
dtype: object
   customer_id customer_name   registration_date  is_vip  order_id  \
0            1    Customer 1 2022-12-31 04:19:19    True        -1   
1            1    Customer 1 2022-12-31 04:19:19    True        -1   
2            1    Customer 1 2022-12-31 04:19:19    True        -1   
3            1    Customer 1 2022-12-31 04:19:19    True         3   
4            2    Custome