In [9]:
import re
import pandas as pd
import numpy as np
import tannson_re

pd.set_option('display.max.rows', 4000)

paid_customer_pat = r'PAID\b.*?\bCUSTOMER'

paid_customer_compiler = re.compile(paid_customer_pat)

no_repair_keys = [
    'NO REPAIR', r'\bW[^\W_]{0,15}OUT\b', r'\b\"?NO"?\b[\W_]{0,15}FIX\b',
    r'CUSTOMER DOESN’T WANT TO \b(?:FIX|REPAIR|REPLACE)\b', 
    r'SUGGEST\b.*?\bFOR REPAIR', 'NOT TO REPAIR', 'RETURN DEVICE', 'NO ISSUE',
    r'\bCANCEL\b[^\W]ORDER\b', 'TRANSFER JOB', r'\bWON\'?T\b\s+BOOT\b', 'EXPENSIVE',
    r'\bCAN\'?T\b\s+FIX\b', 'VOID', 'NOT WORTH', 'IS WORKING', 'DAMAGED', 'COSTLY',
    'CANNOT FIX', 'RECYCLE', 'DIAGNOSTIC'
]

no_repair_pat = '|'.join(no_repair_keys)

no_repair_compiler = re.compile(no_repair_pat)

courtesy_keys = ['COURTESY', 'COMPLAIN', 'FREE', 'COMPLIMENTARY', 'WARRANTY']

courtesy_pat = '|'.join(courtesy_keys)

courtesy_compiler = re.compile(courtesy_pat)

no_tax_pat = r'CASH|CHECK'

no_tax_compiler = re.compile(no_tax_pat)

def paid_customers_fnc(cell):
    """
    Function references to Payment Method column.
    Capture scenarios of having paid customers in list objects and strings.
    Capture inclination of excluding payment method when no sale is made.
    """
    if isinstance(cell, list): # Search pattern in list objects
        return any(paid_customer_compiler.search(pym) for pym in cell)
    if isinstance(cell, str): # Search pattern in strings
        return bool(paid_customer_compiler.search(cell))
    if pd.isna(cell): # Consider absence of payment method as a no-sale
        return True
    return False # Return everything else as False
    
def n_price_coherence_fnc(cell):
    """
    Function references to Price column.
    Ensure service fees are consistent with the negative totals.
    """
    if isinstance(cell, list): # If list, return True if sum is negative
        return sum(cell) <= 0
    if isinstance(cell, float): # If float, return True if negative
        return cell <= 0
    if pd.isna(cell): # If value is null or missing, return True
        return True
    return False # Return everything else as False

def base_fnc(cell, compiler):
    """
    Function references to Service column for mask_no_repairs and mask_courtesies.
    Capture scenarios where no repair was requested or needed.
    Capture scenarios where the company gives a free repair or warranty repair.
    
    Function references to Payment Method column for mask_zero_tax.
    Capture scenarios where the payment method was cash, resulting in the zero tax.
    """
    if isinstance(cell, list): # Search pattern in list objects
        return any(compiler.search(key) for key in cell)
    if isinstance(cell, str): # Search pattern in strings
        return bool(compiler.search(cell))
    return False # Return everything else as False

def infer_payment_fnc(row):
    """ 
    Infer cash payment if...
    a) zero or null tax AND
    b) subtotal == total OR
    c) subtotal - discount == total
    
    Infer credit card payment if...
    a) tax AND
    b) subtotal + tax - discount == total
    """
    payment = row['Payment Method']
    tax = row['Tax']
    subtotal = row['Subtotal']
    discount = row['Discount']
    total = row['Total']

    if pd.notna(subtotal) and pd.notna(total):
        if (
            (pd.isna(tax) or tax == 0)
            and (
                (subtotal == total) 
                or (np.isclose(subtotal - discount, total))
            )
        ):
            payment = 'PAID CASH OR CHECK'
            return payment
        
        if (
            (pd.notna(tax) and tax > 0)
            and (
                (pd.notna(tax))
                and (np.isclose(subtotal + tax - discount, total))
            )
        ):
            payment = 'PAID CC'
            return payment

def correct_subs_and_tots(row):
    """
    Function is only applicable to cash payments.
    Replace null or zero subtotals with total value, if algebraically sound.
    Replace null or zero totals with subtotal value, if algebraically sound.
    """
    payment = row['Payment Method']
    subtotal = row['Subtotal']
    discount = row['Discount']
    total = row['Total']

    if re.search(no_tax_pat, payment):
        if (pd.notna(total) and total != 0) and (pd.isna(subtotal) or subtotal == 0):
            subtotal = total + discount
            return pd.Series({'Subtotal': subtotal, 'Total': total})
        if (pd.notna(subtotal) and subtotal != 0) and (pd.isna(total) or total == 0):
            total = subtotal - discount
            return pd.Series({'Subtotal': subtotal, 'Total': total})
    return pd.Series({'Subtotal': subtotal, 'Total': total})

def return_standardizations(entry, regex_container):
    """
    Iterate through either item_dict or services_list.
    Return the first standardized name, brand, and category if item.
    Return the first standardized name if service.
    """
    if isinstance(regex_container, dict):
        for brand, categories in regex_container.items():
            for category, rules in categories.items():
                for standardized_item, compiler in rules:
                    if compiler.search(entry):
                        return standardized_item, brand, category
        return None, None, None

    if isinstance(regex_container, list):
        for standardized_service, compiler in regex_container:
            if compiler.search(entry):
                return standardized_service
        return 'UNCATEGORIZED'
                 
def standardize_items(items, item_dict, exclude_words):
    """
    Apply item name standardizations for free text in lists and strings.
    Disregard accessories and other unrelated words captured in the exclude_words list.
    Return standardized item name, brand, and category.
    """
    standardized_items_list = []
    brands_list = []
    categories_list = []
    
    fallback = pd.Series({
        'Item_Standardized': None, 
        'Brand': None, 
        'Category': None
    })

    # Do nothing if unable to have captured the unstandardized item name in the Data Loading Phase
    if items is None:
        pass
        
    if isinstance(items, list):
        items = [item for item in items if not any(re.search(word, item) for word in exclude_words)]

        # Guard against empty lists after cleanup
        if not items:
            return fallback

        if len(items) > 1:
            for item in items:
                standardized_item, brand, category = return_standardizations(item, item_dict)
                standardized_items_list.append(standardized_item)
                brands_list.append(brand)
                categories_list.append(category)
                
            return pd.Series({
                'Item_Standardized': standardized_items_list,
                'Brand': brands_list,
                'Category': categories_list
        })

        else:
            # Guard against one item in list after cleanup
            items = items[0]
            
            standardized_item, brand, category = return_standardizations(items, item_dict)
            
            return pd.Series({
                'Item_Standardized': standardized_item,
                'Brand': brand,
                'Category': category
            })
    
    if isinstance(items, str):
        if any(re.search(word, items) for word in exclude_words):
            return fallback
            
        standardized_item, brand, category = return_standardizations(items, item_dict)
            
        return pd.Series({
            'Item_Standardized': standardized_item,
            'Brand': brand,
            'Category': category
        })
        
    return fallback

def standardize_services(services, services_list, exclude_words_2):
    """
    Apply service name standardizations for free text in lists and strings.
    Disregard unrelated words captured in the exclude_words_2 list.
    Return standardized service name.
    """
    standardized_services_list = []

    if services is None:
        pass
        
    if isinstance(services, list):
        services = [service for service in services if not any(re.search(word, service) for word in exclude_words_2)]

        if not services:
            return 'UNMAPPED'

        if len(services) > 1:
            for service in services:
                standardized_service = return_standardizations(service, services_list)
                standardized_services_list.append(standardized_service)
            return standardized_services_list

        else:
            services = services[0]
            standardized_service = return_standardizations(services, services_list)
            return standardized_service

    if isinstance(services, str):
        if any(re.search(word, services) for word in exclude_words_2):
            return 'UNMAPPED'
        standardized_service = return_standardizations(services, services_list)
        return standardized_service

    return None

In [10]:
# Initial setup:

# Load data:
o_df = pd.read_json(r'C:\Users\shuju\shus_workspace\tannson_proj\order_details.json')
m_df = pd.read_json(r'C:\Users\shuju\shus_workspace\tannson_proj\monetary_amts.json')

# Create new identifier:
m_df['Transaction Type'] = 'SALE'
voided_transactions_indices = m_df.index[m_df['File Type'] == 'V']
m_df.loc[voided_transactions_indices, 'Transaction Type'] = 'VOID' # Label voided transactions
m_df = m_df.drop(columns='File Type')

# Create new col for missing payment inferences:
m_df['Explicit Payment Method'] = 'YES'
move_col = m_df.pop('Explicit Payment Method')
m_df.insert(2, 'Explicit Payment Method', move_col)

# Correct out-of-bound date:
o_df.loc[5119, 'Date'] = pd.to_datetime('2017-05-24')
m_df.loc[5119, 'Date'] = pd.to_datetime('2017-05-24')

# Correct Subtotal:
m_df['Discount'] = m_df['Discount'].abs()
m_df['Subtotal'] = m_df['Subtotal'] + m_df['Discount']

In [11]:
# Correct negative Total discrepancies:

# Join Price and Service columns from o_df with m_df by indices
temp_df = m_df.join(o_df[['Price', 'Service']])
# Ensure no overlap with voided transactions
temp_df = temp_df.iloc[temp_df.index.difference(voided_transactions_indices)]
# Filter for negatives in Total
negative_tots = temp_df[temp_df['Total'] < 0].copy()
# Create mask for when customers are paid by Tannson
mask_negatives = negative_tots.apply(
    lambda row: paid_customers_fnc(row['Payment Method']) and n_price_coherence_fnc(row['Price']), axis=1
)

# Get indices and define appropriate labels
negative_indices = negative_tots.index[mask_negatives]
m_df.loc[negative_indices, 'Transaction Type'] = 'COMPANY PAID CUSTOMER'
m_df.loc[negative_indices, 'Payment Method'] = 'CASH TO CUSTOMER'
m_df.loc[negative_indices, 'Tax'] = 0
m_df.loc[negative_indices, 'Subtotal'] = m_df.loc[negative_indices, 'Total']
o_df.loc[negative_indices, 'Price'] = 0
o_df.loc[negative_indices, 'Service'] = 'TRADE IN'

# Correct exception cases based on manual inspection of files
m_df.loc[2960, ['Subtotal', 'Total']] = 160
m_df.loc[2960, 'Payment Method'] = 'PAID CASH'
o_df.loc[2960, 'Price'] = 160

m_df.loc[4115, ['Subtotal', 'Total']] = -30
m_df.loc[4115, 'Transaction Type'] = 'COMPANY PAID CUSTOMER'
m_df.loc[4115, 'Payment Method'] = 'CASH TO CUSTOMER'
o_df.loc[4115, 'Price'] = 0
o_df.loc[4115, 'Service'] = 'TRADE IN'

In [12]:
# Correct zero Total discrepancies:

# Filter for zeros in Total
zero_tots = temp_df[temp_df['Total'] == 0].copy()
# Create mask for no repairs
mask_no_repairs = zero_tots.apply(
    lambda row: base_fnc(row['Service'], no_repair_compiler), axis=1
)

# Get indices and define entries as no repairs
no_repair_indices = zero_tots.index[mask_no_repairs]
m_df.loc[no_repair_indices, 'Transaction Type'] = 'VOID'
# o_df.loc[no_repair_indices, 'Service'] = 'DIAGNOSIS - NO REPAIR'

# Create mask for courtesies
mask_courtesies = zero_tots.apply(
    lambda row: base_fnc(row['Service'], courtesy_compiler), axis=1
)
# Get indices and define entries as company courtesies
# Note: Overlap in no_repair_indices checked for accuracy 
courtesy_indices = zero_tots.index[mask_courtesies]
m_df.loc[courtesy_indices, 'Transaction Type'] = 'COMPANY COURTESY'
m_df.loc[courtesy_indices, 'Payment Method'] = None

# Filter for zero totals but relevant prices
w_prices = zero_tots[zero_tots['Price'] != 0].copy()
# Create mask for refunds, defined by abs. value of price == abs. value of refund amt
mask_refunds = w_prices.apply(
    lambda row: (
        abs(
            sum(row['Price']) if isinstance(row['Price'], list) # For list types
            else row['Price'] # For scalars
        )
        == abs(row['Refund Amt'])
    ),
    axis=1
)
# Get indices and void entries
refund_indices = w_prices.index[mask_refunds]
m_df.loc[refund_indices, 'Transaction Type'] = 'VOID'
# Obtain remaining indices and set Price val as the Subtotal and Total val
sum_prices_indices = w_prices.index.difference(refund_indices).to_numpy()
sum_prices = w_prices.loc[sum_prices_indices, 'Price'].apply(
    lambda cell: sum(cell) if isinstance(cell, list) else cell
)
m_df.loc[sum_prices_indices, ['Total', 'Subtotal']] = sum_prices

# Label all remaining zero tots as needing examination
exclude_indices = (
    set(no_repair_indices) | set(courtesy_indices) | set(refund_indices) | set(sum_prices_indices)
)
include_indices = zero_tots.index.difference(exclude_indices).to_numpy()
m_df.loc[include_indices, 'Transaction Type'] = 'FLAGGED — MISSING TOTAL'
m_df.loc[include_indices, 'Explicit Payment Method'] = 'NO'

m_df = m_df.drop('Refund Amt', axis=1)

In [13]:
# Correct Payment Method:

# Set no payment method for voided transactions
m_df.loc[4371, 'Transaction Type'] = 'VOID'
m_df.loc[m_df['Transaction Type'] == 'VOID', 'Payment Method'] = None

sales = m_df[m_df['Transaction Type'].str.contains('SALE')].copy()

# Standardize payment method strings
cash_compiler = re.compile('CASH')
mask_cash = sales.apply(
    lambda row: base_fnc(row['Payment Method'], cash_compiler), axis=1
)
i_cash = sales.index[mask_cash]
m_df.loc[i_cash, 'Payment Method'] = 'PAID CASH'

cc_compiler = re.compile('CC|CREDIT|CARD|VISA|AMEX|AMX|AE')
mask_cc = sales.apply(
    lambda row: base_fnc(row['Payment Method'], cc_compiler), axis=1
)
i_cc = sales.index[mask_cc]
m_df.loc[i_cc, 'Payment Method'] = 'PAID CC'

check_compiler = re.compile('CHECK|CHK')
mask_check = sales.apply(
    lambda row: base_fnc(row['Payment Method'], check_compiler), axis=1
)
i_check = sales.index[mask_check]
m_df.loc[i_check, 'Payment Method'] = 'PAID CHECK'

# Guess missing payment methods
m_inferred_pym = sales[sales['Payment Method'].isna()].copy()
m_inferred_pym['Payment Method'] = m_inferred_pym.apply(infer_payment_fnc, axis=1)

# Manual inspection of remaining files for payment method.
# Transactions 3521, 5676 paid with cash.
# Transaction 4371 voided.
# Payment status of other transactions not mentioned.
m_df.loc[[3521, 5676], 'Payment Method'] = 'PAID CASH'
m_inferred_pym = m_inferred_pym.drop(index=[3521, 5676])
m_inferred_pym.loc[m_inferred_pym['Payment Method'].isna(), 'Payment Method'] = 'UNCLASSIFIED'

# Assign inferences to m_df
m_inferred_pym_indices = m_inferred_pym.index.tolist()
m_df.loc[m_inferred_pym_indices, 'Payment Method'] = m_inferred_pym['Payment Method']
m_df.loc[m_inferred_pym_indices, 'Explicit Payment Method'] = 'NO'

# Guess ambiguous payment methods
a_inferred_pym = m_df[
    ~(m_df['Payment Method'].str.contains('PAID CASH|PAID CC|PAID CHECK|UNCLASSIFIED', na=False)) 
    & (m_df['Transaction Type'].str.contains('SALE', na=False))
    & (m_df['Payment Method'].notna())
    ].copy()
a_inferred_pym['Payment Method'] = a_inferred_pym.apply(infer_payment_fnc, axis=1)
a_inferred_pym.loc[a_inferred_pym['Payment Method'].isna(), 'Payment Method'] = 'UNCLASSIFIED'

# Assign inferences to m_df
a_inferred_pym_indices = a_inferred_pym.index.tolist()
m_df.loc[a_inferred_pym_indices, 'Payment Method'] = a_inferred_pym['Payment Method']
m_df.loc[a_inferred_pym_indices, 'Explicit Payment Method'] = 'NO'

In [14]:
# Correct algebraic discrepancies:

# Create new identifier for sales to define valid transactions in the business setting
m_df.loc[m_df['Transaction Type'].str.contains('SALE', na=False), 'Business Logic Consistency'] = True
# Set unclassified payment methods as being inapplicable for defining logic consistency
m_df.loc[m_df['Payment Method'].str.contains('UNCLASSIFIED', na=False), 'Business Logic Consistency'] = None

# Check for when Subtotal == Total w/o Tax, which violates paying by credit card rule
not_cc_logic = m_df.index[
    (m_df['Payment Method'].str.contains('PAID CC'))
    & (np.isclose
        (m_df['Subtotal'] - m_df['Discount'],
         m_df['Total'], atol=0.02)
      )
    ]
# Set transactions that violate the credit card rule to failing logic consistency
m_df.loc[not_cc_logic, 'Business Logic Consistency'] = False

# Filter for other algebraic discrepancies
tot_mismatches = m_df[
    (m_df['Transaction Type'] == 'SALE')
    & (m_df['Business Logic Consistency'] == True)
    & ~(np.isclose
        (m_df['Subtotal'] - m_df['Discount'] + m_df['Tax'],
         m_df['Total'], atol=0.02)
       )
    ].copy()

# Add Price column
tot_mismatches = tot_mismatches.join(o_df['Price'])

# Create mask where there should be zero tax
mask_zero_tax = tot_mismatches.apply(
    lambda row: (
        (base_fnc(row['Payment Method'], no_tax_compiler)) # Match on CASH or CHECK payments
        or (row['Subtotal'] == row['Total']) # Capture when Subtotal == Total
    ),
    axis=1
)

# Set tax values that match mask_zero_tax conditions to 0
zero_tax_indices = tot_mismatches.index[mask_zero_tax]
m_df.loc[zero_tax_indices, 'Tax'] = 0
tot_mismatches.loc[mask_zero_tax, 'Tax'] = 0

# Compute algebraic discrepancies again (2)
tot_mismatches = tot_mismatches[
    ~np.isclose(
        tot_mismatches['Subtotal']
        - tot_mismatches['Discount']
        + tot_mismatches['Tax'],
        tot_mismatches['Total'],
        atol=0.02
    )
    ]

# Use subtotal as total and total as subtotal if valid
tot_mismatches[['Subtotal', 'Total']] = tot_mismatches.apply(correct_subs_and_tots, axis=1)

# Transactions remain algebraically inconsistent after manual inspection ->
# Set remaining algebraic discrepancies as failing logic consistency
tot_mismatches.loc[
    ~np.isclose(
        tot_mismatches['Subtotal'] 
        - tot_mismatches['Discount'] 
        + tot_mismatches['Tax'], 
        tot_mismatches['Total'], 
        atol=0.02
    ), 'Business Logic Consistency'] = False

# Get indices of previous transactions and reassign altered columns to m_df
fin_mismatches_indices = tot_mismatches.index.tolist()
m_df.loc[fin_mismatches_indices, ['Subtotal', 'Total']] = tot_mismatches[['Subtotal', 'Total']]
m_df.loc[fin_mismatches_indices, 'Business Logic Consistency'] = tot_mismatches['Business Logic Consistency']

In [17]:
# Standardize Item and Service:

o_df_standardized = o_df.copy()
o_df_standardized[['Item_Standardized', 'Brand', 'Category', 'Service_Standardized']] = 'NEW'
o_df_standardized[['Item_Standardized', 'Brand', 'Category']] = o_df_standardized['Item'].apply(lambda row: standardize_items(row, tannson_re.item_dict, tannson_re.exclude_words))
o_df_standardized['Service_Standardized'] = o_df_standardized['Service'].apply(lambda row: standardize_services(row, tannson_re.services_list, tannson_re.exclude_words_2))

new_order = [
    'Item_Standardized', 'Brand', 'Category', 'Service', 'Service_Standardized', 'Price'
]
o_df_standardized = o_df_standardized[new_order]

o_df_standardized = o_df_standardized.explode('Service_Standardized')
# o_df_standardized = o_df_standardized.explode('Item_Standardized')
o_df_standardized['Service_Standardized'].value_counts(dropna=False).sum()
o_df_standardized.groupby('Service_Standardized', dropna=False).size().sort_values(ascending=False)

Service_Standardized
SCREEN REPLACEMENT OR REPAIR                  3175
UNCATEGORIZED                                 1003
SOFTWARE RESTORATION                           560
STARTUP REPAIR                                 512
BATTERY REPLACEMENT OR REPAIR                  499
UNMAPPED                                       435
CHARGING PORT REPLACEMENT OR REPAIR            306
HARDWARE DIAGNOSTICS                           276
DATA TRANSFER/BACKUP                           252
MISC. HARDWARE REPLACEMENT OR REPAIR           248
SOFTWARE INSTALLATION OR UPDATE                225
GENERAL DIAGNOSTICS                            170
DEVICE OR SERVICE UNLOCK                       141
CHARGING/POWER DIAGNOSTICS                     120
STORAGE DRIVE REPLACEMENT OR REPAIR            112
COVER REPLACEMENT OR REPAIR                    110
MALWARE REMOVAL                                106
SOFTWARE DIAGNOSTICS                            74
SYSTEM CUSTOMIZATION                            71
PERSONAL A

12/31/2025:
* 332 of 421 zero totals fixed. **Remaining 89** requires manual inspection or inquiry to client.

01/01/2026:
* 991 of 1076 algebraic discrepancies in totals fixed. Remaining: 85.

01/03/2026:
* Created 'Explicit Payment Type' and identified VOIDs. Tomorrow: Infer missing payment method fnc (CASH if zero or null tax and CC if tax), get indices of inferences, set EPM to 'NO'. Note: Example of reconstructing variables from noisy observational data.

01/04/2026:
* Created CASH/CHECK or CC inference fnc. Standardized payment method strings. Need to: 1) Incorporate DIAGNOSTIC and COURTESY key words found in Payment column to mask_no_repairs and mask_courtesies. 2) Standardize strings in lists (if list contains CASH, return CASH as the sole value instead of the list — might not need to create a new fnc.)

01/06/2026:
* Classified payment methods of all sales.

01/08/2026:
* Classified refunds as VOID. Set Price col value or sum of Price col values as Total.

01/11/2026:
* 1071 of 1078 algebraic discrepancies in totals fixed. **Remaining: 7**. All other entries are algebraically unsound even with manual inspection, so mark as violating business logic consistency.

01/18/2026:
* Standardized: ASUS, APPLE, SAMSUNG.

01/19/2026:
* Created function to apply item name standardizations.

01/20/2026:
* Standardized: GOOGLE, MICROSOFT, NOKIA, LG.

01/21/2026:
* Standardized: TOSHIBA, HP, LENOVO.

01/22/2026:
* Standardized: DELL, SONY.

01/25/2026:
* Standardized: ACER, AMAZON, HUAWEI, MOTOROLA, GATEWAY.

01/26/2026:
* Standardized: ALCATEL, ALIENWARE, TMOBILE PRODUCTS, CYBERPOWERPC, MSI, ONSITE SERVICES. Total: ~96% standardized. Remaining 4% grouped to OTHER.

01/27/2026:
* Initial standardization of top 25 services.

02/01/2026:
* 64.4% of Services standardized.

02/04/2026:
* Turned base standardization fnc into a wrapper + created service standardization fnc.

02/12/2026:
* Fully standardized ~83% of services. ~5% of services not captured upon initial loading of data. Chose to not standardize remaining ~12% of services because of high free text variability. 