In [2]:
import pandas as pd

In [9]:
# Import data
snapshot_1 = pd.read_csv('data/snapshot_1.csv')
snapshot_2 = pd.read_csv('data/snapshot_2.csv')

# snapshot_2 uses different headers for the same fields. Rename them by name
# (not by position) so a reordered column can never be silently mislabeled.
SNAPSHOT_2_RENAMES = {
    'product_name': 'name',
    'qty': 'quantity',
    'warehouse': 'location',
    'updated_at': 'last_counted',
}
snapshot_2 = snapshot_2.rename(columns=SNAPSHOT_2_RENAMES)

# Fail loudly if the two files still disagree on schema after the rename,
# then put snapshot_2's columns in snapshot_1's order.
expected_columns = list(snapshot_1.columns)
mismatched_columns = set(expected_columns) ^ set(snapshot_2.columns)
if mismatched_columns:
    raise ValueError(
        'snapshot_2 columns do not line up with snapshot_1 after renaming: '
        f'{sorted(map(str, mismatched_columns))}'
    )
snapshot_2 = snapshot_2[expected_columns]

In [16]:
# Normalize snapshot_2's item codes in one pass:
#   1. upper-case the text so the 'SKU' prefix is matched by the pattern below
#   2. insert a dash between the 'SKU' prefix (group 1) and the digits (group 2)
# Codes that already have the dash are left alone, because the pattern
# requires the digits to follow 'SKU' directly.
snapshot_2['sku'] = (
    snapshot_2['sku']
    .str.upper()
    .str.replace(r'(SKU)(\d+)', r'\1-\2', regex=True)
)

In [27]:
# Report whether the sku and name columns are free of duplicates in each snapshot.
# Printed order: snapshot_1 sku, snapshot_2 sku, snapshot_1 name, snapshot_2 name.
uniqueness_flags = [
    frame[column].is_unique
    for column in ('sku', 'name')
    for frame in (snapshot_1, snapshot_2)
]
print(*uniqueness_flags)

True False True True


In [28]:
# Remove leading/trailing whitespace from the item names in both snapshots
for frame in (snapshot_1, snapshot_2):
    frame['name'] = frame['name'].str.strip()

In [29]:
# Collect every snapshot_2 row whose sku occurs more than once
# (keep=False flags all occurrences, not only the repeats)
repeated_sku_mask = snapshot_2['sku'].duplicated(keep=False)
df_duplicates = snapshot_2.loc[repeated_sku_mask]
df_duplicates

Unnamed: 0,sku,name,quantity,location,last_counted
42,SKU-045,Multimeter Professional,23.0,Warehouse A,2024-01-15
52,SKU-045,Multimeter Pro,-5.0,Warehouse B,2024-01-15


In [33]:
# Collapse rows that share a sku into a single record per sku.
# Every column keeps the value from the first row of its group, except
# quantity, which is summed across the group. The official name is unknown,
# so the first recorded name is used.
# NOTE(review): the one duplicated sku displayed above (SKU-045) has rows in
# two different warehouses and one negative quantity. Summing the quantities
# and keeping only the first location hides both facts; confirm this is intended.
agg_rules = dict(
    sku='first',
    name='first',
    quantity='sum',
    location='first',
    last_counted='first',
)

rows_by_sku = snapshot_2.groupby('sku', as_index=False)
snapshot_2 = rows_by_sku.agg(agg_rules)

In [34]:
# Sanity check: after merging, snapshot_2 must hold exactly one row per sku.
# Asserting (instead of only displaying the flag) makes a full re-run stop
# here if duplicates survive, rather than carrying them into later cells.
sku_is_unique = snapshot_2['sku'].is_unique
assert sku_is_unique, 'snapshot_2 still contains duplicated sku values'
sku_is_unique

True

In [25]:
# Eyeball the raw item names in snapshot_1 (e.g. to spot stray whitespace).
# NOTE(review): this cell's execution count (25) is lower than that of the
# cleaning cells earlier in the notebook, so the saved output presumably
# predates them; re-run top to bottom to confirm.
snapshot_1['name'].values

<StringArray>
[                 'Widget A',                  'Widget B',
                'Gadget Pro',               'Gadget Lite',
       'Connector Cable 6ft',      'Connector Cable 10ft',
         'Power Supply Unit',     'Power Supply Unit Pro',
    'Mounting Bracket Small',    'Mounting Bracket Large',
           'LED Panel 12x12',           'LED Panel 24x24',
        'Thermal Paste Tube',          'Cooling Fan 80mm',
         'Cooling Fan 120mm',            'USB Hub 4-Port',
            'USB Hub 7-Port',       'Ethernet Cable Cat5',
       'Ethernet Cable Cat6',      'Ethernet Cable Cat6a',
            'HDMI Cable 3ft',            'HDMI Cable 6ft',
           'HDMI Cable 10ft',         'DisplayPort Cable',
                 'VGA Cable',                 'DVI Cable',
         'Audio Cable 3.5mm',           'Audio Cable RCA',
       'Optical Audio Cable',  'Surge Protector 6-Outlet',
 'Surge Protector 12-Outlet',       'Extension Cord 10ft',
       'Extension Cord 25ft',             

In [24]:
# Eyeball the raw item names in snapshot_2 (e.g. to spot stray whitespace).
# NOTE(review): the saved output still shows padded names such as ' Widget B',
# and the execution count (24) is lower than the strip cell's, so this output
# presumably predates the cleaning; re-run top to bottom to confirm.
snapshot_2['name'].values

<StringArray>
[                 'Widget A',                 ' Widget B',
                'Gadget Pro',               'Gadget Lite',
       'Connector Cable 6ft',      'Connector Cable 10ft',
         'Power Supply Unit',     'Power Supply Unit Pro',
    'Mounting Bracket Small',   'Mounting Bracket Large ',
           'LED Panel 12x12',           'LED Panel 24x24',
        'Thermal Paste Tube',          'Cooling Fan 80mm',
         'Cooling Fan 120mm',            'USB Hub 4-Port',
            'USB Hub 7-Port',       'Ethernet Cable Cat5',
       'Ethernet Cable Cat6',      'Ethernet Cable Cat6a',
          ' HDMI Cable 3ft ',            'HDMI Cable 6ft',
           'HDMI Cable 10ft',         'DisplayPort Cable',
         'Audio Cable 3.5mm',           'Audio Cable RCA',
       'Optical Audio Cable',  'Surge Protector 6-Outlet',
 'Surge Protector 12-Outlet',       'Extension Cord 10ft',
       'Extension Cord 25ft',               'Power Strip',
          'Cable Ties 100pk',          'Ca

Identifies:

Items present in both snapshots (and whether quantities changed)

Items only in snapshot 1 (removed/sold out)

Items only in snapshot 2 (newly added)

Any data quality issues worth flagging


Data issues:

    column names differ between the files (snapshot 1 -> snapshot 2)
        name -> product_name
        quantity -> qty
        location -> warehouse
        last_counted -> updated_at
    sku
        inconsistent format (dash missing between 'SKU' and the digits)
        inconsistent letter case (not always upper-case)
    name
        leading/trailing whitespace