In [297]:
import pandas as pd
import json
from collections import Counter

# Data Exploration
This notebook explores raw_data to get a get better understanding of the data and find
potential limitations.

In [298]:
raw_data = {}

for f in [r'brands', r'receipts', r'users']:
    with open(f'data/{f}.json') as file:
        print(f"Reading {f}")
        raw_data[f] = [json.loads(line) for line in file]
        
df_brands = pd.DataFrame(raw_data['brands']).infer_objects()
df_receipts = pd.DataFrame(raw_data['receipts']).infer_objects()
df_users = pd.DataFrame(raw_data['users']).infer_objects()
# For easy iteration
dfs = {
    'receipts': df_receipts,
    'users': df_users,
    'brands': df_brands,
}

for data_name, df in dfs.items():
    print(f"{data_name}: {df.shape}")
    display(df.head())

Reading brands
Reading receipts
Reading users
receipts: (1119, 15)


Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


users: (495, 7)


Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


brands: (1167, 8)


Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,False,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,False,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,False,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,False,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,False,TEST BRANDCODE @1612366146827


## Data Schemes

### Receipts Data Schema

* **\_id:** uuid for this receipt
* **bonusPointsEarned:** Number of bonus points that were awarded upon receipt completion
* **bonusPointsEarnedReason:** event that triggered bonus points
* **createDate:** The date that the event was created
* **dateScanned:** Date that the user scanned their receipt
* **finishedDate:** Date that the receipt finished processing
* **modifyDate:** The date the event was modified
* **pointsAwardedDate:** The date we awarded points for the transaction
* **pointsEarned:** The number of points earned for the receipt
* **purchaseDate:** the date of the purchase
* **purchasedItemCount:** Count of number of items on the receipt
* **rewardsReceiptItemList:** The items that were purchased on the receipt
* **rewardsReceiptStatus:** status of the receipt through receipt validation and processing
* **totalSpent:** The total amount on the receipt
* **userId:** string id back to the User collection for the user who scanned the receipt

### Users Data Schema
* **\_id:** user Id
* **state:** state abbreviation
* **createdDate:** when the user created their account
* **lastLogin:** last time the user was recorded logging in to the app
* **role:** constant value set to 'CONSUMER'
* **active:** indicates if the user is active; only Fetch will de\-activate an account with this flag

### Brand Data Schema
* **\_id:** brand uuid
* **barcode:** the barcode on the item
* **brandCode:** String that corresponds with the brand column in a partner product file
* **category:** The category name for which the brand sells products in
* **categoryCode:** The category code that references a BrandCategory
* **cpg:** reference to CPG collection
* **topBrand:** Boolean indicator for whether the brand should be featured as a 'top brand'
* **name:** Brand name

# Sanity Checks

In [299]:
def sanity_checks(df):
    def analyze_dict(col_name):
        not_null_vals = df[col_name].dropna()
        
        # Has same keys?
        first_row_keys = set(not_null_vals.iloc[0].keys())
        
        has_same_keys = all(set(x.keys()) == first_row_keys for x in not_null_vals)
        print(f'\t{c} has same keys: {has_same_keys}')
        
        # if so, is it only 1 key?
        print(f"\t{c} is has 1 key: {len(first_row_keys) == 1}")
        if has_same_keys and len(first_row_keys) == 1:
            key_name = list(first_row_keys)[0]
            df[f"temp_{c}"] = df[c].map(lambda x: x[key_name] if not pd.isna(x) else x)
            print(f"\t{c} is unique: {df[f'temp_{c}'].is_unique}")
    
    # Other column uniqueness
    for c in df:
        is_dict = all(isinstance(x, dict) for x in df[c] if not pd.isna(x))
        if is_dict: 
            # Handle dicts specifically
            print(f'{c} is dict: {is_dict}')
            analyze_dict(c)
        else:
            try:
                # Uniqueness
                if df[c].is_unique:
                    print(f'{c} IS UNIQUE!')
                elif df[c].dropna().is_unique:
                    print(f'{c} IS UNIQUE ignoring nulls')
                else:
                    unique_items = df[c].unique()
                    if len(unique_items) < 10:
                        print(f'{c} possible vals: {unique_items}')
                    else:
                        print(f'{c} unique count: {len(unique_items)}')
                        
            except:
                print(f"{c} FAILED TO PARSE")
        
        # Null values
        null_percent = df[c].isnull().mean()
        if null_percent > 0:
            print(f'{c} HAS NULLS: %{100*null_percent:.2f}')


for data_name, df in dfs.items():
    print(data_name)
    print("----------")
    sanity_checks(df)
    print("")

receipts
----------
_id is dict: True
	_id has same keys: True
	_id is has 1 key: True
	_id is unique: True
bonusPointsEarned unique count: 13
bonusPointsEarned HAS NULLS: %51.39
bonusPointsEarnedReason unique count: 10
bonusPointsEarnedReason HAS NULLS: %51.39
createDate is dict: True
	createDate has same keys: True
	createDate is has 1 key: True
	createDate is unique: False
dateScanned is dict: True
	dateScanned has same keys: True
	dateScanned is has 1 key: True
	dateScanned is unique: False
finishedDate is dict: True
	finishedDate has same keys: True
	finishedDate is has 1 key: True
	finishedDate is unique: False
finishedDate HAS NULLS: %49.24
modifyDate is dict: True
	modifyDate has same keys: True
	modifyDate is has 1 key: True
	modifyDate is unique: False
pointsAwardedDate is dict: True
	pointsAwardedDate has same keys: True
	pointsAwardedDate is has 1 key: True
	pointsAwardedDate is unique: False
pointsAwardedDate HAS NULLS: %52.01
pointsEarned unique count: 121
pointsEarned HA


**PROBLEMS - Receipts** 
- lots of nulls in bonusPointsEarned. maybe this is 0? We should see if theres any 0s inputted
- rewardsReceiptItemList will require in detail parsing

**PROBLEMS - Users** 
- At least one of our users is not unique. Alot of these row entries look to be the same.
- lastLogin has nulls? Did these uses never log in?
- State has nulls, a little suprising but not worrying

**PROBLEMS - Brands** 
- Categories and category codes have a lot of nans, we might be able to fill these assuming there is a consistent mapping
- Top brands theres a lot of nans, we may be able to fill this, if there is a consistent mapping of brandCode to topBrand. Otherwise we could assume nan is False
- brand code has some nulls. Thats pretty suprising. I'd want to know why
- cpg will require in detail parsing

# Column Specific Investigations

## Receipts.bonusPointsEarned

Do we think we can fill these nans with 0?

In [300]:
sum(df_receipts['bonusPointsEarned'] == 0)

0

Yes

## user._id

### Matching with Receipts
Can we match User ids between users and receipts?

In [301]:
users_from_users = set(df_users['temp__id'])
users_from_receipts = set(df_receipts['userId'])

print(f"All recepit users in user data: {users_from_receipts.issubset(users_from_users)}")
print(f"Percent Missing: {100*len(users_from_receipts - users_from_users)/len(users_from_receipts):.2f}%")

All recepit users in user data: False
Percent Missing: 45.35%


This may not be a true data issue. Maybe not all our receipts are from our users. Or maybe our data snippet of the users table isn't big enough

### Duplicate keys

In [302]:
non_temp_cols = [c.replace("temp_", "") for c in df_users.columns if "temp" in c]
df_users_temp = df_users.drop(columns=non_temp_cols)

print(f"Percent Duplicate: {100*df_users_temp.duplicated(keep=False).mean():.2f}%")

Percent Duplicate: 71.31%


Its a suspicious that there are so many duplicates and so many missing users from receipts. Something may have happened to override some of the users

## Receipts.rewardsReceiptItemList
This will require some specific parsing.

In [303]:
# purchasedItemCount is not the same as len rewardsReceiptItemList
df_receipts['purchasedItemCount'] = df_receipts['rewardsReceiptItemList'].map(lambda x: len(x) if isinstance(x, list) else None)
print(sum(df_receipts['purchasedItemCount'] != df_receipts['rewardsReceiptItemList']))

1119


In [304]:
all_items = []
for items in df_receipts['rewardsReceiptItemList'].dropna():
    all_items.extend(items)
    
df_all_items = pd.DataFrame(all_items)
display(df_all_items.head())

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId
0,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,True,...,,,,,,,,,,
1,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,,...,,,,,,,,,,
2,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400642255.0,True,...,,,,,,,,,,
3,,,,,False,1,True,,4011.0,True,...,,,,,,,,,,
4,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,True,...,,,,,,,,,,


Might be good to make this an entire new table.

In [305]:
sanity_checks(df_all_items)

barcode unique count: 569
barcode HAS NULLS: %55.48
description unique count: 1890
description HAS NULLS: %5.49
finalPrice unique count: 829
finalPrice HAS NULLS: %2.51
itemPrice unique count: 829
itemPrice HAS NULLS: %2.51
needsFetchReview possible vals: [False nan True]
needsFetchReview HAS NULLS: %88.29
partnerItemId unique count: 916
preventTargetGapPoints possible vals: [True nan]
preventTargetGapPoints HAS NULLS: %94.84
quantityPurchased unique count: 14
quantityPurchased HAS NULLS: %2.51
userFlaggedBarcode possible vals: ['4011' nan '028400642255' '1234' '034100573065' '075925306254'
 '079400066619']
userFlaggedBarcode HAS NULLS: %95.14
userFlaggedNewItem possible vals: [True nan]
userFlaggedNewItem HAS NULLS: %95.35
userFlaggedPrice unique count: 14
userFlaggedPrice HAS NULLS: %95.69
userFlaggedQuantity possible vals: [ 5. nan  1.  3.  4.  2.]
userFlaggedQuantity HAS NULLS: %95.69
needsFetchReviewReason possible vals: [nan 'USER_FLAGGED' 'POINTS_GREATER_THAN_THRESHOLD']
needsFe

In [306]:
def camel_to_snake(camel_case_str):
    import re
    # Use a regular expression to insert an underscore before each uppercase letter
    # and convert the entire string to lowercase
    snake_case_str = re.sub(r'(?<!^)(?=[A-Z])', '_', camel_case_str).lower()
    return snake_case_str

for c in df_all_items:
    text = camel_to_snake(c)
    text = f"""* **{text}**\n\t* Desc: """
    print(text)

* **barcode**
	* Desc: 
* **description**
	* Desc: 
* **final_price**
	* Desc: 
* **item_price**
	* Desc: 
* **needs_fetch_review**
	* Desc: 
* **partner_item_id**
	* Desc: 
* **prevent_target_gap_points**
	* Desc: 
* **quantity_purchased**
	* Desc: 
* **user_flagged_barcode**
	* Desc: 
* **user_flagged_new_item**
	* Desc: 
* **user_flagged_price**
	* Desc: 
* **user_flagged_quantity**
	* Desc: 
* **needs_fetch_review_reason**
	* Desc: 
* **points_not_awarded_reason**
	* Desc: 
* **points_payer_id**
	* Desc: 
* **rewards_group**
	* Desc: 
* **rewards_product_partner_id**
	* Desc: 
* **user_flagged_description**
	* Desc: 
* **original_meta_brite_barcode**
	* Desc: 
* **original_meta_brite_description**
	* Desc: 
* **brand_code**
	* Desc: 
* **competitor_rewards_group**
	* Desc: 
* **discounted_item_price**
	* Desc: 
* **original_receipt_item_text**
	* Desc: 
* **item_number**
	* Desc: 
* **original_meta_brite_quantity_purchased**
	* Desc: 
* **points_earned**
	* Desc: 
* **target_price*

## Brand.cpg
This will require some specific parsing.


In [307]:
for x in df_brands['cpg'].head():
    print(x)

{'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'}
{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, '$ref': 'Cogs'}
{'$id': {'$oid': '601ac142be37ce2ead437559'}, '$ref': 'Cogs'}
{'$id': {'$oid': '601ac142be37ce2ead437559'}, '$ref': 'Cogs'}
{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, '$ref': 'Cogs'}


Are they all Cogs?

In [308]:
refs = set()
for x in df_brands['cpg']:
    refs.add(x['$ref'])
print(refs)

{'Cogs', 'Cpgs'}


Okay, we can add another column like "cpg ref type" to convey hold on to this info. Is this type consistent? If so we may want to create a new table

In [309]:
df_cpg = []
for cpg in df_brands['cpg']:
    cpg = cpg.copy()
    cpg["$id"] = cpg["$id"]["$oid"]
    df_cpg.append(cpg)
df_cpg = pd.DataFrame(df_cpg)
df_cpg = df_cpg.drop_duplicates()

df_cpg["inconsistent"] = df_cpg.duplicated(subset="$id", keep=False)
display(df_cpg[df_cpg["inconsistent"]].sort_values("$id"))

Unnamed: 0,$id,$ref,inconsistent
34,5332f5f3e4b03c9a25efd0ae,Cpgs,True
36,5332f5f3e4b03c9a25efd0ae,Cogs,True
1,5332f5fbe4b03c9a25efd0ba,Cogs,True
88,5332f5fbe4b03c9a25efd0ba,Cpgs,True
15,5332f709e4b03c9a25efd0f1,Cpgs,True
48,5332f709e4b03c9a25efd0f1,Cogs,True
99,5332f7a7e4b03c9a25efd134,Cogs,True
104,5332f7a7e4b03c9a25efd134,Cpgs,True
4,5332fa12e4b03c9a25efd1e7,Cogs,True
379,5332fa12e4b03c9a25efd1e7,Cpgs,True


Its hard to tell if this is an error or not. Thus I will keep cpg type as a column on brand.

# brands.category
Categories and category codes have lots of nans. Are their nans consistent?

In [310]:
df_cat_mapping = (df_brands[["category", "categoryCode"]]
                  .drop_duplicates()
                  .sort_values("category")
                  )

display(df_cat_mapping.head(10))

Unnamed: 0,category,categoryCode
298,Baby,BABY
20,Baby,
0,Baking,BAKING
9,Baking,
19,Beauty,
286,Beauty & Personal Care,
249,Beer Wine Spirits,BEER_WINE_SPIRITS
15,Beer Wine Spirits,
1,Beverages,BEVERAGES
41,Beverages,


Yes, we can fill these. I'm going to assume this was an error in our datasample.
We will our database, fix the mapping, then assert that it is not null

# Brands table
This table was hard to interpret. Theres a unique brand id, but the brand code, brand name is not unique? Lets see if we can use the combinitation of brand code and barcode to get a unique id.


In [331]:
# see if we can use the combinitation of brand code and barcode to get a unique id.
df_brands["brandbarcode"] = df_brands["brandCode"] + df_brands["barode"]
print(f"is unique: {df_brands['brandbarcode'].is_unique}")
print(f"is unique (ignoring nans): {df_brands['brandbarcode'].dropna().is_unique}")

is unique: False
is unique (ignoring nans): True


In [334]:
non_temp_cols = [c.replace("temp_", "") for c in df_brands.columns if "temp" in c]
df_brands_temp = df_brands.drop(columns=non_temp_cols)
df_brands_temp = df_brands_temp.sort_values("brandbarcode")
df_brands_temp = df_brands_temp[df_brands_temp['brandbarcode'].isnull()]

df_brands_temp.head()

Unnamed: 0,barcode,category,categoryCode,cpg,name,topBrand,brandCode,temp__id,brandbarcode
0,511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,False,,601ac115be37ce2ead437551,
11,511111102540,,,"{'$ref': 'Cpgs', '$id': {'$oid': '5332f5f2e4b0...",MorningStar,,,57c08106e4b0718ff5fcb02c,
18,511111317364,Baking,BAKING,"{'$ref': 'Cogs', '$id': {'$oid': '5fb28549be37...",test brand @1605535049181,False,,5fb28549be37ce522e165cb5,
23,511111303947,,,"{'$ref': 'Cpgs', '$id': {'$oid': '53e10d6368ab...",Bottled Starbucks,,,5332f5fee4b03c9a25efd0bd,
24,511111802914,,,"{'$ref': 'Cpgs', '$id': {'$oid': '5332f5ebe4b0...",Full Throttle,,,5332fa7ce4b03c9a25efd22e,


If we had codes for each brand I would make this pair a primary key