In [14]:
import pandas as pd
from pathlib import Path
import json

In [15]:
base_dir = Path().resolve().parent

brands_file = base_dir / 'data' / 'brands.json'
users_file = base_dir / 'data' / 'users.json'
receipts_file = base_dir / 'data' / 'receipts.json'

In [16]:
def read_json_lines(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

In [17]:
brands_data = read_json_lines(brands_file)
users_data = read_json_lines(users_file)
receipts_data = read_json_lines(receipts_file)

In [18]:
def flatten_and_normalize(data, record_path=None, meta=None):
    return pd.json_normalize(data, record_path, meta, sep='_')

def reorder_and_rename_columns(df, order, column_mapping):
    df = df[order]
    df = df.rename(columns=column_mapping)
    return df

def convert_to_datetime(df, date_columns):
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], unit='ms')
    return df

In [19]:
brands_df = flatten_and_normalize(brands_data)

brands_order = [
    '_id_$oid', 'name', 'brandCode', 'barcode', 'category', 'categoryCode', 'topBrand', 'cpg_$id_$oid', 'cpg_$ref' 
]

brands_column_mapping = {
    '_id_$oid': 'brand_id',
    'barcode': 'barcode',
    'category': 'category',
    'categoryCode': 'category_code',
    'cpg_$id_$oid': 'cpg_id',
    'cpg_$ref': 'cpg_ref',
    'name': 'brand_name',
    'topBrand': 'top_brand',
    'brandCode': 'brand_code'
}

brands_df = reorder_and_rename_columns(brands_df, brands_order, brands_column_mapping)
print("No. of records: ",  len(brands_df))
brands_df.head()

No. of records:  1167


Unnamed: 0,brand_id,brand_name,brand_code,barcode,category,category_code,top_brand,cpg_id,cpg_ref
0,601ac115be37ce2ead437551,test brand @1612366101024,,511111019862,Baking,BAKING,False,601ac114be37ce2ead437550,Cogs
1,601c5460be37ce2ead43755f,Starbucks,STARBUCKS,511111519928,Beverages,BEVERAGES,False,5332f5fbe4b03c9a25efd0ba,Cogs
2,601ac142be37ce2ead43755d,test brand @1612366146176,TEST BRANDCODE @1612366146176,511111819905,Baking,BAKING,False,601ac142be37ce2ead437559,Cogs
3,601ac142be37ce2ead43755a,test brand @1612366146051,TEST BRANDCODE @1612366146051,511111519874,Baking,BAKING,False,601ac142be37ce2ead437559,Cogs
4,601ac142be37ce2ead43755e,test brand @1612366146827,TEST BRANDCODE @1612366146827,511111319917,Candy & Sweets,CANDY_AND_SWEETS,False,5332fa12e4b03c9a25efd1e7,Cogs


In [20]:
users_df = flatten_and_normalize(users_data)

users_order = [
    '_id_$oid', 'role', 'signUpSource', 'state', 'createdDate_$date', 'active', 'lastLogin_$date' 
]

users_column_mapping = {
    '_id_$oid': 'user_id',
    'active': 'active',
    'createdDate_$date': 'created_date',
    'lastLogin_$date': 'last_login',
    'role': 'role',
    'signUpSource': 'sign_up_source',
    'state': 'state'
}

users_df = reorder_and_rename_columns(users_df, users_order, users_column_mapping)
users_df = convert_to_datetime(users_df, ['created_date', 'last_login'])

print("No. of records: ",  len(users_df))
users_df.head()

No. of records:  495


Unnamed: 0,user_id,role,sign_up_source,state,created_date,active,last_login
0,5ff1e194b6a9d73a3a9f1052,consumer,Email,WI,2021-01-03 15:24:04.800,True,2021-01-03 15:25:37.857999872
1,5ff1e194b6a9d73a3a9f1052,consumer,Email,WI,2021-01-03 15:24:04.800,True,2021-01-03 15:25:37.857999872
2,5ff1e194b6a9d73a3a9f1052,consumer,Email,WI,2021-01-03 15:24:04.800,True,2021-01-03 15:25:37.857999872
3,5ff1e1eacfcf6c399c274ae6,consumer,Email,WI,2021-01-03 15:25:30.554,True,2021-01-03 15:25:30.596999936
4,5ff1e194b6a9d73a3a9f1052,consumer,Email,WI,2021-01-03 15:24:04.800,True,2021-01-03 15:25:37.857999872


In [21]:
receipts_df = flatten_and_normalize(receipts_data)

receipts_order = [
    '_id_$oid', 'userId', 'bonusPointsEarned', 'bonusPointsEarnedReason', 'createDate_$date', 
    'dateScanned_$date', 'finishedDate_$date', 'modifyDate_$date', 'pointsAwardedDate_$date', 
    'pointsEarned', 'purchaseDate_$date', 'purchasedItemCount', 'rewardsReceiptStatus', 'totalSpent'
]

receipts_column_mapping = {
    '_id.$oid': 'receipt_id',
    'userId': 'user_id',
    'bonusPointsEarned': 'bonus_points_earned',
    'bonusPointsEarnedReason': 'bonus_points_earned_reason',
    'createDate_$date': 'create_date',
    'dateScanned_$date': 'date_scanned',
    'finishedDate_$date': 'finished_date',
    'modifyDate_$date': 'modify_date',
    'pointsAwardedDate_$date': 'points_awarded_date',
    'pointsEarned': 'points_earned',
    'purchaseDate_$date': 'purchase_date',
    'purchasedItemCount': 'purchased_item_count',
    'rewardsReceiptStatus': 'rewards_receipt_status',
    'totalSpent': 'total_spent'
}

receipts_only_df = reorder_and_rename_columns(receipts_df, receipts_order, receipts_column_mapping)

receipts_only_df = convert_to_datetime(receipts_only_df, [
    'create_date', 'date_scanned', 'finished_date', 'modify_date', 'points_awarded_date', 'purchase_date'
])

receipts_only_df.head()

Unnamed: 0,_id_$oid,user_id,bonus_points_earned,bonus_points_earned_reason,create_date,date_scanned,finished_date,modify_date,points_awarded_date,points_earned,purchase_date,purchased_item_count,rewards_receipt_status,total_spent
0,5ff1e1eb0a720f0523000575,5ff1e1eacfcf6c399c274ae6,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:31,2021-01-03 15:25:36,2021-01-03 15:25:31,500.0,2021-01-03 00:00:00,5.0,FINISHED,26.0
1,5ff1e1bb0a720f052300056b,5ff1e194b6a9d73a3a9f1052,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:43,2021-01-03 15:24:48,2021-01-03 15:24:43,150.0,2021-01-02 15:24:43,2.0,FINISHED,11.0
2,5ff1e1f10a720f052300057a,5ff1e1f1cfcf6c399c274b0b,5.0,All-receipts receipt bonus,2021-01-03 15:25:37,2021-01-03 15:25:37,NaT,2021-01-03 15:25:42,NaT,5.0,2021-01-03 00:00:00,1.0,REJECTED,10.0
3,5ff1e1ee0a7214ada100056f,5ff1e1eacfcf6c399c274ae6,5.0,All-receipts receipt bonus,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:34,2021-01-03 15:25:39,2021-01-03 15:25:34,5.0,2021-01-03 00:00:00,4.0,FINISHED,28.0
4,5ff1e1d20a7214ada1000561,5ff1e194b6a9d73a3a9f1052,5.0,All-receipts receipt bonus,2021-01-03 15:25:06,2021-01-03 15:25:06,2021-01-03 15:25:11,2021-01-03 15:25:11,2021-01-03 15:25:06,5.0,2021-01-02 15:25:06,2.0,FINISHED,1.0


In [27]:
for receipt in receipts_data:
    receipt['id'] = receipt['_id']['$oid']
    if 'rewardsReceiptItemList' not in receipt:
        receipt['rewardsReceiptItemList'] = []
    for item in receipt['rewardsReceiptItemList']:
        item['receipt_id'] = receipt['id']
        
# Normalize the data to extract rewardsReceiptItemList and associated _id
items_df = flatten_and_normalize(
    receipts_data,
    record_path=['rewardsReceiptItemList'],
    meta=['id']
)

items_df.drop('id', inplace=True, axis=1)
items_df.head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId
0,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,True,...,,,,,,,,,,
1,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,,...,,,,,,,,,,
2,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400642255.0,True,...,,,,,,,,,,
3,,,,,False,1,True,,4011.0,True,...,,,,,,,,,,
4,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,True,...,,,,,,,,,,


In [28]:
items_df.columns

Index(['barcode', 'description', 'finalPrice', 'itemPrice', 'needsFetchReview',
       'partnerItemId', 'preventTargetGapPoints', 'quantityPurchased',
       'userFlaggedBarcode', 'userFlaggedNewItem', 'userFlaggedPrice',
       'userFlaggedQuantity', 'receipt_id', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'pointsEarned', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice', 'deleted', 'priceAfterCoupon',
       'metabriteCampaignId'],
      dtype='object')

In [29]:
items_barcode = set(items_df['barcode'])
brands_barcode = set(brands_df['barcode'])

known_brands = items_barcode.intersection(brands_barcode)
known_brands

{'511111001485',
 '511111001768',
 '511111003960',
 '511111004127',
 '511111101451',
 '511111104186',
 '511111104537',
 '511111204206',
 '511111502142',
 '511111518044',
 '511111602118',
 '511111704140',
 '511111802358',
 '511111901587',
 '511111902690',
 '511111904175'}

In [40]:
len(known_brands)

16

In [32]:
items_df.isna().sum()

barcode                               3851
description                            381
finalPrice                             174
itemPrice                              174
needsFetchReview                      6128
partnerItemId                            0
preventTargetGapPoints                6583
quantityPurchased                      174
userFlaggedBarcode                    6604
userFlaggedNewItem                    6618
userFlaggedPrice                      6642
userFlaggedQuantity                   6642
receipt_id                               0
needsFetchReviewReason                6722
pointsNotAwardedReason                6601
pointsPayerId                         5674
rewardsGroup                          5210
rewardsProductPartnerId               4672
userFlaggedDescription                6736
originalMetaBriteBarcode              6870
originalMetaBriteDescription          6931
brandCode                             4341
competitorRewardsGroup                6666
discountedI

In [33]:
receipts_only_df.isna().sum()

_id_$oid                        0
user_id                         0
bonus_points_earned           575
bonus_points_earned_reason    575
create_date                     0
date_scanned                    0
finished_date                 551
modify_date                     0
points_awarded_date           582
points_earned                 510
purchase_date                 448
purchased_item_count          484
rewards_receipt_status          0
total_spent                   435
dtype: int64

In [34]:
users_df.isna().sum()

user_id            0
role               0
sign_up_source    48
state             56
created_date       0
active             0
last_login        62
dtype: int64

In [35]:
brands_df.isna().sum()

brand_id           0
brand_name         0
brand_code       234
barcode            0
category         155
category_code    650
top_brand        612
cpg_id             0
cpg_ref            0
dtype: int64

In [38]:
items_brandcode = set(items_df['brandCode'])
brands_barcode = set(brands_df['brand_code'])

known_brands_codes = items_brandcode.intersection(brands_barcode)
known_brands_codes

{'ARNOLD',
 'CHEETOS',
 'CLASSICO',
 'COOL WHIP',
 'COTTONELLE',
 'CRACKER BARREL',
 'DOLE CHILLED FRUIT JUICES',
 'DORITOS',
 'FINISH',
 'GREY POUPON',
 "HELLMANN'S/BEST FOODS",
 'HUGGIES',
 'JELL-O',
 'JUST CRACK AN EGG',
 'KETTLE BRAND',
 'KLEENEX',
 'KLONDIKE',
 'KNORR',
 'KRAFT',
 'LUNCHABLES',
 'MOUNTAIN DEW',
 'NATURE VALLEY',
 'ORE-IDA',
 'OSCAR MAYER',
 'PACIFIC FOODS',
 'PEPPERIDGE FARM',
 'PEPSI',
 'PHILADELPHIA',
 'PLANTERS',
 'PREGO',
 'QUAKER',
 'RICE-A-RONI',
 'SARGENTO',
 'STOVE TOP',
 'SWANSON',
 'TACO BELL',
 'TOSTITOS',
 'V8',
 'VELVEETA',
 'VIVA',
 'YUBAN',
 nan}

In [39]:
len(known_brands_codes)

42

In [41]:
receipts_only_df['rewards_receipt_status'].unique()

array(['FINISHED', 'REJECTED', 'FLAGGED', 'SUBMITTED', 'PENDING'],
      dtype=object)