### Import Necessary Packages

In [19]:
import pandas as pd
import json

### Load data from a JSON Lines file (one JSON document per line)

In [20]:
# Load the brands export. The file is parsed as newline-delimited JSON: every
# line is decoded as its own document rather than calling json.load on the file.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform default
# - blank lines (e.g. a trailing newline at end of file) are skipped instead of
#   making json.loads raise JSONDecodeError
with open('brands.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Flatten nested objects into dotted column names (e.g. "_id.$oid"), then give
# the flattened key columns short, readable names. rename() returns a new frame,
# so the result is assigned directly instead of mutating with inplace=True.
df = pd.json_normalize(data).rename(columns={
    "_id.$oid": "id",
    "cpg.$id.$oid": "cpg_id",
    "cpg.$ref": "cpg_ref"
})

# Inspect column dtypes / non-null counts, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   barcode       1167 non-null   object
 1   category      1012 non-null   object
 2   categoryCode  517 non-null    object
 3   name          1167 non-null   object
 4   topBrand      555 non-null    object
 5   id            1167 non-null   object
 6   cpg_id        1167 non-null   object
 7   cpg_ref       1167 non-null   object
 8   brandCode     933 non-null    object
dtypes: object(9)
memory usage: 82.2+ KB


Unnamed: 0,barcode,category,categoryCode,name,topBrand,id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


### Data Quality Check


In [21]:
# Row-level checks: number of fully duplicated rows and null count per column
duplicates_count = df.duplicated().sum()
missing_values = df.isna().sum()

# Key integrity: every row should carry a distinct id, so the gap between the
# row count and the number of distinct ids is the number of repeated ids
total_ids = df.shape[0]
unique_ids = df['id'].nunique()
non_unique_ids = total_ids - unique_ids

# Categorical fields: distinct values in order of first appearance.
# Note that NaN is kept for 'category' but filtered out for 'categoryCode'.
unique_categories = df['category'].unique()
unique_category_codes = df.loc[df['categoryCode'].notna(), 'categoryCode'].unique()

# Completeness of the barcode column: count of rows with no barcode
missing_barcodes = df['barcode'].isna().sum()

### Data Quality Report

In [22]:
# Output data quality report
data_quality_report = {
    "Duplicate Rows": duplicates_count,
    "Missing Values by Column": missing_values.to_dict(),
    "Non-Unique IDs": non_unique_ids,
    "Unique Categories": unique_categories.tolist(),
    "Unique Category Codes": unique_category_codes.tolist(),
    "Missing Barcodes": missing_barcodes
}

print("Data Quality Report:")
for key, value in data_quality_report.items():
    print(f"{key}: {value}")

Data Quality Report:
Duplicate Rows: 0
Missing Values by Column: {'barcode': 0, 'category': 155, 'categoryCode': 650, 'name': 0, 'topBrand': 612, 'id': 0, 'cpg_id': 0, 'cpg_ref': 0, 'brandCode': 234}
Non-Unique IDs: 0
Unique Categories: ['Baking', 'Beverages', 'Candy & Sweets', 'Condiments & Sauces', 'Canned Goods & Soups', nan, 'Magazines', 'Breakfast & Cereal', 'Beer Wine Spirits', 'Health & Wellness', 'Beauty', 'Baby', 'Frozen', 'Grocery', 'Snacks', 'Household', 'Personal Care', 'Dairy', 'Cleaning & Home Improvement', 'Deli', 'Beauty & Personal Care', 'Bread & Bakery', 'Outdoor', 'Dairy & Refrigerated']
Unique Category Codes: ['BAKING', 'BEVERAGES', 'CANDY_AND_SWEETS', 'HEALTHY_AND_WELLNESS', 'GROCERY', 'PERSONAL_CARE', 'CLEANING_AND_HOME_IMPROVEMENT', 'BEER_WINE_SPIRITS', 'BABY', 'BREAD_AND_BAKERY', 'OUTDOOR', 'DAIRY_AND_REFRIGERATED', 'MAGAZINES', 'FROZEN']
Missing Barcodes: 0
