In [9]:
'''
1. Summary of Normalization Steps
Step	Action
1	Keep only key fields
2	Load JSON into DataFrame
3	Lowercase, trim text
4	Split categories into lists
5	Remove duplicates

6	Drop rows with missing critical fields (dropped all rows with null values)
________________________________________________________________________________
7	Validate ZIP codes & coordinates
8	Optional: normalize addresses & remove accents
9	Save clean CSV

2. Gather data from different sources and conflate so entries refer to same place


Step 1: Identify Key Fields

From the Yelp JSON, keep only the attributes you care about:

business_id → unique identifier
name → business name
address → street address
city → city name
state → 2-letter state
postal_code → ZIP code
latitude & longitude → geolocation
stars → rating
review_count → number of reviews
categories → business categories

'''

import pandas as pd
import json
import numpy as np
from unidecode import unidecode
from pathlib import Path
import re


def clean_text(x):
    if pd.isnull(x) or str(x).strip() == "":
        return np.nan
    return unidecode(str(x).strip().lower())




def normalize_yelp_json(input_file):
    df = pd.read_json(input_file, lines=True)

    key_fields = [
        "business_id", "name", "address", "city", "state",
        "postal_code", "latitude", "longitude","categories"
    ]
    #select only the key fields we highlighted
    df = df[key_fields]
    #cleaning the data from these fields
    text_columns = ["name", "address", "city", "state"]
    for col in text_columns:

        df[col] = df[col].apply(clean_text)



    df.dropna(subset=["name", "address"], inplace=True)
    df['categories'] = df['categories'].apply(
        lambda x: [clean_text(c) for c in x.split(',')] if pd.notnull(x) else []
    )
    
    return df

normalize_yelp_json('../data/sample_data.json')

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,categories
1,mpf3x-BjTdTEA3yCZrAYPw,the ups store,87 grasso plaza shopping center,affton,mo,63123,38.551126,-90.335695,"[shipping centers, local services, notaries, m..."
2,tUFrWirKiKi_TAnsVWINQQ,target,5255 e broadway blvd,tucson,az,85711,32.223236,-110.880452,"[department stores, shopping, fashion, home & ..."
3,MTSW4McQd7CbVtyjqoe9mw,st honore pastries,935 race st,philadelphia,pa,19107,39.955505,-75.155564,"[restaurants, food, bubble tea, coffee & tea, ..."
4,mWMc6_wTdE0EUBKIGXDVfA,perkiomen valley brewery,101 walnut st,green lane,pa,18054,40.338183,-75.471659,"[brewpubs, breweries, food]"
5,mWMc6_wTdE0EUBKIGXDVfA,perkiomen valley brewery,101 walnut st,green lane,pa,18054,40.338183,-75.471659,"[brewpubs, breweries, food]"
6,mWMc6_wTdE0EUBKIGXDGfA,perkiomen valley brewery,101 walnut st,green lane,pa,18054,40.338183,-75.471659,"[brewpubs, breweries, food]"
