In [None]:
# Dependencies
import pandas as pd

In [None]:
# Load the raw fast-food restaurant listing into a pandas dataframe.
# postalCode is read as text so ZIP Codes are never inferred as numbers, which
# would drop leading zeros and break the string clean-up applied to this column
# later; the ZCTA lookup file is likewise read with dtype="str".
csv_file_one = "../00_input/datafiniti-fast-food-restaurants-across-america/Datafiniti_Fast_Food_Restaurants.csv"
restaurant_df = pd.read_csv(csv_file_one, dtype={"postalCode": "str"})
restaurant_df.head()

#### Check the dataframe before transformation 

In [None]:
# Summarise column dtypes and non-null counts before any transformation.
# Original note: there are 10,000 records in the dataframe.
restaurant_df.info()

#### Drop duplicates in "keys" column so that restaurants will be unique

In [None]:
# Count the distinct values in the "keys" column.
# Original note: there are only 9343 unique restaurants out of 10,000 rows in the dataframe.
restaurant_df["keys"].nunique()

In [None]:
# Drop duplicate restaurants: keep the first row seen for each "keys" value.
# The result is rebound to the same name instead of using inplace=True, which
# pandas style guidance discourages and which prevents method chaining.
restaurant_df = restaurant_df.drop_duplicates(subset="keys", keep="first")

# Guard the invariant the following cells rely on.
assert restaurant_df["keys"].is_unique, "duplicate restaurant keys remain"

In [None]:
# Re-check the summary after de-duplication.
# Original note: the total count has been reduced from 10,000 to 9343.
restaurant_df.info()

####  Some zip_codes are in xxxxx-xxxx format. They need to be trimmed to match the standard 5-digit format: the '-' and the digits after it will be dropped.

In [None]:
# Cleanse ZIP Code: keep only the text before the first '-' and trim whitespace.
# The vectorised .str accessor passes missing values through as NaN, whereas a
# per-row x.split(...) raises AttributeError as soon as it meets a non-string.
restaurant_df["postalCode"] = (
    restaurant_df["postalCode"].str.split("-", n=1).str[0].str.strip()
)

In [None]:
# Report the longest and shortest ZIP Code; both lengths should be 5 after cleansing
zip_check = restaurant_df["postalCode"].str.len()
print(
    f"Max zip_code length is {zip_check.max()}.",
    f"Min zip_code length is {zip_check.min()}.",
    sep="\n",
)

## Data normalisation

### Normalise zip_code

If zip_code, city and state are kept in the same table, there will be a transitive dependency: city and state depend on the restaurant address but also on zip_code. Hence, to achieve Third Normal Form, we'll put zip_code, city and state into a separate table.

In [None]:
# Pull the ZIP Code, city and state columns out into their own table
zip_code_df = restaurant_df.loc[:, ["postalCode", "city", "province"]]
zip_code_df.head(n=5)

In [None]:
# Give the ZIP Code table friendlier column headings
zip_code_df = zip_code_df.rename(
    columns=dict(postalCode="zip_code", province="state")
)

In [None]:
# Check renamed columns: the summary should now list zip_code, city and state
zip_code_df.info()

Identify if there are any duplicate ZIP Codes

In [None]:
# Number of distinct ZIP Codes; a value below the row count means duplicates exist
zip_code_df["zip_code"].nunique()

In [None]:
# Drop duplicate ZIP Codes from the ZIP Code dataframe, keeping the first
# city/state seen for each code. The result is rebound instead of using
# inplace=True, which pandas style guidance discourages.
zip_code_df = zip_code_df.drop_duplicates(subset="zip_code", keep="first")

In [None]:
# Preview the ZIP Code table after duplicates were dropped
zip_code_df.head()

In [None]:
# Confirm the remaining row count and dtypes of the ZIP Code table
zip_code_df.info()

### Test zip_code against zip_code in ZCTA table

In [None]:
# Load the ZCTA lookup file, reading every column as text
zip_to_zcta_df = pd.read_csv(
    "../02_transform_zip_zcta/zcta_to_match.csv",
    dtype="str",
)
zip_to_zcta_df.head(n=5)

In [None]:
# Inspect the columns and non-null counts of the ZCTA lookup table
zip_to_zcta_df.info()

In [None]:
# Inner-join the restaurant ZIP Codes to the ZCTA lookup on zip_code
common_zip_mix = zip_code_df.merge(zip_to_zcta_df, how="inner", on="zip_code")
common_zip_mix.head(n=5)

In [None]:
# Row count and columns of the merged ZIP Code / ZCTA table
common_zip_mix.info()

In [None]:
# Show the rows whose restaurant city differs from the lookup's PO_NAME
common_zip_mix.loc[common_zip_mix["city"].ne(common_zip_mix["PO_NAME"])]

In [None]:
# Build the final ZIP Code table from the lookup, renaming PO_NAME/STATE to city/state
final_zip_code = (
    zip_to_zcta_df
    .loc[:, ["zip_code", "PO_NAME", "STATE"]]
    .rename(columns={"PO_NAME": "city", "STATE": "state"})
)

In [None]:
# Export the ZIP Code table to final_zip_code.csv (relative path), omitting the index column
final_zip_code.to_csv("final_zip_code.csv", index=False)

In [None]:
# TODO: `final_zcta` was left without a value here. The bare `final_zcta =` was a
# SyntaxError that stopped "Restart & Run All". No other visible cell reads the
# name, so the assignment is parked until its intended value is known.

#### Drop unnecessary columns

In [None]:
# Keep only the name, address and ZIP Code columns in a new dataframe
restaurant_clean_df = restaurant_df.loc[:, ["name", "address", "postalCode"]]
restaurant_clean_df.head(n=5)

In [None]:
# Extract the distinct restaurant names into a separate single-column table
restaurant_uniques = pd.DataFrame(
    {"restaurant_name": restaurant_clean_df["name"].unique()}
)
restaurant_uniques

In [None]:
# Assign sequential ids 1..N to the unique restaurant names.
# Built-in range is used because this notebook only imports pandas; the previous
# np.arange call raised NameError since numpy was never imported.
restaurant_uniques["restaurant_id"] = range(1, len(restaurant_uniques) + 1)

In [None]:
# Put the id column first, followed by the restaurant name
clean_restaurant = restaurant_uniques.loc[:, ["restaurant_id", "restaurant_name"]]
clean_restaurant.head(n=5)

In [None]:
# Export the restaurant id/name table to restaurant_id.csv (relative path), omitting the index column
clean_restaurant.to_csv('restaurant_id.csv', index=False)

In [None]:
# Attach each restaurant_id to its address rows by matching on the restaurant name
address_name_id_merge = restaurant_uniques.merge(
    restaurant_clean_df,
    how="inner",
    left_on="restaurant_name",
    right_on="name",
)

In [None]:
# Preview the merged id / name / address rows
address_name_id_merge.head()

In [None]:
# Keep only the id, address and ZIP Code columns from the merged table
short_address_name_id_merge = address_name_id_merge.loc[
    :, ["restaurant_id", "address", "postalCode"]
]
short_address_name_id_merge.head(n=5)

In [None]:
# Split each address at its first space: column 0 is the leading token, column 1 the remainder
street_no_name = short_address_name_id_merge["address"].str.split(
    pat=" ", n=1, expand=True
)
street_no_name.head(n=5)

In [None]:
# Place the split address pieces alongside the id / address / ZIP Code columns
concatenated_restaurant = pd.concat(
    objs=[short_address_name_id_merge, street_no_name],
    axis="columns",
)
concatenated_restaurant.head(n=5)

In [None]:
# Select the id, the two split-address columns (labelled 0 and 1) and the ZIP Code
final_restaurant_address = concatenated_restaurant.loc[
    :, ["restaurant_id", 0, 1, "postalCode"]
]
final_restaurant_address.head(n=5)

In [None]:
# Replace the positional labels 0/1 and postalCode with descriptive column names
clean_restaurant_address = final_restaurant_address.rename(
    columns={0: "street_no", 1: "street_name", "postalCode": "zip_code"}
)

In [None]:
# Preview the restaurant address table with its final column names
clean_restaurant_address.head()

In [None]:
# Check dtypes and non-null counts before exporting the address table
clean_restaurant_address.info()

In [None]:
# Export the restaurant address table to restaurant_address.csv (relative path), omitting the index column
clean_restaurant_address.to_csv('restaurant_address.csv', index=False)

#### Rename the columns so that column names are consistent throughout the database

In [None]:
# Align column names with the other tables: name -> restaurant_name, postalCode -> zip_code
restaurant_clean_df = restaurant_clean_df.rename(
    columns=dict(name="restaurant_name", postalCode="zip_code")
)
restaurant_clean_df

In [None]:
# Preview the renamed restaurant table
restaurant_clean_df.head()

In [None]:
# Create restaurant address dataframe to store restaurant addresses.
# restaurant_clean_df only carries restaurant_name, address and zip_code, so the
# street number / street name columns are derived here by splitting the address
# at its first space (selecting them directly raised KeyError because they were
# never created). An address without a space yields NaN for the street name.
restaurant_address_df = (
    restaurant_clean_df
    .assign(
        street_address_no=lambda df: df["address"].str.split(" ", n=1).str[0],
        street_address_name=lambda df: df["address"].str.split(" ", n=1).str[1],
    )
    [["restaurant_name", "street_address_no", "street_address_name", "zip_code"]]
)

In [None]:
# Renumber the rows from 0 and discard the old index labels.
# The result is rebound instead of using inplace=True, which pandas style
# guidance discourages.
restaurant_address_df = restaurant_address_df.reset_index(drop=True)
restaurant_address_df.head()

In [None]:
# (rows, columns) of the restaurant address table
restaurant_address_df.shape

In [None]:
# Label the row index so it serves as the restaurant address ID.
# NOTE(review): the 'resturant' spelling is reproduced exactly as it was; confirm
# nothing depends on it before correcting it.
restaurant_address_df.index = restaurant_address_df.index.rename('resturant_address_id')

In [None]:
# Preview the address table with its named index
restaurant_address_df.head()

In [None]:
# Start the restaurant-name table (restaurant ID + name) from the restaurant_name column alone
restaurant_names_df = restaurant_clean_df["restaurant_name"].to_frame()
restaurant_names_df.info()

In [None]:
# Identify if there are duplicate restaurant names: a distinct count below the
# row count means names repeat
restaurant_names_df["restaurant_name"].nunique()

In [None]:
# Keep unique names only, first occurrence wins. Limitation: matching is on the
# exact string, so spelling variants of the same restaurant remain separate names.
# The result is rebound instead of using inplace=True, which pandas style
# guidance discourages.
restaurant_names_df = restaurant_names_df.drop_duplicates(
    subset="restaurant_name", keep="first"
)

In [None]:
# Confirm the row count after dropping duplicate names
restaurant_names_df.info()

In [None]:
# Use the row index as the restaurant ID by naming it restaurant_id.
# The index is not renumbered here, so the ids are the row labels that survived
# the earlier de-duplication rather than a fresh 0..N-1 sequence.
restaurant_names_df.index = restaurant_names_df.index.rename('restaurant_id')

In [None]:
# Preview the name table with its named index
restaurant_names_df.head()

In [None]:
# Replace restaurant names in the restaurant address table with restaurant_id -- use a merge for this (the lookup attempt below is kept for reference only)
#restaurant_address_df['restaurant__name_id'] = restaurant_names_df.lookup(restaurant_names_df.index, restaurant_names_df['restaurant_name'])

In [None]:
# Expose the index values as an ordinary 'id' column so they survive a merge
restaurant_names_df = restaurant_names_df.assign(id=restaurant_names_df.index)

In [None]:
# Preview the name table including the new 'id' column
restaurant_names_df.head()

In [None]:
#test_one = pd.merge(restaurant_names_df, restaurant_address_df,left_on="restaurant_name", right_on="restaurant_name")

# Bring the 'id' column onto each address row by matching on restaurant_name
inner_join = restaurant_address_df.merge(
    restaurant_names_df,
    how='inner',
    on='restaurant_name',
)

In [None]:
# (rows, columns) of the joined table; compare with restaurant_address_df.shape
inner_join.shape

In [None]:
# Preview the address table again (the merge above wrote to inner_join, not to this frame)
restaurant_address_df.head()

In [None]:
# Export restaurant data to CSV: writes restaurant_clean_df to restaurant_clean.csv
# (relative path), omitting the index column
restaurant_clean_df.to_csv('restaurant_clean.csv', index=False)