# Cleaning Yelp Dataset

In [1]:
import sys
import json
sys.path.append("..")

import mongo_manager
import pandas as pd

def find_restaurants_by_attributes(attributes: str) -> bool:
    """
    This method returns true if the business attributes containt restaurant as True
    :param attributes: string representing json data point
    :return: boolean
    """
    if attributes is None:
        return False
    restaurant = dict(attributes)
    if restaurant.get("Restaurants", False) == "True":
        return True
    return False
    
def find_restaurants_by_categories(categories: str) -> bool:
    """
    This method returns true if the business categories contain Restaurant
    :param attributes: string representin json data point
    :return: boolean
    """
    if categories is None:
        return False
    return any(i in ["restaurant", "restaurants"]  for i in [i.lower() for i in categories.split(",")])


def define_restaurants_by_category(new_df: pd.DataFrame, index: int, categories: str) -> None:
    """
    This method sets a categorical column to 1 or True based on the index
    and categories passed to it
    :param new_df: The dataframe to modify
    :param index: The index of the row to modify
    :param categories: A string of categories delimited by a comma to be set to 1
    :return: void
    """
    if categories is None:
        return
    for category in categories.split(","):
        new_df.loc[index, category.strip()] = 1

def define_restaurants_by_attributes(new_df: pd.DataFrame, index: int, attributes: str) -> None:
    """
    This method sets a categorical column to 1 or True based on the index
    and attributes passed to it
    :param new_df: The dataframe to modify
    :param index: The index of the row to modify
    :param categories: An object of attributes
    :return: void
    """
    if attributes is None:
        return
    for attribute, value in dict(attributes).items():
        obj = eval(value)
        if type(obj) == bool and obj == True:
            new_df.loc[index, attribute.strip()] = 1
        elif type(obj) == dict:
            for i,v in obj.items():
                if v == True:
                    new_df.loc[index, f"{attribute}_{i}"] = 1
        else:
            if obj:
                new_df.loc[index, f"{attribute}_{obj}"] = 1


Connection Successful MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)
Databases:
admin
config
local
yelp_data


In [2]:
mongo_manager.utils.list_collections("yelp_data")

cleaned_restaurant_dataset
business_data


In [3]:
# pull yelp data from the database
business_data_df = mongo_manager.utils.read_collection("yelp_data", "business_data")

In [4]:
# create a new dataframe from the old which only contains restaurant related data
restaurants_df =  \
    business_data_df.loc[business_data_df.categories.apply(lambda x: find_restaurants_by_categories(x))  == True]

In [5]:
# reset index to merge later
restaurants_df.reset_index(inplace=True, drop=True)

In [6]:
# inspect the dataframe
restaurants_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,62f293eb8838871c65b871f4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
1,62f293eb8838871c65b87208,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,1,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Restaurants, Italian","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
2,62f293eb8838871c65b8720c,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,4.0,161,0,"{'RestaurantsGoodForGroups': 'True', 'HappyHou...","Restaurants, Japanese, Seafood","{'Wednesday': '16:0-22:0', 'Thursday': '16:0-2..."
3,62f293eb8838871c65b87215,2xVsWBNFwZOxIOdd9Mwnww,Cheeseburger In Paradise,116 N Pottstown Pike,Exton,PA,19341,40.029962,-75.630607,2.5,20,0,"{'NoiseLevel': 'u'average'', 'HasTV': 'True', ...","Restaurants, Burgers",
4,62f293eb8838871c65b8721a,ljxNT9p0y7YMPx0fcNBGig,Tony's Restaurant & 3rd Street Cafe,312 Piasa St,Alton,IL,62002,38.896563,-90.186203,3.0,94,1,"{'RestaurantsReservations': 'True', 'Restauran...","Restaurants, Specialty Food, Steakhouses, Food...","{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:30',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,62f294138838871c65babd13,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,PA,19128,40.032483,-75.214430,3.0,55,1,"{'NoiseLevel': 'u'average'', 'RestaurantsTakeO...","Restaurants, Specialty Food, Food, Sandwiches,...","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'..."
15286,62f294138838871c65babd1a,sf_oQ62L8UEnOOLf00nNGA,Pizza Hut,5028 Old Hickory,Hermitage,TN,37076,36.193201,-86.614748,3.0,6,1,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...","Restaurants, Pizza, Fast Food, Chicken Wings, ...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
15287,62f294138838871c65babd26,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,11,1,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
15288,62f294138838871c65babd31,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,35,1,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


In [7]:
# collect all the attributes and categories
unique_categories = set()
unique_attributes = set()


for index, series in restaurants_df.iterrows():
    categories = series.categories
    attributes = series.attributes
    if categories:
        for category in [i for i in categories.split("\n")]:
            for item in category.split(","):
                unique_categories.add(item.strip())
    
    if attributes:
        for attribute, value in dict(attributes).items():
            obj = eval(value)
            if type(obj) == bool:
                unique_attributes.add(attribute)
            elif type(obj) == dict:
                for i,v in obj.items():
                    unique_attributes.add(f"{attribute}_{i}")
            else:
                if obj:
                    unique_attributes.add(f"{attribute}_{obj}")
                
    

In [8]:
# inspect unique attributes for all restaurants
unique_attributes

{'AcceptsInsurance',
 'AgesAllowed_21plus',
 'AgesAllowed_allages',
 'Alcohol_beer_and_wine',
 'Alcohol_full_bar',
 'Alcohol_none',
 'Ambience_casual',
 'Ambience_classy',
 'Ambience_divey',
 'Ambience_hipster',
 'Ambience_intimate',
 'Ambience_romantic',
 'Ambience_touristy',
 'Ambience_trendy',
 'Ambience_upscale',
 'BYOB',
 'BYOBCorkage_no',
 'BYOBCorkage_yes_corkage',
 'BYOBCorkage_yes_free',
 'BestNights_friday',
 'BestNights_monday',
 'BestNights_saturday',
 'BestNights_sunday',
 'BestNights_thursday',
 'BestNights_tuesday',
 'BestNights_wednesday',
 'BikeParking',
 'BusinessAcceptsBitcoin',
 'BusinessAcceptsCreditCards',
 'BusinessParking_garage',
 'BusinessParking_lot',
 'BusinessParking_street',
 'BusinessParking_valet',
 'BusinessParking_validated',
 'ByAppointmentOnly',
 'Caters',
 'CoatCheck',
 'Corkage',
 'DietaryRestrictions_dairy-free',
 'DietaryRestrictions_gluten-free',
 'DietaryRestrictions_halal',
 'DietaryRestrictions_kosher',
 'DietaryRestrictions_soy-free',
 'Diet

In [9]:
# inspect unique categories for all restaurants
unique_categories

{'Acai Bowls',
 'Active Life',
 'Adult',
 'Adult Entertainment',
 'Afghan',
 'African',
 'American (New)',
 'American (Traditional)',
 'Amusement Parks',
 'Apartments',
 'Appliances',
 'Arabic',
 'Arcades',
 'Argentine',
 'Armenian',
 'Art Classes',
 'Art Galleries',
 'Arts & Crafts',
 'Arts & Entertainment',
 'Asian Fusion',
 'Australian',
 'Austrian',
 'Auto Customization',
 'Auto Parts & Supplies',
 'Auto Repair',
 'Automotive',
 'Bagels',
 'Bakeries',
 'Bangladeshi',
 'Banks & Credit Unions',
 'Bar Crawl',
 'Barbeque',
 'Barbers',
 'Bars',
 'Bartenders',
 'Basque',
 'Beaches',
 'Beauty & Spas',
 'Bed & Breakfast',
 'Beer',
 'Beer Bar',
 'Beer Gardens',
 'Beer Tours',
 'Belgian',
 'Beverage Store',
 'Bike Rentals',
 'Bistros',
 'Books',
 'Bookstores',
 'Bowling',
 'Brasseries',
 'Brazilian',
 'Breakfast & Brunch',
 'Breweries',
 'Brewing Supplies',
 'Brewpubs',
 'British',
 'Bubble Tea',
 'Buffets',
 'Building Supplies',
 'Burgers',
 'Burmese',
 'Business Consulting',
 'Butcher',
 '

In [10]:
# create a dataframe with columns for each category
unique_categories_df = \
    pd.DataFrame({i:[0 for i in range(len(restaurants_df))] for i in unique_categories})

# create a dataframe with columns for each attribute
unique_attributes_df = \
    pd.DataFrame({i:[0 for i in range(len(restaurants_df))] for i in unique_attributes})

In [11]:
unique_categories_df


Unnamed: 0,Art Galleries,Auto Repair,Cosmetics & Beauty Supply,Wine Tours,Cafes,Pubs,Real Estate,Turkish,Party Equipment Rentals,Education,...,Fish & Chips,Hardware Stores,Fast Food,Elementary Schools,Shanghainese,Sports Clubs,Taiwanese,Adult,Beer Gardens,Arabic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
unique_attributes_df

Unnamed: 0,RestaurantsAttire_dressy,Ambience_casual,BestNights_saturday,BusinessParking_garage,DietaryRestrictions_vegetarian,BestNights_sunday,AcceptsInsurance,DietaryRestrictions_soy-free,RestaurantsTakeOut,BusinessAcceptsBitcoin,...,AgesAllowed_allages,DogsAllowed,Music_live,RestaurantsPriceRange2_2,GoodForKids,Ambience_romantic,BestNights_thursday,CoatCheck,BestNights_friday,Ambience_trendy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# merge our attributes and categories dataframes
attributes_and_categories_df = \
    pd.merge(unique_categories_df, unique_attributes_df, left_index=True, right_index=True)

In [14]:
# validate columns are the same count as all our unique attributes and categories
assert len(unique_attributes) + len(unique_categories) == len(attributes_and_categories_df.columns)

In [15]:
# inspect new dataframe
attributes_and_categories_df

Unnamed: 0,Art Galleries,Auto Repair,Cosmetics & Beauty Supply,Wine Tours,Cafes,Pubs,Real Estate,Turkish,Party Equipment Rentals,Education,...,AgesAllowed_allages,DogsAllowed,Music_live,RestaurantsPriceRange2_2,GoodForKids,Ambience_romantic,BestNights_thursday,CoatCheck,BestNights_friday,Ambience_trendy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# merge our categorical data with our original restaurants df
restaurant_with_categorical_df = \
    pd.merge(restaurants_df, attributes_and_categories_df, left_index=True, right_index=True)

In [17]:
# validate we did not lose any rows from our original restaurant dataframe
assert len(restaurants_df) == len(restaurant_with_categorical_df)

In [18]:
# inspect new dataframe
restaurant_with_categorical_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,...,AgesAllowed_allages,DogsAllowed,Music_live,RestaurantsPriceRange2_2,GoodForKids,Ambience_romantic,BestNights_thursday,CoatCheck,BestNights_friday,Ambience_trendy
0,62f293eb8838871c65b871f4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,...,0,0,0,0,0,0,0,0,0,0
1,62f293eb8838871c65b87208,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,...,0,0,0,0,0,0,0,0,0,0
2,62f293eb8838871c65b8720c,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,4.0,...,0,0,0,0,0,0,0,0,0,0
3,62f293eb8838871c65b87215,2xVsWBNFwZOxIOdd9Mwnww,Cheeseburger In Paradise,116 N Pottstown Pike,Exton,PA,19341,40.029962,-75.630607,2.5,...,0,0,0,0,0,0,0,0,0,0
4,62f293eb8838871c65b8721a,ljxNT9p0y7YMPx0fcNBGig,Tony's Restaurant & 3rd Street Cafe,312 Piasa St,Alton,IL,62002,38.896563,-90.186203,3.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,62f294138838871c65babd13,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,PA,19128,40.032483,-75.214430,3.0,...,0,0,0,0,0,0,0,0,0,0
15286,62f294138838871c65babd1a,sf_oQ62L8UEnOOLf00nNGA,Pizza Hut,5028 Old Hickory,Hermitage,TN,37076,36.193201,-86.614748,3.0,...,0,0,0,0,0,0,0,0,0,0
15287,62f294138838871c65babd26,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,...,0,0,0,0,0,0,0,0,0,0
15288,62f294138838871c65babd31,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# iterate over the original dataframe using it to mark 1's in the new dataframe if the category or
# attributes corresponds to true
for index, row in restaurants_df.iterrows():
    define_restaurants_by_attributes(restaurant_with_categorical_df, index, row.attributes)
    define_restaurants_by_category(restaurant_with_categorical_df, index, row.categories)

In [20]:
restaurant_with_categorical_df

Unnamed: 0,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,...,AgesAllowed_allages,DogsAllowed,Music_live,RestaurantsPriceRange2_2,GoodForKids,Ambience_romantic,BestNights_thursday,CoatCheck,BestNights_friday,Ambience_trendy
0,62f293eb8838871c65b871f4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,...,0,0,0,0,0,0,0,0,0,0
1,62f293eb8838871c65b87208,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,...,0,0,0,1,1,0,0,0,0,0
2,62f293eb8838871c65b8720c,tMkwHmWFUEXrC9ZduonpTg,The Green Pheasant,215 1st Ave S,Nashville,TN,37201,36.159886,-86.773197,4.0,...,0,0,0,0,0,0,0,0,0,0
3,62f293eb8838871c65b87215,2xVsWBNFwZOxIOdd9Mwnww,Cheeseburger In Paradise,116 N Pottstown Pike,Exton,PA,19341,40.029962,-75.630607,2.5,...,0,0,0,1,1,0,0,0,0,0
4,62f293eb8838871c65b8721a,ljxNT9p0y7YMPx0fcNBGig,Tony's Restaurant & 3rd Street Cafe,312 Piasa St,Alton,IL,62002,38.896563,-90.186203,3.0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,62f294138838871c65babd13,wVxXRFf10zTTAs11nr4xeA,PrimoHoagies,6024 Ridge Ave,Philadelphia,PA,19128,40.032483,-75.214430,3.0,...,0,0,0,1,1,0,0,0,0,0
15286,62f294138838871c65babd1a,sf_oQ62L8UEnOOLf00nNGA,Pizza Hut,5028 Old Hickory,Hermitage,TN,37076,36.193201,-86.614748,3.0,...,0,0,0,0,0,0,0,0,0,0
15287,62f294138838871c65babd26,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,19 N Bishop Ave,Clifton Heights,PA,19018,39.925656,-75.310344,3.0,...,0,0,0,0,0,0,0,0,0,0
15288,62f294138838871c65babd31,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,1108 S 9th St,Philadelphia,PA,19147,39.935982,-75.158665,4.5,...,0,0,0,1,1,0,0,0,0,0


In [21]:
# convert all star ratings to integers
restaurant_with_categorical_df["stars"] = restaurant_with_categorical_df["stars"].astype(int)

In [22]:
# drop the attributes and categories columns as we no longer need them
restaurant_with_categorical_df.drop(columns=["attributes", "categories"], inplace=True)

# rename columns with invalid characters in them
for column in restaurant_with_categorical_df.columns:
    if "&" in column or " " in column:
        restaurant_with_categorical_df.rename(columns={column:column.replace(" ","_").replace("&","and")}, inplace=True)

In [45]:
for col in restaurant_with_categorical_df.columns:
    series = restaurant_with_categorical_df[col]
    if series.dtype == int and series.sum() < 20:
        print(col)

Art_Galleries
Auto_Repair
Cosmetics_and_Beauty_Supply
Wine_Tours
Real_Estate
Party_Equipment_Rentals
Education
Security_Systems
Home_Organization
Appliances
Teppanyaki
Watches
Champagne_Bars
Country_Dance_Halls
Japanese_Curry
Barbers
Golf_Lessons
Financial_Services
Malaysian
Colombian
Venezuelan
Smokehouse
Delicatessen
Health_and_Medical
Cideries
Vitamins_and_Supplements
Contractors
Syrian
Car_Wash
Auto_Parts_and_Supplies
Department_Stores
Hair_Salons
Golf
Tours
Armenian
Restaurant_Supplies
Mass_Media
Landmarks_and_Historical_Buildings
Comedy_Clubs
Escape_Games
Irish_Pub
Butcher
Donairs
Pet_Services
Self_Storage
Festivals
Professional_Services
Perfume
Museums
Wedding_Chapels
Moroccan
Rest_Stops
Paint_and_Sip
Beer_Tours
Tennis
Wine_Tasting_Room
Wine_Tasting_Classes
Transportation
Lebanese
Tiki_Bars
Propane
Iberian
Imported_Food
Oriental
Day_Spas
Pool_Halls
Adult_Entertainment
Community_Service/Non-Profit
Men's_Clothing
Tuscan
Persian/Iranian
Fitness_and_Instruction
Themed_Cafes
Cheese_S

In [46]:
# inspect questionable categories might be unrelated to restaurants
questionable_categories = [
    "Pet_Adoption",
    "Newspapers_and_Magazines",
    "Souvenir_Shops",
    "Home_and_Garden",
    "Bowling",
    "Rafting/Kayaking",
    "Hardware_Stores",
    "Elementary_Schools",
    "Sports_Clubs",
    "Adult",
    "AcceptsInsurance",
    "International_Grocery",
    "Pet_Stores",
    "Social_Clubs",
    "Pet_Adoption"
    "Educational_Services",
    "Religious_Items",
    "Pet_Photography",
    "Junk_Removal_and_Hauling",
    "Weight_Loss_Centers",
    "Ethnic_Grocery",
    "Yoga",
    "Banks_and_Credit_Unions",
    "Recycling_Center",
    "Grilling_Equipment"
    "Wholesalers",
    "Auto_Customization",
    "Outdoor_Gear",
    "Countertop_Installation",
    "Home_Decor",
    "Art_Classes",
    "Photography_Stores_and_Services",
    "Cooking_Schools"
    "Nurseries_and_Gardening",
    "Movers",
    "Mags",
    "Gun/Rifle_Ranges",
    "Women's_Clothing",
    "Local_Services",
    "Kids_Activities",
    "Jewelry",
    "Massage",
    "Electronics",
    "Gyms",
    "Public_Services_and_Government",
    "Tobacco_Shops",
    "Home_Services",
    "Cooking_Classes",
    "Kitchen_and_Bath",
    "Building_Supplies",
    "Guns_and_Ammo",
    "Wedding_Planning",
    "Carpet_Cleaning",
    "Beauty_and_Spas",
    "Books",
    "Brewing_Supplies",
    "Preschools",
    "Fashion"
    "Skin_Care",
    "Furniture_Stores",
    "Flowers_and_Gifts",
    "Plumbing",
    "Ice_Delivery",
    "IT_Services_and_Computer_Repair",
    "Hunting_and_Fishing_Supplies",
    "Vape_Shops",
    "Tree_Services",
    "Nail_Salons",
    "Auto_Repair",
    "Cosmetics_and_Beauty_Supply",
    "Real_Estate",
    "Education",
    "Security_Systems",
    "Appliances",
    "Watches",
    "Barbers",
    "Golf_Lessons",
    "Financial_Services",
    "Health_and_Medical",
    "Contractors",
    "Car_Wash",
    "Auto_Parts_and_Supplies",
    "Hair_Salons",
    "Restaurant_Supplies",
    "Mass_Media",
    "Escape_Games",
    "Pet_Services",
    "Self_Storage",
    "Perfume",
    "Professional_Services",
    "Tennis",
    "Transportation",
    "Day_Spas",
    "Men's_Clothing",
    "Fitness_and_Instruction",
    "Mattresses",
    "Playgrounds",
    "Florists",
    "Mini_Golf",
    "Bookstores",
    "Business_Consulting",
    "Car_Dealers",
    "Drugstores",
    "Party_Supplies",
    "Special_Education",
    "Arts_and_Crafts"
    
]

In [None]:
# add cleaned collection to mongo
mongo_manager.utils.upload_dataframe("yelp_data", "cleaned_restaurant_dataset", restaurant_with_categorical_df)

In [None]:
# validate collection has been added
mongo_manager.utils.list_collections("yelp_data")

In [47]:
for col in restaurant_with_categorical_df.columns:
    series = restaurant_with_categorical_df[col]
    if series.dtype == int and series.sum() == 0:
        print(col)

AcceptsInsurance
Music_no_music
DietaryRestrictions_kosher
DietaryRestrictions_halal
