# Yelp Project

This notebook contains some code to help you get started with the final CI project. The code below loads the data for the cities found in the `data` folder.

Load the required packages below

In [None]:
import pandas as pd
import os
import json

The functions below help with loading the data.

In [None]:
# Root folder of the dataset; every sub-directory is treated as one city.
DATA_DIR = "data"


def list_cities():
    """
    Finds all cities (all directory names) in ./data
    Returns a sorted list of city names

    Plain files and hidden entries (names starting with ".", for example
    .DS_Store or .ipynb_checkpoints) are skipped, so every returned name is
    a real sub-directory of DATA_DIR.
    Raises FileNotFoundError if DATA_DIR itself does not exist.
    """
    return sorted(
        entry
        for entry in os.listdir(DATA_DIR)
        # os.listdir also yields files and tool-generated hidden folders,
        # which are not cities and would break the loaders later on.
        if not entry.startswith(".")
        and os.path.isdir(os.path.join(DATA_DIR, entry))
    )


def load_city(city, data_filename):
    """
    Given city name extract all data from ./data/<city>/<data_filename>.json
    Returns a pandas DataFrame

    The file is read line by line, with one JSON document expected per
    line. Lines containing only whitespace are skipped.
    Raises FileNotFoundError if the file does not exist and
    json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    path = f"{DATA_DIR}/{city}/{data_filename}.json"

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    with open(path, "r", encoding="utf-8") as f:
        city_data = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(city_data)

def load_cities(cities, data_filename):
    """
    Given a list of city names,
        for each city extract all data from ./data/<city>/<data_filename>.json
    Returns a pandas DataFrame

    The per-city frames are stacked, duplicate rows are removed and the
    result always has a fresh 0..n-1 index. An empty list of cities gives
    an empty DataFrame.
    """
    dfs = [load_city(city, data_filename) for city in cities]
    if not dfs:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()

    # combine data; renumber rows so the index is unique on every code path
    combined = pd.concat(dfs, ignore_index=True)

    # if an entry occurs in multiple cities, there could be duplicates. try to remove
    try:
        cleaned = combined.drop_duplicates(ignore_index=True)
    except TypeError:
        # Some cells hold unhashable values (e.g. dict or list), which
        # drop_duplicates cannot hash. Compare rows on their string
        # rendering instead and keep the first occurrence of each.
        as_text = combined.astype(str)
        cleaned = combined[~as_text.duplicated()].reset_index(drop=True)

    return cleaned

This is how you can load all the data from a single city:

In [None]:
# City to load; load_city reads from the folder with this name inside DATA_DIR.
CITY = 'sun city'

# One DataFrame per data file of that city.
USERS = load_city(city=CITY, data_filename="user")
BUSINESSES = load_city(city=CITY, data_filename="business")
REVIEWS = load_city(city=CITY, data_filename="review")
TIPS = load_city(city=CITY, data_filename="tip")
CHECKINS = load_city(city=CITY, data_filename="checkin")

# Render the reviews table in the notebook output.
display(REVIEWS)

This is how you can load all the data from multiple cities at once:

In [None]:
# Every city available in the data folder.
CITIES = list_cities()
print(CITIES)

# One combined DataFrame per data file, covering all listed cities.
USERS = load_cities(cities=CITIES, data_filename="user")
BUSINESSES = load_cities(cities=CITIES, data_filename="business")
REVIEWS = load_cities(cities=CITIES, data_filename="review")
TIPS = load_cities(cities=CITIES, data_filename="tip")
CHECKINS = load_cities(cities=CITIES, data_filename="checkin")

# Render the combined reviews table in the notebook output.
display(REVIEWS)