In [None]:
from collections import Counter

import numpy as np
import pandas as pd

In [None]:
# Load the business table; lines=True makes pandas read the file as one JSON object per line.
business = pd.read_json("yelp_academic_dataset_business.json", lines=True)

Sneak peek at the first 2 rows to see what we've got.

In [None]:
business.head(2)

Create 2 boolean arrays and combine them to keep only the entries that are in Austin AND in Texas.

In [None]:
# Row mask that is True only where the city is 'Austin' and the state is 'TX'.
austin_tx_bool = ((business['city'] == 'Austin') & (business['state'] == 'TX')).to_numpy()
austin_tx = business[austin_tx_bool]

In [None]:
# austin_tx.to_json('austin_tx.json')

Drop unimportant columns.

In [None]:
# 'address' is deliberately kept: it is cleaned up and geocoded further down.
austin_tx = austin_tx.drop(columns=['city', 'state', 'business_id'])

We have a lot of NaN values; let's see how many there are per column.

In [None]:
austin_tx.isna().sum()

Let's drop the nans.

In [None]:
# Compare the row count before and after dropping to report how many rows were removed.
rows_before = len(austin_tx)
austin_tx.dropna(inplace=True)
dropped = rows_before - len(austin_tx)
print("Dropped:", dropped, "rows with nan values.")  # 5070 - presumably the count from the author's run

In [None]:
# austin_tx.to_json('austin_tx_nona.json')

Let's peek at the first 2 rows to see what we've got.

In [None]:
austin_tx.head(2)

## Function to add distances from given place to the dataframe.

Clean the address column of stray extra information. This `fix_address()` is tailored to the austin_tx dataset; a new one may be required for Berlin.

In [None]:
def fix_address(address):
    """Return the part of ``address`` that comes before the first ``', '``.

    An address without ``', '`` is returned unchanged.
    """
    street, _separator, _remainder = address.partition(', ')
    return street
# Clean every address in place, element by element.
austin_tx['address'] = austin_tx['address'].map(fix_address)

In [None]:
austin_tx.head(2)

### Actual function to use in our work.

In [None]:
# conda install -c conda-forge geopy
from geopy.geocoders import Nominatim
from geopy.distance import distance

# Proceed with caution: every row triggers one request to the geocoding web service.
# NOTE(review): the public Nominatim service has a usage policy that limits the request
# rate - check it before running this on the full dataframe.
def distance_from(dataframe, addresses_column_name, given_location, postifx="", inplace=False):
    """
    Geocode every address of a column and measure its distance from a fixed point.

    dataframe - pandas.DataFrame object
    addresses_column_name - name of the column in which we can find addresses
    given_location - tuple with latitude and longitude of the location the distances are measured from

    postifx - (sic - the spelling is kept so existing keyword calls keep working) text appended
              to every address before geocoding to make sure it's what we need, example:
              city/country. Nothing is inserted between address and postifx, so start it
              with a space.
    inplace - if True, write the distances into the given dataframe as a 'distance_km' column
              placed right before the address column and return None. A 'distance_km' column
              left over from an earlier call is replaced. default False

    returned value - when inplace is False: pd.Series named 'distance_km' with the distances
                     in kilometers, sharing the index of `dataframe` so it stays aligned with
                     its rows. Rows whose address cannot be combined with postifx, is not
                     found, or whose lookup raises are NaN; such failures are printed, not
                     raised.
    """
    column_name = 'distance_km'
    geolocator = Nominatim(user_agent="Strive School YMVC Project")

    addresses = dataframe[addresses_column_name]
    # NaN marks every row that could not be resolved.
    distances_km = np.full(len(addresses), np.nan)

    # this loop can take a looooong time: one lookup per row.
    for row_nr, address in enumerate(addresses):
        try:
            loc = geolocator.geocode(address + postifx)
            if loc is None:
                # No match for this query: report it explicitly and leave the row as NaN.
                print("No geocoding result for", repr(address))
                continue
            distances_km[row_nr] = distance(given_location, (loc.latitude, loc.longitude)).kilometers
        except Exception as e:
            # Best effort: one bad row must not abort the whole (slow) run.
            print("Could not geocode", repr(address), "-", e)

    if inplace:
        # Drop the result of a previous call so re-running a cell does not raise.
        if column_name != addresses_column_name and column_name in dataframe.columns:
            del dataframe[column_name]
        # we put the new column before the "address" column
        dataframe.insert(dataframe.columns.get_loc(addresses_column_name), column_name, distances_km)
        return None

    # Reuse the frame's index: a default RangeIndex would misalign on a filtered dataframe.
    return pd.Series(distances_km, index=dataframe.index, name=column_name)

Example of use with inplace=False

In [None]:
# temporary two-row dataframe, so the demo needs only two geocoding lookups
head_2 = austin_tx.head(2)

# reference point as a (latitude, longitude) tuple
some_place = (30.274773446583634, -97.74038126660496)

# postifx is appended to each address before it is looked up; inplace=False returns the distances
pd_series_distance = distance_from(head_2, 'address', some_place, postifx=" Austin Texas", inplace=False)


In [None]:
# distances stored outside the given dataframe
pd_series_distance

In [None]:
# dataframe unchanged
head_2

Example use with inplace=True

In [None]:
distance_from(head_2, 'address', some_place, postifx=" Austin Texas", inplace=True)
# returned value: None

In [None]:
# head_2 has new column: distance_km
head_2

## Function for indexing dataframes based on strings/string parts that are contained in given column.
(This function is just a mechanism we can use for boolean-indexing the dataframe since it returns boolean array.)

In [None]:
def contains(column, word):
    """
    Build a boolean mask telling which cells of `column` contain `word`.

    Usage:
    Pass a column and a word you want to find in the cells, to get a np.array of type bool.
    Afterwards you can pass it as an argument of the [] selection mechanism. Examples:

    austin_tx[ contains(austin_tx.categories, "Shopping") ]
    austin_tx[ contains(austin_tx.categories, "Shopping") | contains(austin_tx.categories, "Hotels") ]      # or
    austin_tx[ contains(austin_tx.categories, "Shopping") & contains(austin_tx.categories, "Watches") ]     # and

    The check is Python's `in`, so string cells are matched by substring and container
    cells (e.g. dicts) by membership. Cells that do not support `in` at all (None, NaN,
    numbers) count as "does not contain" instead of raising.
    The result always has dtype bool, even for an empty column, so it is always usable as a mask.
    """
    def cell_contains(cell):
        try:
            return word in cell
        except TypeError:
            # Missing values and other non-containers cannot contain anything.
            return False

    return np.array([cell_contains(cell) for cell in column], dtype=bool)

Example: get all Shops that do Watches. :)

In [None]:
# Rows whose categories mention both "Shopping" and "Watches"; show the first two.
austin_tx[
    contains(austin_tx.categories, "Shopping")
    & contains(austin_tx.categories, "Watches")
].head(2)

## Function that returns all categories and how many entries there are of each.

Clean the categories column of stray formatting. This `fix_category()` is tailored to the austin_tx dataset; a new one may be required for Berlin.

In [None]:
def fix_category(category):
    """Return ``category`` with every ``", &"`` turned into ``" &"``."""
    pieces = category.split(", &")
    return " &".join(pieces)
# Normalise every categories string in place, element by element.
austin_tx['categories'] = austin_tx['categories'].map(fix_category)

### Actual function

In [None]:
def get_categories_dict(category_column):
    """
    Count how many times each category occurs in a column of category strings.

    category_column - iterable of strings in which the categories of one entry are
                      separated by ", "

    Cells that are not strings (e.g. None / NaN for a missing value) are skipped
    instead of raising.

    returned value - plain dict {category name: number of occurrences}, keys in order
                     of first appearance
    """
    categories_counts = Counter()
    for things in category_column:
        if not isinstance(things, str):
            # Missing value: there is nothing to split or count.
            continue
        categories_counts.update(things.split(", "))
    # Hand back a plain dict so the return type and its repr stay what callers already see.
    return dict(categories_counts)

Example usage:

In [None]:
categories_counts = get_categories_dict(austin_tx['categories'])

The loop below just shows the first 3 entries.

In [None]:
# print the first three entries of "categories_counts" as a quick sample
for key in list(categories_counts)[:3]:
    print("Category name:", key, "\nNumber of entries with this type:", categories_counts[key], "\n")

In [None]:
# uncomment the line below and run this cell if you wanna see EVERYTHING
# categories_counts