In [1]:
import pandas as pd
import numpy as np

In [2]:
# lines=True: the file is read as one JSON object per line rather than a single JSON document.
business = pd.read_json("yelp_academic_dataset_business.json", lines=True)

Sneak peek at the first 2 rows to see what we've got.

In [3]:
business.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."


Create 2 boolean arrays and combine them to finally get only the entries that are in Austin AND in Texas.

In [4]:
# Row mask: True where the business is located in Austin AND in the state of Texas.
# `&` is the element-wise logical AND for boolean Series; .to_numpy() keeps the mask a
# plain numpy bool array, as before.
austin_tx_bool = ((business.city == 'Austin') & (business.state == 'TX')).to_numpy()
# .copy() makes the subset an independent frame, so later column assignments on
# austin_tx never act on (or warn about) a slice of `business`.
austin_tx = business[austin_tx_bool].copy()

In [5]:
# austin_tx.to_json('austin_tx.json')

Drop unimportant columns.

In [6]:
# Every remaining row is Austin/TX, so city and state carry no information any more;
# business_id is not used either. 'address' is kept on purpose: later cells clean it
# and use it for geocoding.
austin_tx = austin_tx.drop(columns=['city', 'state', 'business_id'])

We have a lot of nan values, let's see how many.

In [7]:
austin_tx.isna().sum()

name               0
address            0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      2269
categories        17
hours           3477
dtype: int64

Let's drop the nans.

In [8]:
# Remember the row count so the number of removed rows can be reported afterwards.
rows_before = austin_tx.shape[0]
# dropna() with its defaults removes every row that has a missing value in any column.
# The result is rebound instead of using inplace=True, so the frame is never mutated
# behind the back of anything that still references it.
austin_tx = austin_tx.dropna()
dropped = rows_before - austin_tx.shape[0]
print("Dropped:", dropped, "rows with nan values.")  # 5070 in the recorded run

Dropped: 5070 rows with nan values.


In [9]:
# austin_tx.to_json('austin_tx_nona.json')

Let's peek at the first 2 rows to see what we've got.

In [10]:
austin_tx.head(2)

Unnamed: 0,name,address,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
8,Lane Wells Jewelry Repair,"7801 N Lamar Blvd, Ste A140",78752,30.346169,-97.711458,5.0,30,1,"{'RestaurantsPriceRange2': '1', 'ByAppointment...","Shopping, Jewelry Repair, Appraisal Services, ...","{'Monday': '12:15-17:0', 'Tuesday': '12:15-17:..."
9,Capital City Barber Shop,"615 W Slaughter Ln, Ste 113",78748,30.172706,-97.79992,4.0,5,0,"{'BusinessAcceptsCreditCards': 'False', 'Resta...","Barbers, Beauty & Spas","{'Monday': '9:0-17:0', 'Tuesday': '9:0-19:0', ..."


Add a column with the distance from the city centre, measured in arc minutes of latitude/longitude (degrees × 60). It is not a true ground distance, but that doesn't really matter - what matters is that we can now compare distances from the centre.

In [11]:
lat_lon_df = austin_tx.loc[:, 'latitude':'longitude']
number_of_rows = lat_lon_df.shape[0]

# Reference point for the "centre" (named after the capitol in the original code),
# converted from degrees to arc minutes. It is loop-invariant, so it is built once.
capitol_lat_lon = np.array([30.274773446583634, -97.74038126660496]) * 60

# All (latitude, longitude) pairs in arc minutes, shape (number_of_rows, 2).
lat_lon_minutes = lat_lon_df.to_numpy() * 60

# Row-wise Euclidean norm of the offset from the reference point. This replaces the
# per-row Python loop and its positional Series[0] / Series[1] lookups.
# Note: this is a plain Euclidean distance in latitude/longitude minutes, good for
# comparing rows with each other, not a ground distance.
gps_based_distance = np.linalg.norm(lat_lon_minutes - capitol_lat_lon, axis=1)

austin_tx['dist_from_center'] = gps_based_distance

Clean the address column: keep only the part before the first comma and drop the extra information that follows it (e.g. a suite number).

In [12]:
def fix_address(address):
    """Return the part of `address` that precedes the first ', ' separator.

    An address without ', ' is returned unchanged.
    """
    street, _separator, _remainder = address.partition(', ')
    return street

In [13]:
# Run fix_address on every address value and store the cleaned strings back in place.
austin_tx['address'] = austin_tx['address'].map(fix_address)

In [14]:
austin_tx

Unnamed: 0,name,address,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,dist_from_center
8,Lane Wells Jewelry Repair,7801 N Lamar Blvd,78752,30.346169,-97.711458,5.0,30,1,"{'RestaurantsPriceRange2': '1', 'ByAppointment...","Shopping, Jewelry Repair, Appraisal Services, ...","{'Monday': '12:15-17:0', 'Tuesday': '12:15-17:...",4.621901
9,Capital City Barber Shop,615 W Slaughter Ln,78748,30.172706,-97.799920,4.0,5,0,"{'BusinessAcceptsCreditCards': 'False', 'Resta...","Barbers, Beauty & Spas","{'Monday': '9:0-17:0', 'Tuesday': '9:0-19:0', ...",7.089800
24,DoubleTree by Hilton Hotel Austin,6505 N Interstate 35,78752,30.326377,-97.704543,3.0,139,1,"{'WiFi': 'u'free'', 'RestaurantsPriceRange2': ...","Hotels, Hotels & Travel, Event Planning & Serv...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",3.769626
27,PS Property Management Company,2506 S Lamar Blvd,78704,30.246465,-97.778738,4.5,9,1,{'BusinessAcceptsCreditCards': 'True'},"Home Services, Real Estate, Property Management","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ...",2.860291
37,McKinley Chiropractic,5625 Eiger Rd,78735,30.244902,-97.857409,5.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Accept...","Chiropractors, Health & Medical","{'Monday': '9:0-17:45', 'Tuesday': '12:0-17:45...",7.246796
...,...,...,...,...,...,...,...,...,...,...,...,...
160560,Starbucks,9300 S I-35 Service Rd S Bound,78748,30.165573,-97.788109,3.0,78,1,"{'BikeParking': 'True', 'BusinessParking': '{'...","Food, Coffee & Tea","{'Monday': '0:0-0:0', 'Tuesday': '5:0-21:0', '...",7.150498
160564,AcuStretch,13018 N Hwy 183,78750,30.437784,-97.777968,4.0,7,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Massage, Beauty & Spas","{'Monday': '10:0-21:0', 'Tuesday': '10:0-21:0'...",10.037267
160566,Salute Oral & Facial Surgery,10801 N Mo Pac Expy,78759,30.398422,-97.730128,3.5,13,1,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Health & Medical, General Dentistry, Dentists","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",7.444376
160567,Austin Stained Concrete and More,5710 Fitchwood Ln,78749,30.210815,-97.869660,5.0,19,1,{'BusinessAcceptsCreditCards': 'True'},"Masonry/Concrete, Home Services, Flooring","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",8.654087


Work in progress - a function to get the distances from a given location to all the given addresses.

In [20]:
from geopy.geocoders import Nominatim
from geopy.distance import distance

def get_distances(dataframe, addresses_column_name, given_location, postifx="", inplace=False, row_range=(110, 112)):  # TODO: add mechanics for inplace=True
    """
    Geocode addresses with Nominatim and measure their distance from a given location.

    dataframe - pandas.DataFrame object
    addresses_column_name - name of the column in which we can find addresses
    given_location - tuple with latitude and longitude of a location from which we're counting the distance

    postifx - text appended to every address before geocoding to make sure it resolves to
              what we need, for example a city/country. (The misspelled parameter name is
              kept because existing calls pass it as a keyword.)
    inplace - currently ignored (see TODO above)
    row_range - (start, stop) positional window of rows to geocode, stop exclusive; pass None
                to geocode every row. The default is still the small testing window.
                TODO: make None the default once geocoding the whole frame is acceptable.
                The window is clipped to the number of rows in the dataframe.

    returned value - pd.Series named "distance" with distances in kilometers, one entry per
    row of dataframe in positional order (the index is 0..n-1, NOT dataframe.index).
    Rows outside row_range keep the initial value 0.0; rows inside it that could not be
    geocoded are set to -1.0.
    """
    geolocator = Nominatim(user_agent="Strive School YMVC Project")

    number_of_rows = dataframe.shape[0]
    addresses_df = dataframe[addresses_column_name]

    if row_range is None:
        start, stop = 0, number_of_rows
    else:
        start, stop = row_range
        # Clip the window so that a frame shorter than the window cannot trigger an
        # IndexError when the result array is written.
        start = max(0, start)
        stop = min(stop, number_of_rows)

    gps_addresses = np.zeros(shape=number_of_rows)

    for row_nr in range(start, stop):
        dist = -1.0  # sentinel kept when the address cannot be resolved
        try:
            query = addresses_df.iloc[row_nr] + postifx
            loc = geolocator.geocode(query)
            if loc is None:
                # geocode() returns None when nothing matches the query.
                print("No geocoding result for:", query)
            else:
                dist = distance(given_location, (loc.latitude, loc.longitude)).kilometers
        except Exception as e:
            # Best effort: a failing lookup must not abort the whole run.
            print(e)
        gps_addresses[row_nr] = dist

    return pd.Series(gps_addresses, name="distance")

In [21]:
# Same coordinates as the reference point used for dist_from_center above.
from_here = (30.274773446583634, -97.74038126660496)
# Only the rows inside get_distances' testing window are geocoded for now.
x = get_distances(austin_tx, 'address', from_here, postifx=" Austin Texas")

In [23]:
x[110:112]

110     1.789218
111    16.528674
Name: distance, dtype: float64

Useful function to help with selecting only rows which contain a chosen word in a given column.

In [18]:
def contains(column, word):
    """
    Build a boolean mask telling which cells of a column contain a given word.

    Usage:
    Pass a column and a word you want to find in the cells, to get a np.array of type bool.
    Afterwards you can pass it as an argument of the [] selection mechanism. Examples:

    austin_tx[ contains(austin_tx.categories, "Shopping") ]
    austin_tx[ contains(austin_tx.categories, "Shopping") | contains(austin_tx.categories, "Hotels") ]      # or
    austin_tx[ contains(austin_tx.categories, "Shopping") & contains(austin_tx.categories, "Watches") ]     # and

    Every cell is tested with Python's `in` operator, so for string cells this is a
    substring match ("Hotels" also matches "Hotels & Travel"), and for dict cells it is a
    key lookup. Missing cells (None / NaN) never match instead of raising a TypeError.

    column - iterable of cells (e.g. a pandas Series)
    word - value to look for in every cell

    returned value - np.array of dtype bool, one entry per cell (also for an empty column)
    """
    def _is_missing(cell):
        # NaN is the only float that is not equal to itself.
        return cell is None or (isinstance(cell, float) and cell != cell)

    # dtype=bool keeps the result usable as a mask even when the column is empty.
    return np.array(
        [False if _is_missing(cell) else (word in cell) for cell in column],
        dtype=bool,
    )

Check for ALL existing categories. (Additionally, it counts how many times each one appears.)

In [19]:
# Tally how often each category name occurs across the distinct category strings.
# NOTE(review): .unique() collapses businesses that share an identical category string,
# so these are not per-business counts - confirm that this is intended.
categories_counts = {}
for category_list in austin_tx.categories.unique():
    # Turn ", &" into " &" first so such a name is not split into two pieces below.
    normalized = category_list.replace(', &', ' &')
    for category in normalized.split(', '):
        categories_counts[category] = categories_counts.get(category, 0) + 1