In [68]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [2]:
df = pd.read_json("dataset/yelp_academic_dataset_business.json", encoding='utf-8', lines=True)

In [3]:
df.isnull().sum()

business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64

### Cleaning

In [4]:
# df.dropna(inplace=True)

In [5]:
usa_states = ('AL', 'KY', 'OH', 'AK', 'LA', 'OK', 'AZ', 'ME', 'OR', \
              'AR', 'MD', 'PA', 'AS', 'MA', 'PR', 'CA', 'MI', 'RI', \
              'CO', 'MN', 'SC', 'CT', 'MS', 'SD', 'DE', 'MO', 'TN', \
              'DC', 'MT', 'TX', 'FL', 'NE', 'TT', 'GA', 'NV', 'UT', \
              'GU', 'NH', 'VT', 'HI', 'NJ', 'VA', 'ID', 'NM', 'VI', \
              'IL', 'NY', 'WA', 'IN', 'NC', 'WV', 'IA', 'ND', 'WI', \
              'KS', 'MP', 'WY')

len(usa_states)

57

In [6]:
# filter businesses Open only in USA.
usa_filter = (df['state'].isin(usa_states))
df = df[df['categories'].notnull()]
is_open = (df['is_open'] == 1)
df = df[is_open & usa_filter]

# make all categories lower case
df.loc[:, 'categories'] = df['categories'].str.lower()

# get "restaurants"
df = df[df['categories'].str.contains('restaurants')]

print(df.shape)


(33252, 14)


  df = df[is_open & usa_filter]


Postal code

In [7]:
# assigning placeholder postal codes 
df.loc[df['postal_code'].apply(lambda x: len(x)<5), 'postal_code'] = 99999

In [8]:
# manually filling in one missing postal code value
# df.loc[df['postal_code'] == '', 'postal_code'] = 33701
# Convert to suitable dtype 
df['postal_code'] = df['postal_code'].astype('int')

Hours

In [9]:
default_hours = {'Monday': None,
                 'Tuesday': None,
                 'Wednesday': None,
                 'Thursday': None,
                 'Friday': None,
                 'Saturday': None,
                 'Sunday': None}

def impute_hours(row):
    if pd.isnull(row):
        return default_hours
    else:
        return row
    
df.loc[:, 'hours'] = df['hours'].apply(impute_hours)

In [10]:
days_of_week = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
for day in days_of_week:
    df.loc[:, f'{day}'] = df['hours'].apply(lambda x: x.get(f'{day}', None))

In [11]:
df[df['Monday'].notnull()]['Monday']

3          7:0-20:0
5           0:0-0:0
9           0:0-0:0
11        11:0-14:0
12         6:0-22:0
            ...    
150323      0:0-0:0
150325      0:0-0:0
150327      0:0-0:0
150336    11:0-22:0
150339      0:0-0:0
Name: Monday, Length: 26661, dtype: object

In [12]:
int(df[df['Monday'].notnull()]['Monday'].apply(lambda x: int(x.split('-')[0].split(':')[0])).mean())

7

In [13]:
for day in days_of_week:
    not_null = df[day].notnull()
    mean_opening = df[not_null][day].apply(lambda x: int(x.split('-')[0].split(':')[0])).mean()
    mean_closing = df[not_null][day].apply(lambda x: int(x.split('-')[1].split(':')[1])).mean()
    df.loc[df[day].isnull(), day] = f'{int(mean_opening)}:00 - {int(mean_closing)}:00'


Attributes

In [14]:
df.isnull().sum()

business_id       0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars             0
review_count      0
is_open           0
attributes      397
categories        0
hours             0
Monday            0
Tuesday           0
Wednesday         0
Thursday          0
Friday            0
Saturday          0
Sunday            0
dtype: int64

In [15]:
# creating a set of all attributes.
df = df[df['attributes'].notnull()]
all_attributes = set()
df['attributes'].apply(lambda x: all_attributes.update(x.keys()))

3         None
5         None
9         None
11        None
12        None
          ... 
150323    None
150325    None
150327    None
150336    None
150339    None
Name: attributes, Length: 32855, dtype: object

In [16]:
# Create separate columns for each attribute.
for attribute in all_attributes:
    df.loc[:, f'attributes.{attribute}'] = None

In [17]:
# map each attribute with its corresponding value from the 'attribute' column.
for attribute in all_attributes:
    df.loc[:, f'attributes.{attribute}'] = df['attributes'].apply(lambda x: x.get(f'{attribute}', None))

In [18]:
# Defining all binary attributes
binary_attributes = ['attributes.GoodForKids', 'attributes.RestaurantsGoodForGroups', 'attributes.BikeParking', 'attributes.RestaurantsReservations',
                    'attributes.HasTV', 'attributes.Caters', 'attributes.OutdoorSeating', 'attributes.WheelchairAccessible', 'attributes.RestaurantsDelivery',
                    'attributes.RestaurantsTakeOut', 'attributes.BusinessAcceptsCreditCards']

# Fill all missing values in binary columns with a False
df.loc[:, binary_attributes] = df[binary_attributes].fillna(False)

# correcting existing values in the dataframe
df.loc[:, binary_attributes] = df[binary_attributes].replace('None', False)
df.loc[:, binary_attributes] = df[binary_attributes].replace('True', True)
df.loc[:, binary_attributes] = df[binary_attributes].replace('False', False)

In [19]:
# Dropping redundant columns
drop_columns = ['attributes.RestaurantsAttire', 'attributes.CoatCheck', 'attributes.ByAppointmentOnly', 
                'attributes.DogsAllowed', 'attributes.GoodForMeal', 'attributes.DriveThru', 'attributes.HappyHour',
                'attributes.BusinessAcceptsBitcoin', 'attributes.RestaurantsTableService', 'attributes.Music', 
                'attributes.BestNights', 'attributes.Smoking', 'attributes.GoodForDancing', 'attributes.Corkage', 
                'attributes.BYOB', 'attributes.AgesAllowed', 'attributes.BYOBCorkage', 'attributes.DietaryRestrictions',
                'attributes.AcceptsInsurance', 'attributes.Open24Hours', 'attributes.RestaurantsCounterService', 
                'attributes.HairSpecializesIn', 'hours', 'attributes', 'categories', 'is_open']

df.drop(drop_columns, axis=1, inplace=True)

In [20]:
df.isnull().sum()

business_id                                  0
name                                         0
address                                      0
city                                         0
state                                        0
postal_code                                  0
latitude                                     0
longitude                                    0
stars                                        0
review_count                                 0
Monday                                       0
Tuesday                                      0
Wednesday                                    0
Thursday                                     0
Friday                                       0
Saturday                                     0
Sunday                                       0
attributes.WiFi                           8405
attributes.RestaurantsTakeOut                0
attributes.Alcohol                        8267
attributes.Caters                            0
attributes.Bu

NoiseLevel

In [21]:
noise_levels = ['quiet', 'average', 'loud', 'very_loud']

In [22]:
# Define a function to clean the values
def clean_category(value):
    if value:
        value = value.strip("'u")
        if value == "None":
            return None
        else:
            return value
    return value

# Apply the cleaning function to the 'column_name' column
df['attributes.NoiseLevel'] = df['attributes.NoiseLevel'].apply(clean_category)

# Filling in missing values with random noise levels
null_mask = df['attributes.NoiseLevel'].isnull()
df.loc[null_mask, 'attributes.NoiseLevel'] = df['attributes.NoiseLevel'].apply(lambda x: np.random.choice(noise_levels))


Alcohol

In [23]:
# df.loc[:, 'attributes.Alcohol'] = df['attributes.Alcohol'].fillna('none')
df.loc[:, 'attributes.Alcohol'] = df['attributes.Alcohol'].apply(clean_category)
# applying "clean_category" introduces some null values
df.loc[:, 'attributes.Alcohol'] = df['attributes.Alcohol'].fillna('no')

RestaurantPriceRange

In [24]:
df.loc[:, 'attributes.RestaurantsPriceRange2'] = df['attributes.RestaurantsPriceRange2'].apply(clean_category)

In [25]:
# filling in missing values with random ratings.
min_value = 1
max_value = 4

mask = df['attributes.RestaurantsPriceRange2'].isnull()
df.loc[mask, 'attributes.RestaurantsPriceRange2'] = df['attributes.RestaurantsPriceRange2'].apply(lambda x: np.random.randint(min_value, max_value+1))

# Fix the dtype to 'int'
df.loc[:, 'attributes.RestaurantsPriceRange2'] = df['attributes.RestaurantsPriceRange2'].astype('int')

WiFi

In [26]:
df['attributes.WiFi'].isnull().sum()

8405

In [27]:
df.loc[:, 'attributes.WiFi'] = df['attributes.WiFi'].apply(clean_category)

df.loc[:, 'attributes.WiFi'] = df['attributes.WiFi'].fillna('no')

In [28]:
assert not df['attributes.WiFi'].isnull().any(), "AssertionError: Null values found in the WiFi column."


Ambience

In [29]:
df.loc[:, 'attributes.Ambience'] = df['attributes.Ambience'].apply(clean_category)

In [30]:
# Define a default state for this column
default_ambience_string = "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}"
# Imputing missing values
df['attributes.Ambience'].fillna(default_ambience_string, inplace=True)

In [31]:
def parse_json_string(json_string, default):
    json_string = eval(json_string)
    for k,v in json_string.items():
        if v == True:
            return k
    return default

# Fixing the values in the column
df['attributes.Ambience'] = df['attributes.Ambience'].apply(lambda x: parse_json_string(x, default='absent'))


In [32]:
assert not df['attributes.Ambience'].isnull().any(), "AssertionError: Null values found in the Ambience column."


BikeParking

In [33]:
df['attributes.BikeParking'].value_counts()

attributes.BikeParking
True     17531
False    15324
Name: count, dtype: int64

In [34]:
assert not df['attributes.BikeParking'].isnull().any(), "AssertionError: Null values found in the BikeParking column."


BusinessAcceptsCreditCards

In [35]:
assert not df['attributes.BusinessAcceptsCreditCards'].isnull().any(), "AssertionError: Null values found in the BusinessAcceptsCreditCards column."


OutdoorSeating

In [36]:
assert not df['attributes.OutdoorSeating'].isnull().any(), "AssertionError: Null values found in the OutdoorSeating column."


BusinessParking

In [37]:
df.loc[:, 'attributes.BusinessParking'] = df['attributes.BusinessParking'].apply(clean_category)

# No Parking dict map
noparking_string = "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"

# Filling the missing values with 'noparking'
df.loc[:, 'attributes.BusinessParking'] = df['attributes.BusinessParking'].fillna(noparking_string)

df['attributes.BusinessParking'] = df['attributes.BusinessParking'].apply(lambda x: parse_json_string(x, default='noparking'))


In [38]:
assert not df['attributes.BusinessParking'].isnull().any(), "AssertionError: Null values found in the BusinessParking column."


RestaurantsGoodForGroups

In [39]:
assert not df['attributes.RestaurantsGoodForGroups'].isnull().any(), "AssertionError: Null values found in the RestaurantsGoodForGroups column."

HasTV

In [40]:
assert not df['attributes.HasTV'].isnull().any(), "AssertionError: Null values found in the HasTV column."

RestaurantsReservations

In [41]:
assert not df['attributes.RestaurantsReservations'].isnull().any(), "AssertionError: Null values found in the RestaurantsReservations column."

In [42]:
assert not df.isnull().any().any(), "AssertionError: Null values found in the final cleaned dataframe."


In [57]:
df.to_csv('cleaned_dataset/business.csv', index=False)

In [72]:
engine = create_engine('postgresql://root:root@localhost:5432/yelp_db')

In [74]:
df.head(n=0).to_sql(name='business', con=engine, if_exists='replace')

0

In [76]:
%time df.to_sql(name='business', con=engine, if_exists='append')

CPU times: total: 5.31 s
Wall time: 8.14 s


165

In [88]:
import psycopg2

# Create a connection to the database
conn = psycopg2.connect(host='localhost', database='yelp_db', user='root', password='root')

# Create a cursor
cursor = conn.cursor()

# Execute an SQL statement
cursor.execute('SELECT count(*) FROM business')

# Fetch the results
results = cursor.fetchall()

dataframe_shape = df.shape
# print(dataframe_shape)

# Print the results
assert results[0][0] == dataframe_shape[0], "AssertionError: Did not write all records to DB"


# Close the cursor
cursor.close()

# Close the connection
conn.close()