In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
base_path = '../data'
business_path = os.path.join(base_path, 'yelp_academic_dataset_business.json')
review_path = os.path.join(base_path, 'yelp_academic_dataset_review.json')
user_path = os.path.join(base_path, 'yelp_academic_dataset_user.json')
photos_path = os.path.join(base_path, 'filtered_photos.json')

In [10]:
users_df = pd.read_json(user_path, lines=True)


In [46]:
business_df = pd.read_json(business_path, lines=True)
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
2,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
3,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
4,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,FL,33707,27.76659,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."


In [45]:
for i, row in business_df.iterrows():
    attrs = row['attributes']
    if attrs is not None:
        for key, value in attrs.items():
            print(key,value)

    if i > 10:
        break


BusinessAcceptsCreditCards True
RestaurantsDelivery False
OutdoorSeating False
BusinessAcceptsCreditCards False
BusinessParking {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}
BikeParking True
RestaurantsPriceRange2 1
RestaurantsTakeOut True
ByAppointmentOnly False
WiFi u'free'
Alcohol u'none'
Caters True
BusinessAcceptsCreditCards True
WheelchairAccessible True
RestaurantsTakeOut True
BusinessParking {'garage': None, 'street': None, 'validated': None, 'lot': True, 'valet': False}
BikeParking True
GoodForKids True
Caters False
BusinessParking None
BusinessAcceptsCreditCards True
RestaurantsAttire u'casual'
OutdoorSeating True
RestaurantsReservations False
Caters False
RestaurantsTakeOut True
Alcohol u'none'
Ambience None
GoodForKids True
RestaurantsPriceRange2 1
ByAppointmentOnly False
CoatCheck False
DogsAllowed False
RestaurantsTableService False
RestaurantsGoodForGroups True
RestaurantsDelivery True
WiFi u'no'
WheelchairAccessible True
HasTV True


In [44]:
# Now let's check unique values for interesting fields
for attr in [
        'BusinessAcceptsCreditCards', 'RestaurantsTakeOut', 'RestaurantsDelivery',
        'OutdoorSeating', 'WiFi', 'Alcohol', 'NoiseLevel', 'RestaurantsGoodForGroups',
        'RestaurantsReservations', 'RestaurantsPriceRange2'
    ]:
    values = business_df['attributes'].apply(lambda x: x.get(attr, 'unknown')).unique()
    print(f"{attr}: {values}")


AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
review_df = pd.read_json(review_path, lines=True)

In [11]:
def enhance_user_features(users_df):
    users_df = users_df.copy()

    # Rename columns for consistency
    users_df.rename(columns={
        'name': 'user_name',
        'review_count': 'user_review_count'
    }, inplace=True)

    # Parse datetime
    users_df['yelping_since'] = pd.to_datetime(users_df['yelping_since'])
    users_df['yelping_years'] = (pd.to_datetime('now') - users_df['yelping_since']).dt.days / 365.0
    users_df['yelping_years'] = users_df['yelping_years'].fillna(0)

    # Core engagement
    users_df['reviews_per_year'] = users_df['user_review_count'] / (users_df['yelping_years'] + 1)
    users_df['engagement_score'] = (users_df['useful'] + users_df['funny'] + users_df['cool'] + users_df['fans']) / (users_df['yelping_years'] + 1)
    users_df['engagement_per_review'] = (users_df['useful'] + users_df['funny'] + users_df['cool']) / (users_df['user_review_count'] + 1)

    # Compliments
    compliment_cols = [col for col in users_df.columns if col.startswith('compliment_')]
    users_df['total_compliments'] = users_df[compliment_cols].sum(axis=1)
    users_df['compliments_per_review'] = users_df['total_compliments'] / (users_df['user_review_count'] + 1)
    users_df['compliment_type_count'] = users_df[compliment_cols].astype(bool).sum(axis=1)

    # Social metrics
    users_df['num_friends'] = users_df['friends'].fillna('').apply(lambda x: len(x.split(',')) if x else 0)
    users_df['friend_density'] = users_df['num_friends'] / (users_df['yelping_years'] + 1)
    users_df['friend_to_fan_ratio'] = users_df['num_friends'] / (users_df['fans'] + 1)

    # Elite years
    users_df['elite_years_count'] = users_df['elite'].fillna('').apply(lambda x: len(x.split(',')) if x else 0)
    users_df['is_elite'] = users_df['elite'].apply(lambda x: 0 if x == '' else 1)
    users_df['elite_score'] = np.log1p(users_df['elite_years_count'])

    # Rating behavior
    users_df['tough_reviewer'] = (users_df['average_stars'] < 3.0).astype(int)

    # Grouped compliments
    social_comps = ['compliment_cool', 'compliment_funny', 'compliment_photos']
    thoughtful_comps = ['compliment_note', 'compliment_writer', 'compliment_plain']
    users_df['social_compliments'] = users_df[social_comps].sum(axis=1)
    users_df['thoughtful_compliments'] = users_df[thoughtful_comps].sum(axis=1)

    # Drop columns that won't be used directly
    drop_cols = ['friends', 'elite', 'yelping_since', 'cool', 'funny', 'useful', 'fans', 'average_stars', 'compliment_hot', 'compliment_list', 'compliment_note', 'compliment_writer', 'compliment_photos', 'compliment_plain', 'compliment_cool', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_note', 'compliment_writer', 'compliment_photos', 'compliment_plain', 'compliment_more', 'compliment_profile', 'compliment_cute']
    users_df.drop(columns=drop_cols, inplace=True)

    return users_df


In [12]:
users_df = enhance_user_features(users_df)
users_df.head()

Unnamed: 0,user_id,user_name,user_review_count,yelping_years,reviews_per_year,engagement_score,engagement_per_review,total_compliments,compliments_per_review,compliment_type_count,num_friends,friend_density,friend_to_fan_ratio,elite_years_count,is_elite,elite_score,tough_reviewer,social_compliments,thoughtful_compliments
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,18.224658,30.429671,766.567621,24.692833,2873,4.90273,11,14995,779.987887,55.951493,1,1,0.693147,0,1114,1315
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,16.221918,251.597996,5027.082405,19.251961,20631,4.760268,11,4646,269.77251,1.480089,14,1,2.70805,0,8208,10422
2,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,18.276712,4.098209,2.697555,0.6375,4,0.05,4,27,1.400654,13.5,0,0,0.0,0,0,2
3,E9kcWJdJUHuTKfQurPljwA,Mike,358,16.342466,20.64297,38.460506,1.793872,56,0.155989,8,82,4.728278,3.416667,0,0,0.0,0,24,19
4,4ZaqBJqt7laPPs8xfWvr6A,Nina,801,16.663014,45.349,190.907399,4.110973,306,0.381546,10,492,27.854816,6.473684,13,1,2.639057,0,102,164


In [13]:
users_df.columns

Index(['user_id', 'user_name', 'user_review_count', 'yelping_years',
       'reviews_per_year', 'engagement_score', 'engagement_per_review',
       'total_compliments', 'compliments_per_review', 'compliment_type_count',
       'num_friends', 'friend_density', 'friend_to_fan_ratio',
       'elite_years_count', 'is_elite', 'elite_score', 'tough_reviewer',
       'social_compliments', 'thoughtful_compliments'],
      dtype='object')

In [47]:
import pandas as pd
import numpy as np
import ast
from datetime import datetime

# Fix time string formatting
def fix_time(t):
    parts = t.split(':')
    return f"{int(parts[0]):02d}:{int(parts[1]):02d}"

# Safely parse dictionary-like strings
def safe_parse_dict(s):
    if isinstance(s, dict):
        return s
    if isinstance(s, str):
        try:
            return ast.literal_eval(s)
        except:
            return {}
    return {}

# Extract hour features
def extract_hour_features(hours_str):
    hours_dict = safe_parse_dict(hours_str)
    total_minutes = 0
    open_weekends = 0
    open_late = 0
    days_open = 0

    for day, time_str in hours_dict.items():
        try:
            open_time, close_time = time_str.split('-')
            open_dt = datetime.strptime(fix_time(open_time), "%H:%M")
            close_dt = datetime.strptime(fix_time(close_time), "%H:%M")

            # Handle overnight hours
            if close_dt <= open_dt:
                duration = (24 - open_dt.hour + close_dt.hour) * 60 + (close_dt.minute - open_dt.minute)
            else:
                duration = (close_dt.hour - open_dt.hour) * 60 + (close_dt.minute - open_dt.minute)

            total_minutes += duration
            days_open += 1

            if close_dt.hour >= 22:
                open_late = 1
            if day in ['Saturday', 'Sunday']:
                open_weekends = 1

        except Exception as e:
            continue

    return pd.Series({
        'weekly_hours': round(total_minutes / 60, 1),
        'open_weekends': open_weekends,
        'open_late': open_late,
        'days_open': days_open
    })

# Preprocessing pipeline
def business_data_preprocessing(business_df, top_n_categories=20):

    # Filter for open restaurants
    business_df = business_df[
        business_df['categories'].apply(lambda x: isinstance(x, str) and 'Restaurants' in x)
    ]
    business_df = business_df[business_df['is_open'] == 1].copy()

    # Rename
    business_df.rename(columns={
        'stars': 'rating',
        'review_count': 'business_review_count',
        'name': 'business_name'
    }, inplace=True)

    # Log popularity, binary rating
    business_df['popularity'] = np.log1p(business_df['business_review_count'])
    business_df['high_rating'] = (business_df['rating'] >= 4).astype(int)

    # Parse attributes safely
    curated_attributes = [
        'BusinessAcceptsCreditCards', 'RestaurantsTakeOut', 'RestaurantsDelivery',
        'OutdoorSeating', 'ByAppointmentOnly', 'Caters', 'RestaurantsGoodForGroups',
        'RestaurantsReservations', 'RestaurantsPriceRange2', 'BikeParking', 'WheelchairAccessible', 'GoodForKids', 'DogsAllowed', 'HasTV', 'HappyHour', 'DriveThru'
    ]

    def extract_attribute(attr_str, key):
        attr_dict = safe_parse_dict(attr_str)
        val = attr_dict.get(key)

        if val == 'True' or val is True:
            return 1
        elif val == 'False' or val is False:
            return 0
        else:
            return -1  # unknown or missing


    for attr in curated_attributes:
        business_df[f'attr_{attr.lower()}'] = business_df['attributes'].apply(lambda x: extract_attribute(x, attr))

    # Category processing
    business_df['category_list'] = business_df['categories'].apply(lambda x: [c.strip() for c in x.split(',')])
    all_categories = business_df['category_list'].explode()
    top_categories = all_categories.value_counts().head(top_n_categories).index.tolist()

    cat_df = pd.DataFrame()
    for cat in top_categories:
        col_name = f"cat_{cat.lower().replace(' ', '_')}"
        cat_df[col_name] = business_df['category_list'].apply(lambda x: int(cat in x))

    # Extract hour features
    hour_features = business_df['hours'].apply(extract_hour_features)

    # Final dataframe
    final_df = pd.concat([
        business_df.drop(columns=[
            'attributes', 'categories', 'category_list',
            'address', 'city', 'state', 'postal_code',
            'latitude', 'longitude', 'hours'
        ]),
        cat_df,
        hour_features
    ], axis=1)

    return final_df


In [48]:
business_df = business_data_preprocessing(business_df)
business_df.head()

Unnamed: 0,business_id,business_name,rating,business_review_count,is_open,popularity,high_rating,attr_businessacceptscreditcards,attr_restaurantstakeout,attr_restaurantsdelivery,...,cat_seafood,cat_salad,cat_chicken_wings,cat_event_planning_&_services,cat_chinese,cat_cafes,weekly_hours,open_weekends,open_late,days_open
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,4.0,80,1,4.394449,1,0,1,0,...,0,0,0,0,0,0,94.0,1.0,0.0,7.0
3,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,2.0,6,1,1.94591,0,1,1,1,...,0,0,0,0,0,0,114.0,1.0,1.0,7.0
5,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,1.5,10,1,2.397895,0,1,1,1,...,0,0,0,0,0,0,100.0,1.0,0.0,7.0
7,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,4.0,10,1,2.397895,1,-1,-1,-1,...,0,0,0,0,0,0,23.0,1.0,0.0,7.0
8,il_Ro8jwPlHresjw9EGmBg,Denny's,2.5,28,1,3.367296,0,1,1,1,...,0,0,0,0,0,0,112.0,1.0,1.0,7.0


In [22]:
business_df.columns

Index(['business_id', 'business_name', 'rating', 'business_review_count',
       'is_open', 'hours', 'popularity', 'high_rating',
       'attr_businessacceptscreditcards', 'attr_restaurantstakeout',
       'attr_restaurantsdelivery', 'attr_outdoorseating', 'attr_wifi',
       'attr_alcohol', 'attr_noiselevel', 'attr_restaurantsgoodforgroups',
       'attr_restaurantsreservations', 'attr_restaurantspricerange2',
       'cat_restaurants', 'cat_food', 'cat_nightlife', 'cat_sandwiches',
       'cat_bars', 'cat_american_(traditional)', 'cat_fast_food', 'cat_pizza',
       'cat_breakfast_&_brunch', 'cat_burgers', 'cat_american_(new)',
       'cat_mexican', 'cat_italian', 'cat_coffee_&_tea', 'cat_seafood',
       'cat_salad', 'cat_chicken_wings', 'cat_event_planning_&_services',
       'cat_chinese', 'cat_cafes'],
      dtype='object')

In [41]:
all_categories = []
for i, row in business_df.iterrows():
    if row['categories'] is not None:
        current_categories = row['categories'].split(', ')
        for category in current_categories:
            all_categories.append(category)

all_categories

['Shipping Centers',
 'Local Services',
 'Notaries',
 'Mailbox Centers',
 'Printing Services',
 'Restaurants',
 'Food',
 'Bubble Tea',
 'Coffee & Tea',
 'Bakeries',
 'Brewpubs',
 'Breweries',
 'Food',
 'Burgers',
 'Fast Food',
 'Sandwiches',
 'Food',
 'Ice Cream & Frozen Yogurt',
 'Restaurants',
 'Synagogues',
 'Religious Organizations',
 'Ice Cream & Frozen Yogurt',
 'Fast Food',
 'Burgers',
 'Restaurants',
 'Food',
 'Department Stores',
 'Shopping',
 'Fashion',
 'Vietnamese',
 'Food',
 'Restaurants',
 'Food Trucks',
 'American (Traditional)',
 'Restaurants',
 'Diners',
 'Breakfast & Brunch',
 'Sushi Bars',
 'Restaurants',
 'Japanese',
 'Automotive',
 'Auto Parts & Supplies',
 'Auto Customization',
 'Automotive',
 'Car Rental',
 'Hotels & Travel',
 'Truck Rental',
 'Korean',
 'Restaurants',
 'Shopping',
 'Books',
 'Mags',
 'Music & Video',
 'Bookstores',
 'Steakhouses',
 'Asian Fusion',
 'Restaurants',
 'Restaurants',
 'Italian',
 'Pizza',
 'Chicken Wings',
 'Sandwiches',
 'Restaurant

In [42]:
len(set(all_categories))


1270

In [43]:
def check_categories(categories):
    """Return all food related categories"""
    try:
        return 'Restaurants' in categories.split(', ')
    except Exception:
        return False

filtered_business = business_df[business_df['categories'].apply(check_categories)]
len(filtered_business)

30918

In [44]:
# only keep open businesses
filtered_business = filtered_business[filtered_business['is_open'] == 1]
len(filtered_business)



29835

In [51]:
review_df = pd.read_json(review_path, lines=True)
review_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Z6-2rrZvRnDcXpBiqhsk9A,pk3SPF_Ip1t65E3G9xFoKA,-e8RwknT5szoLk9uBZjzcw,5,1,0,0,Had a great time tasting for my moms birthday....,2021-03-01 00:00:43
1,C08Oi6p84SfNq_fh4GPKEQ,kk4GynEiF13My49uJ3hP7w,XYjlZTXifIjpAbPW33tCXw,5,0,0,0,Best place for ice cream on the go. Very covid...,2021-03-01 00:00:52
2,1skKNG9rGSArmPXfvlcy6Q,DDpOQU-_tXEdHsoJN-REOg,bSJczuohHVko33UT82hnfA,2,0,0,0,I went to blaze yesterday around this same tim...,2021-03-01 00:01:08
3,oabif-hAnvLWOmSc4HCATQ,s4Cd9whJ8_g_p4iABq4hyQ,7UbXvL4SnVEDtx8oJbbuhw,2,0,1,0,"The gas is cheap, that's about it. The servic...",2021-03-01 00:01:25
4,mGW-EduqWdKsdHYVj8vSag,WXVo7Ff1NQPE71dnb0pK-A,UuRZpgZsHX7PyykHV3wbZg,5,0,0,0,It was pretty late by the time I went in so th...,2021-03-01 00:01:41


In [53]:
filtered_reviews = pd.merge(review_df, users_df, on='user_id', how='inner')
filtered_reviews = pd.merge(filtered_reviews, business_df, on='business_id', how='inner')

In [54]:
print(len(filtered_reviews))
filtered_reviews.head()

366252


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,user_name,...,cat_seafood,cat_salad,cat_chicken_wings,cat_event_planning_&_services,cat_chinese,cat_cafes,weekly_hours,open_weekends,open_late,days_open
0,1skKNG9rGSArmPXfvlcy6Q,DDpOQU-_tXEdHsoJN-REOg,bSJczuohHVko33UT82hnfA,2,0,0,0,I went to blaze yesterday around this same tim...,2021-03-01 00:01:08,Malik,...,0,1,0,0,0,0,89.0,1.0,1.0,7.0
1,mGW-EduqWdKsdHYVj8vSag,WXVo7Ff1NQPE71dnb0pK-A,UuRZpgZsHX7PyykHV3wbZg,5,0,0,0,It was pretty late by the time I went in so th...,2021-03-01 00:01:41,Sarah,...,0,0,0,0,0,1,56.0,1.0,0.0,7.0
2,fpFl0aaBvH-ODbFlG8Nqew,kk4GynEiF13My49uJ3hP7w,3Z4rhPFO6XWJRQipEIZCWA,3,0,0,0,Always been a place with great atmosphere and ...,2021-03-01 00:02:54,Joe,...,0,0,0,0,0,0,64.0,1.0,1.0,6.0
3,d0BNtceSzXS34hMhXKzhJw,9lxeey8azLxu_DhcsrCO2Q,aRb_ToTRcHa9BICLLj8n-A,4,0,0,0,This is becoming our go to takeout restaurant....,2021-03-01 00:03:06,David,...,0,0,0,0,0,0,53.0,1.0,0.0,6.0
4,GEsSANWisPzNnR7MwSRLOA,x2PV9PpcRbJm74a9mb07ug,q7rBuON2WEIhdfz-AIYFFg,5,1,0,1,Tryed a new restaurant tonight! Mediterranean ...,2021-03-01 00:03:53,Nancy,...,0,0,0,0,0,0,55.0,1.0,0.0,6.0


In [55]:
filtered_reviews.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date', 'user_name', 'user_review_count',
       'yelping_years', 'reviews_per_year', 'engagement_score',
       'engagement_per_review', 'total_compliments', 'compliments_per_review',
       'compliment_type_count', 'num_friends', 'friend_density',
       'friend_to_fan_ratio', 'elite_years_count', 'is_elite', 'elite_score',
       'tough_reviewer', 'social_compliments', 'thoughtful_compliments',
       'business_name', 'rating', 'business_review_count', 'is_open',
       'popularity', 'high_rating', 'attr_businessacceptscreditcards',
       'attr_restaurantstakeout', 'attr_restaurantsdelivery',
       'attr_outdoorseating', 'attr_byappointmentonly', 'attr_caters',
       'attr_restaurantsgoodforgroups', 'attr_restaurantsreservations',
       'attr_restaurantspricerange2', 'attr_bikeparking',
       'attr_wheelchairaccessible', 'attr_goodforkids', 'attr_dogsallowed',
       'attr_hastv', '

In [48]:
filtered_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366252 entries, 0 to 366251
Data columns (total 43 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   review_id              366252 non-null  object        
 1   user_id                366252 non-null  object        
 2   business_id            366252 non-null  object        
 3   stars                  366252 non-null  int64         
 4   useful_x               366252 non-null  int64         
 5   funny_x                366252 non-null  int64         
 6   cool_x                 366252 non-null  int64         
 7   text                   366252 non-null  object        
 8   date                   366252 non-null  datetime64[ns]
 9   user_name              366252 non-null  object        
 10  user_review_count      366252 non-null  int64         
 11  yelping_since          366252 non-null  object        
 12  useful_y               366252 non-null  int6

In [5]:
photos_df = pd.read_json(photos_path, lines=True)
photos_df.head()

Unnamed: 0,photo_id,business_id,caption,label
0,zsvj7vloL4L5jhYyPIuVwg,Nk-SJhPlDBkAZvfsADtccA,Nice rock artwork everywhere and craploads of ...,inside
1,HCUdRJHHm_e0OCTlZetGLg,yVZtL5MmrpiivyCIrVkGgA,,outside
2,vkr8T0scuJmGVvN2HJelEA,_ab50qdWOk0DdB6XOrBitw,oyster shooter,drink
3,pve7D6NUrafHW3EAORubyw,SZU9c8V2GuREDN5KgyHFJw,Shrimp scampi,food
4,H52Er-uBg6rNrHcReWTD2w,Gzur0f0XMkrVxIwYJvOt2g,,food


In [49]:
from sklearn.preprocessing import LabelEncoder

# Encode user and business ids using LabelEncoder   
le_user = LabelEncoder()
le_business = LabelEncoder()
filtered_reviews['user_id_enc'] = le_user.fit_transform(filtered_reviews['user_id'])
filtered_reviews['business_id_enc'] = le_business.fit_transform(filtered_reviews['business_id'])

In [50]:
filtered_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful_x,funny_x,cool_x,text,date,user_name,...,latitude,longitude,rating,business_review_count,is_open,attributes,categories,hours,user_id_enc,business_id_enc
0,1skKNG9rGSArmPXfvlcy6Q,DDpOQU-_tXEdHsoJN-REOg,bSJczuohHVko33UT82hnfA,2,0,0,0,I went to blaze yesterday around this same tim...,2021-03-01 00:01:08,Malik,...,29.948395,-90.075281,4.0,302,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Pizza, Fast Food, Gluten-Free, Restaurants, Salad","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ...",43324,18402
1,mGW-EduqWdKsdHYVj8vSag,WXVo7Ff1NQPE71dnb0pK-A,UuRZpgZsHX7PyykHV3wbZg,5,0,0,0,It was pretty late by the time I went in so th...,2021-03-01 00:01:41,Sarah,...,34.42572,-119.705241,4.0,761,1,"{'BusinessParking': '{'garage': False, 'street...","Bakeries, Cafes, Coffee & Tea, Food, Desserts,...","{'Monday': '8:0-16:0', 'Tuesday': '8:0-16:0', ...",101817,14941
2,fpFl0aaBvH-ODbFlG8Nqew,kk4GynEiF13My49uJ3hP7w,3Z4rhPFO6XWJRQipEIZCWA,3,0,0,0,Always been a place with great atmosphere and ...,2021-03-01 00:02:54,Joe,...,28.011587,-82.788281,3.5,806,1,"{'RestaurantsReservations': 'True', 'GoodForKi...","Mexican, Restaurants, Desserts, Vegetarian, Fo...","{'Tuesday': '11:0-21:30', 'Wednesday': '11:0-2...",148081,2234
3,d0BNtceSzXS34hMhXKzhJw,9lxeey8azLxu_DhcsrCO2Q,aRb_ToTRcHa9BICLLj8n-A,4,0,0,0,This is becoming our go to takeout restaurant....,2021-03-01 00:03:06,David,...,40.152561,-75.137371,4.5,174,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsTa...","Restaurants, Vietnamese","{'Monday': '11:0-19:0', 'Wednesday': '11:0-20:...",32871,17949
4,GEsSANWisPzNnR7MwSRLOA,x2PV9PpcRbJm74a9mb07ug,q7rBuON2WEIhdfz-AIYFFg,5,1,0,1,Tryed a new restaurant tonight! Mediterranean ...,2021-03-01 00:03:53,Nancy,...,38.789061,-90.549623,5.0,123,1,"{'BikeParking': 'True', 'BYOB': 'False', 'Rest...","Mediterranean, Restaurants, Middle Eastern","{'Tuesday': '10:0-20:0', 'Wednesday': '15:0-20...",185620,25283


In [56]:
users_df.columns

Index(['user_id', 'user_name', 'user_review_count', 'yelping_years',
       'reviews_per_year', 'engagement_score', 'engagement_per_review',
       'total_compliments', 'compliments_per_review', 'compliment_type_count',
       'num_friends', 'friend_density', 'friend_to_fan_ratio',
       'elite_years_count', 'is_elite', 'elite_score', 'tough_reviewer',
       'social_compliments', 'thoughtful_compliments'],
      dtype='object')

In [57]:
business_df.columns

Index(['business_id', 'business_name', 'rating', 'business_review_count',
       'is_open', 'popularity', 'high_rating',
       'attr_businessacceptscreditcards', 'attr_restaurantstakeout',
       'attr_restaurantsdelivery', 'attr_outdoorseating',
       'attr_byappointmentonly', 'attr_caters',
       'attr_restaurantsgoodforgroups', 'attr_restaurantsreservations',
       'attr_restaurantspricerange2', 'attr_bikeparking',
       'attr_wheelchairaccessible', 'attr_goodforkids', 'attr_dogsallowed',
       'attr_hastv', 'attr_happyhour', 'attr_drivethru', 'cat_restaurants',
       'cat_food', 'cat_nightlife', 'cat_sandwiches', 'cat_bars',
       'cat_american_(traditional)', 'cat_fast_food', 'cat_pizza',
       'cat_breakfast_&_brunch', 'cat_burgers', 'cat_american_(new)',
       'cat_mexican', 'cat_italian', 'cat_coffee_&_tea', 'cat_seafood',
       'cat_salad', 'cat_chicken_wings', 'cat_event_planning_&_services',
       'cat_chinese', 'cat_cafes', 'weekly_hours', 'open_weekends',
     