# Predicting neighborhoods where new restaurants can open based on categories and price rating
This notebook puts our learnings in all other notebooks together. Here we build the final models for predicting the following:
- Check-ins for a restaurant
- If the restaurant will remain open in a neighborhood
- Rating of the restaurant

All models take the same inputs:
- Neighborhood
- Categories
- Price rating

## Loading the dataset
The restaurants are loaded from `yelp_academic_dataset_business.json`, while the checkins are loaded from `yelp_academic_dataset_checkin.json`.

Businesses with at least one of the following categories are considered restaurants:
- Restaurants
- Food
- Bars

For each restaurant we only care about the following attributes:
- Neighborhood
- Price Rating
- Select few categories (see full list below)
- If the restaurant is open
- Restaurant star rating

Only the following categories are selected for each restaurant:

In [1]:
# Vocabulary of category names kept as model features. Entries are written in
# lower case because CategoryBuilder.build() strips and lower-cases each raw
# category before looking it up here; categories not listed are dropped.
chosen_categories = {'coffee & tea', 'specialty food', 'sandwiches', 'breakfast & brunch', 'chinese', 'cafes',
                     'canadian (new)', 'bakeries', 'fast food', 'pizza', 'desserts', 'italian', 'japanese', 'burgers', 'pubs',
                     'american (traditional)', 'sushi bars', 'indian', 'juice bars & smoothies', 'asian fusion', 'korean',
                     'mexican', 'middle eastern', 'thai', 'mediterranean', 'salad', 'chicken wings',
                     'ice cream & frozen yogurt', 'seafood', 'beer', 'wine & spirits', 'vegetarian', 'comfort food', 'vegan',
                     'greek', 'barbeque', 'vietnamese', 'diners', 'caribbean', 'french', 'american (new)', 'halal',
                     'ethnic food', 'gluten-free', 'delis', 'tea rooms', 'gastropubs', 'tapas/small plates', 'soup',
                     'steakhouses', 'bubble tea', 'dim sum', 'noodles', 'donuts', 'chicken shop', 'portuguese',
                     'chocolatiers & shops', 'ramen', 'tapas bars', 'latin american', 'bagels', 'pakistani', 'fish & chips',
                     'taiwanese', 'modern european', 'tex-mex', 'british', 'creperies', 'southern', 'filipino', 'african',
                     'hot dogs', 'irish', 'poke', 'ethiopian', 'afghan', 'turkish', 'falafel', 'hot pot', 'spanish',
                     'local flavor', 'himalayan/nepalese', 'hawaiian', 'lebanese', 'persian/iranian', 'polish', 'waffles',
                     'soul food', 'malaysian', 'sri lankan', 'live/raw food'}

In [2]:
import json
from types import SimpleNamespace

def Neighborhood(name, index):
    """Return a mutable record exposing the given `name` and `index` as attributes."""
    record = SimpleNamespace()
    record.name = name
    record.index = index
    return record

def Category(category, index):
    """Return a record exposing a category name and its `index` as attributes."""
    record = SimpleNamespace()
    record.category = category
    record.index = index
    return record

class CategoryBuilder(object):
    """Turns raw category strings into `Category` records for a fixed vocabulary.

    Attributes:
        cat_to_ind: dict mapping each chosen category name to its index.
    """

    def __init__(self, chosen_categories):
        """Assign an index to every name in `chosen_categories`.

        The vocabulary is de-duplicated and sorted before indexing. Enumerating
        a set directly would make the indices depend on the set's iteration
        order, which for strings is not stable across interpreter runs; sorting
        keeps indices reproducible, e.g. when compared with previously saved
        data.
        """
        self.cat_to_ind = {cat: i for i, cat in enumerate(sorted(set(chosen_categories)))}

    def build(self, category):
        """Return the `Category` for a raw string, or None if it is not chosen.

        The raw string is stripped of surrounding whitespace and lower-cased
        before the lookup.
        """
        category = category.strip().lower()
        index = self.cat_to_ind.get(category)
        if index is None:
            return None
        return Category(category, index)

def Restaurant(business, categoryBuilder):
    """Build a restaurant record from one parsed business dictionary.

    Only the categories that `categoryBuilder.build` recognises (returns a
    non-None value for) are kept. The neighborhood starts without an index,
    `is_open` defaults to False when the key is absent, and the price rating is
    read from `attributes['RestaurantsPriceRange2']` and converted to int.
    """
    business_id = business['business_id']
    neighborhood = Neighborhood(business['neighborhood'], None)
    city = business['city']
    location = [business['latitude'], business['longitude']]
    stars = business['stars']

    known_categories = []
    for raw_category in business['categories'].split(','):
        built = categoryBuilder.build(raw_category)
        if built is not None:
            known_categories.append(built)

    is_open = bool(business.get('is_open', 0))
    price_rating = int(business['attributes']['RestaurantsPriceRange2'])

    return SimpleNamespace(
        business_id=business_id,
        neighborhood=neighborhood,
        city=city,
        location=location,
        stars=stars,
        categories=known_categories,
        is_open=is_open,
        price_rating=price_rating
    )
    

def getRestaurantsFromFile(sparkContext, restaurant_file, checkins_file, chosen_categories):
    """Load restaurants and attach each one's total check-in count.

    Args:
        sparkContext: Spark context used to read BOTH input files.
        restaurant_file: path to a file with one JSON business per line.
        checkins_file: path to a file with one JSON check-in record per line;
            each record's `time` mapping values are summed into a total.
        chosen_categories: category names kept on each restaurant.

    Returns:
        A list of restaurant records (see `Restaurant`), each with an extra
        `checkins` attribute. Because the two datasets are inner-joined on
        `business_id`, businesses without a check-in record are not returned.

    A business is kept only if it has categories and attributes, at least one
    of its categories is 'restaurants', 'food' or 'bars', it has a
    'RestaurantsPriceRange2' attribute, and it has both coordinates.
    """
    restaurant_categories = {'restaurants', 'food', 'bars'}
    categoryBuilder = CategoryBuilder(chosen_categories)
    restaurants = sparkContext.textFile(restaurant_file) \
        .map(lambda row: json.loads(row)) \
        .filter(lambda business: business['categories'] is not None and business['attributes'] is not None) \
        .filter(lambda business: restaurant_categories & {x.strip().lower() for x in business['categories'].split(',')}) \
        .filter(lambda business: 'RestaurantsPriceRange2' in business['attributes']) \
        .filter(lambda business: business['latitude'] is not None and business['longitude'] is not None) \
        .map(lambda restaurant: Restaurant(restaurant, categoryBuilder)) \
        .keyBy(lambda restaurant: restaurant.business_id)
    # Read through the context passed in by the caller rather than a global
    # `sc`, so the function works outside a notebook that happens to define one.
    checkins = sparkContext.textFile(checkins_file) \
        .map(lambda row: json.loads(row)) \
        .map(lambda checkin: (checkin['business_id'], sum(checkin['time'].values())))

    def update_restaurant(restaurant, checkins):
        restaurant.checkins = checkins
        return restaurant

    # Joined rows look like (business_id, (checkin_total, restaurant)).
    return checkins.join(restaurants) \
        .map(lambda tup: update_restaurant(tup[1][1], tup[1][0])) \
        .collect()

In [3]:
# Load every restaurant that has check-in data, using the notebook's Spark context.
restaurants = getRestaurantsFromFile(
    sparkContext=sc,
    restaurant_file='../data/raw/yelp_academic_dataset_business.json',
    checkins_file='../data/raw/yelp_academic_dataset_checkin.json',
    chosen_categories=chosen_categories,
)

We focus on the top 4 cities:
- Toronto
- Phoenix
- Las Vegas
- Montreal

In [4]:
# Restaurants whose city is exactly 'Toronto'.
toronto = list(filter(lambda restaurant: restaurant.city == 'Toronto', restaurants))
print(len(toronto))

8888


In [5]:
# Restaurants whose city is exactly 'Las Vegas'.
vegas = list(filter(lambda restaurant: restaurant.city == 'Las Vegas', restaurants))
print(len(vegas))

7872


In [6]:
# Restaurants whose city is exactly 'Phoenix'.
phoenix = list(filter(lambda restaurant: restaurant.city == 'Phoenix', restaurants))
print(len(phoenix))

4666


In [7]:
# Restaurants whose city is exactly 'Montréal' (accented spelling).
montreal = list(filter(lambda restaurant: restaurant.city == 'Montréal', restaurants))
print(len(montreal))

3741


## Clustering into neighborhoods
A lot of restaurants don't have any neighborhood assigned. We employ kmeans with a custom distance function to assign neighborhoods to the restaurants per city.

In [8]:
from nltk.cluster.kmeans import KMeansClusterer
import numpy as np

def custom_dist(vec1, vec2):
    """Distance between two `[x, y, label]` vectors.

    Returns 0 when both vectors carry the same non-negative label in position
    2; otherwise returns the Euclidean norm of the difference of their first
    two components.
    """
    label = vec1[2]
    shares_known_label = label >= 0 and label == vec2[2]
    if shares_known_label:
        return 0
    offset = vec1[:2] - vec2[:2]
    return np.linalg.norm(offset)

def cluster_restaurants(restaurants, num_clusters, rng=None):
    """Assign every restaurant a cluster number in `r.neighborhood.index`.

    Each restaurant becomes a vector `location + [label]`, where `label` is a
    number shared by all restaurants with the same neighborhood name, or -1
    when the name is empty/None. The vectors are clustered with k-means using
    `custom_dist`, and each restaurant's `neighborhood.index` is overwritten
    in place with the cluster it was assigned to.

    Args:
        restaurants: sequence of restaurant records; mutated in place.
        num_clusters: number of k-means clusters.
        rng: optional random number generator forwarded to `KMeansClusterer`
            (e.g. `random.Random(0)`) to make the clustering repeatable. The
            default None keeps the clusterer's own default generator.

    Returns:
        None. An empty `restaurants` sequence is a no-op.
    """
    if len(restaurants) == 0:
        # The clusterer cannot be run on zero vectors.
        return

    # First pass: give each distinct neighborhood name a sequential number.
    neighborhood_inds = {}
    for r in restaurants:
        r.neighborhood.index = neighborhood_inds.setdefault(r.neighborhood.name, len(neighborhood_inds))

    locations = np.array([
        r.location + [r.neighborhood.index if r.neighborhood.name else -1]
        for r in restaurants
    ])
    clusterer = KMeansClusterer(num_clusters, custom_dist, repeats=2, rng=rng, avoid_empty_clusters=True)
    # assign_clusters=True returns one cluster per input vector, in input
    # order, so there is no need to rebuild each vector and classify it again.
    assignments = clusterer.cluster(locations, assign_clusters=True)
    for r, cluster_index in zip(restaurants, assignments):
        r.neighborhood.index = cluster_index

In [9]:
# Toronto restaurants are grouped into 95 clusters (updates them in place).
cluster_restaurants(restaurants=toronto, num_clusters=95)

In [10]:
# Las Vegas restaurants are grouped into 95 clusters (updates them in place).
cluster_restaurants(restaurants=vegas, num_clusters=95)

In [11]:
# Phoenix restaurants are grouped into 50 clusters (updates them in place).
cluster_restaurants(restaurants=phoenix, num_clusters=50)

In [12]:
# Montreal restaurants are grouped into 50 clusters (updates them in place).
cluster_restaurants(restaurants=montreal, num_clusters=50)

## Saving the data

In [13]:
import pickle

def saveToFile(obj, filename):
    """Pickle `obj` into the file at `filename`, overwriting any existing file."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)

def readFromFile(filename):
    """Return the object unpickled from the file at `filename`.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(filename, 'rb') as source:
        loaded = pickle.Unpickler(source).load()
    return loaded

In [14]:
# Persist the clustered restaurants, one pickle file per city.
saveToFile(obj=toronto, filename='../data/raw/clustered_restaurants_toronto.pickle')
saveToFile(obj=vegas, filename='../data/raw/clustered_restaurants_vegas.pickle')
saveToFile(obj=phoenix, filename='../data/raw/clustered_restaurants_phoenix.pickle')
saveToFile(obj=montreal, filename='../data/raw/clustered_restaurants_montreal.pickle')

In [15]:
# Reload the clustered restaurants from the per-city pickle files.
toronto = readFromFile(filename='../data/raw/clustered_restaurants_toronto.pickle')
vegas = readFromFile(filename='../data/raw/clustered_restaurants_vegas.pickle')
phoenix = readFromFile(filename='../data/raw/clustered_restaurants_phoenix.pickle')
montreal = readFromFile(filename='../data/raw/clustered_restaurants_montreal.pickle')

## Creating the classifiers

In [42]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from scipy.stats import expon
from scipy import stats
import math

class Classifiers(object):
    """Naive Bayes models for a city's check-in bucket, open flag and stars.

    Features are a one-hot neighborhood block followed by a multi-hot category
    block and, only when `use_price_rating` is true, a one-hot price block.
    The same feature layout is used for training and for `predict`, so the
    fitted models always receive the number of columns they were trained on.

    Attributes:
        data: the restaurants used for training and evaluation.
        use_price_rating: whether the price block is part of the features.
        models: namespace with `checkins`, `is_open` and `stars` estimators.
        num_neighborhoods, num_categories: widths of the first two feature
            blocks; set whenever the training matrix is built.
    """

    # Width of the one-hot price block (price ratings 1 to 4).
    NUM_PRICE_LEVELS = 4

    def __init__(self, restaurants, use_price_rating=False):
        """Store the data, create the three models and fit them once.

        Args:
            restaurants: restaurant records with `neighborhood.index`,
                `categories`, `price_rating`, `checkins`, `is_open`, `stars`.
            use_price_rating: include the price one-hot block in the features.
                Defaults to False, which matches the features the models were
                trained on before this flag existed.
        """
        self.data = restaurants
        self.use_price_rating = use_price_rating
        self.models = SimpleNamespace(
            checkins=BernoulliNB(),
            is_open=BernoulliNB(),
            stars=MultinomialNB()
        )
        self.fit()

    def fit(self, test_size=0.2, scoring=accuracy_score):
        """Train all three models and evaluate them on a held-out split.

        Args:
            test_size: forwarded to `train_test_split`.
            scoring: callable `scoring(y_true, y_pred)` used for the score.

        Returns:
            A namespace with `checkins`, `is_open` and `stars` entries, each a
            namespace holding `score` and `ks_test` (two-sample KS test between
            the predictions and the held-out targets).
        """
        # Splitting all arrays in one call keeps features and every target
        # aligned on exactly the same rows.
        (train_X, test_X,
         c_train_Y, c_test_Y,
         o_train_Y, o_test_Y,
         s_train_Y, s_test_Y) = train_test_split(
            self._get_traing_X(),
            self._get_checkins_Y(),
            self._get_is_open_Y(),
            self._get_stars_Y(),
            test_size=test_size,
            random_state=0
        )
        targets = (
            ('checkins', c_train_Y, c_test_Y),
            ('is_open', o_train_Y, o_test_Y),
            ('stars', s_train_Y, s_test_Y),
        )
        report = SimpleNamespace()
        for name, train_Y, test_Y in targets:
            model = getattr(self.models, name)
            model.fit(train_X, train_Y)
            predicted = model.predict(test_X)
            setattr(report, name, SimpleNamespace(
                score=scoring(test_Y, predicted),
                ks_test=stats.ks_2samp(predicted, test_Y)
            ))
        return report

    def predict(self, restaurants):
        """Predict check-in bucket, open flag and stars*2 for `restaurants`.

        Returns a namespace with `checkins`, `is_open` and `stars` arrays, one
        entry per restaurant. Neighborhood or category indices that were not
        seen during training are encoded as all zeros.
        """
        X = self._build_features(restaurants)
        return SimpleNamespace(
            checkins=self.models.checkins.predict(X),
            is_open=self.models.is_open.predict(X),
            stars=self.models.stars.predict(X),
        )

    def _build_features(self, restaurants):
        """Encode restaurants with the layout fixed by the training data."""
        restaurants = list(restaurants)
        price_offset = self.num_neighborhoods + self.num_categories
        width = price_offset + (self.NUM_PRICE_LEVELS if self.use_price_rating else 0)
        X = np.zeros((len(restaurants), width))
        for row, r in enumerate(restaurants):
            n_ind = r.neighborhood.index
            if 0 <= n_ind < self.num_neighborhoods:
                X[row, n_ind] = 1
            for c in r.categories:
                if 0 <= c.index < self.num_categories:
                    X[row, self.num_neighborhoods + c.index] = 1
            if self.use_price_rating and 1 <= r.price_rating <= self.NUM_PRICE_LEVELS:
                X[row, price_offset + r.price_rating - 1] = 1
        return X

    def _get_traing_X(self):
        """Build the training matrix and record the feature block widths."""
        self.num_neighborhoods = max(r.neighborhood.index for r in self.data) + 1
        # default=-1 gives a zero-width category block when no restaurant has
        # any chosen category, instead of raising on an empty sequence.
        self.num_categories = max(
            (cat.index for r in self.data for cat in r.categories), default=-1
        ) + 1
        return self._build_features(self.data)

    def _get_checkins_Y(self):
        """Return the check-in totals bucketed into 5 bins."""
        feature_vec_Y = [r.checkins for r in self.data]
        checkin_bins = self._get_checkin_bins(feature_vec_Y, 5)
        return np.digitize(feature_vec_Y, checkin_bins)

    def _get_is_open_Y(self):
        """Return the open flags as a float array (1.0 open, 0.0 closed)."""
        # The builtin float is the dtype the removed `np.float` alias stood for.
        return np.array([r.is_open for r in self.data], dtype=float)

    def _get_stars_Y(self):
        """Return the star ratings doubled."""
        return np.array([r.stars * 2 for r in self.data])

    def _get_checkin_bins(self, checkin_data, num_bins):
        """Return `num_bins` bin edges from an exponential fit of the data.

        Edge i is the i/num_bins quantile of an exponential distribution with
        the fitted scale. The fitted location is discarded, so the first edge
        is always 0.
        """
        _, scale = expon.fit(checkin_data)
        # scale is equal to 1/lambda according to http://reliawiki.org/index.php/The_Exponential_Distribution
        return np.array([
            -math.log(1 - i / num_bins) * scale
            for i in range(num_bins)
        ])

In [43]:
# Classifiers() already trains once in its constructor; the explicit fit()
# call trains again and its returned evaluation report is the cell output.
classifiers = SimpleNamespace()
classifiers.toronto = Classifiers(restaurants=toronto)
classifiers.toronto.fit()

namespace(checkins=namespace(ks_test=Ks_2sampResult(statistic=0.2890888638920135, pvalue=1.7059259126120749e-65), score=0.3858267716535433), is_open=namespace(ks_test=Ks_2sampResult(statistic=0.22553430821147358, pvalue=4.9797491213735405e-40), score=0.6856017997750281), stars=namespace(ks_test=Ks_2sampResult(statistic=0.18616422947131606, pvalue=2.0756447582630234e-27), score=0.29190101237345334))

In [44]:
# Train the Las Vegas models; fit() returns the evaluation report displayed below.
classifiers.vegas = Classifiers(restaurants=vegas)
classifiers.vegas.fit()

namespace(checkins=namespace(ks_test=Ks_2sampResult(statistic=0.28190476190476194, pvalue=2.8914067617774917e-55), score=0.5041269841269841), is_open=namespace(ks_test=Ks_2sampResult(statistic=0.19873015873015873, pvalue=1.1160954127723686e-27), score=0.6882539682539682), stars=namespace(ks_test=Ks_2sampResult(statistic=0.10793650793650794, pvalue=1.8261476296280326e-08), score=0.2723809523809524))

In [45]:
# Train the Phoenix models; fit() returns the evaluation report displayed below.
classifiers.phoenix = Classifiers(restaurants=phoenix)
classifiers.phoenix.fit()

namespace(checkins=namespace(ks_test=Ks_2sampResult(statistic=0.3190578158458244, pvalue=3.382371304873677e-42), score=0.4464668094218415), is_open=namespace(ks_test=Ks_2sampResult(statistic=0.22162740899357602, pvalue=1.3983921654055535e-20), score=0.6905781584582441), stars=namespace(ks_test=Ks_2sampResult(statistic=0.10385438972162742, pvalue=7.502485517710985e-05), score=0.2740899357601713))

In [46]:
# Train the Montreal models; fit() returns the evaluation report displayed below.
classifiers.montreal = Classifiers(restaurants=montreal)
classifiers.montreal.fit()

namespace(checkins=namespace(ks_test=Ks_2sampResult(statistic=0.2990654205607477, pvalue=6.733267702960478e-30), score=0.3658210947930574), is_open=namespace(ks_test=Ks_2sampResult(statistic=0.22296395193591453, pvalue=8.305252117053393e-17), score=0.7476635514018691), stars=namespace(ks_test=Ks_2sampResult(statistic=0.2069425901201602, pvalue=1.5453323186098562e-14), score=0.3164218958611482))

In [41]:
# Number of loaded Montréal restaurants that are flagged as open.
print(sum(1 for r in restaurants if r.city == 'Montréal' and r.is_open))

2901
