## Importing Libraries

In [None]:
# importing libraries

import difflib
import itertools
import json
import math
import re
import time
from collections import Counter
from math import radians, cos, sin, asin, sqrt

import pandas as pd
import requests
from geopy import distance
from tqdm.notebook import tqdm_notebook

In [99]:
# filtering local authority ids for a region

region_name = 'London'
url = "https://api.ratings.food.gov.uk/authorities"
headers = {"x-api-version": '2'}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a JSON-decoding error or a KeyError.
http_response = requests.get(url, headers=headers, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# Collect the matching rows first and build the frame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
authority_rows = [
    {
        'local_authority_id': authorities['LocalAuthorityId'],
        'local_authority_id_code': authorities['LocalAuthorityIdCode'],
        'name': authorities['Name'],
    }
    for authorities in response['authorities']
    if authorities['RegionName'] == region_name
]
authority_df = pd.DataFrame(
    authority_rows,
    columns=['local_authority_id', 'local_authority_id_code', 'name'],
)

# String form of the ids, which is what the establishment requests are built from.
authority_ids = authority_df['local_authority_id'].astype('str').to_list()

## Using API to pull all restaurants in London

In [None]:
# business types available

url = "https://api.ratings.food.gov.uk/businessTypes"
headers = {"x-api-version": '2'}

# Fail fast on a stalled connection or an HTTP error response.
http_response = requests.get(url, headers=headers, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# Build the lookup table in one go; DataFrame.append was removed in pandas 2.0.
business_types_df = pd.DataFrame(
    [
        {
            'business_type_id': business_types['BusinessTypeId'],
            'business_type_name': business_types['BusinessTypeName'],
        }
        for business_types in response['businessTypes']
    ],
    columns=['business_type_id', 'business_type_name'],
)

# selecting business ids relevant to ubereats
# NOTE(review): presumably chosen by inspecting business_types_df — confirm the
# ids still map to the intended business types.

select_business_ids = ['7846','7841','7843','1','7844']

In [None]:
# pulling dataset based on business type ids and authority ids

establishment_columns = [
    'establishment_id', 'establishment_name',
    'establishment_address_line1', 'establishment_address_line2',
    'establishment_address_line3', 'establishment_address_line4',
    'establishment_lng', 'establishment_lat',
    'establishment_business_type', 'establishment_business_type_id',
    'authority_id_code',
]

url = "https://api.ratings.food.gov.uk/establishments"
headers = {"x-api-version": '2'}

# Rows are accumulated in a plain list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
establishment_rows = []
for authority_id, business_id in itertools.product(authority_ids, select_business_ids):
    # One request per (local authority, business type) combination.
    params = {'businessTypeId': business_id, 'localAuthorityId': authority_id}
    http_response = requests.get(url, headers=headers, params=params, timeout=60)
    http_response.raise_for_status()
    response = http_response.json()

    # Wait one second after every request, so at most one call per second is made.
    time.sleep(1)

    for establishments in response['establishments']:
        establishment_rows.append({
            'establishment_id': establishments['LocalAuthorityBusinessID'],
            'establishment_name': establishments['BusinessName'],
            'establishment_address_line1': establishments['AddressLine1'],
            'establishment_address_line2': establishments['AddressLine2'],
            'establishment_address_line3': establishments['AddressLine3'],
            'establishment_address_line4': establishments['AddressLine4'],
            'establishment_lng': establishments['geocode']['longitude'],
            'establishment_lat': establishments['geocode']['latitude'],
            'establishment_business_type': establishments['BusinessType'],
            'establishment_business_type_id': establishments['BusinessTypeID'],
            'authority_id_code': establishments['LocalAuthorityCode'],
        })

df = pd.DataFrame(establishment_rows, columns=establishment_columns)

## Data cleaning and formatting

In [None]:
# write the data pulled using API to CSV

df.to_csv('all_food_establishments_in_london.csv')

# read data from CSV

all_food_establishments_in_london = pd.read_csv('all_food_establishments_in_london.csv')

# formatting data as required

# NOTE(review): df_all is loaded from 'df_based_on_ids.csv', not from the file
# written just above, and all_food_establishments_in_london is not used again
# in this section — confirm which file is meant to feed the matching step.
df_all = pd.read_csv('df_based_on_ids.csv')
# 'Unnamed: 0' is the index column that to_csv writes when no index=False is given.
df_all = df_all.drop(columns=['Unnamed: 0'])

# Removing duplicates and missing value treatment

df_all = df_all.drop_duplicates()

# Only comparing restaurants where GPS details are available

df_all = df_all.dropna(subset=['establishment_lng', 'establishment_lat'])

# Concatenating Restaurant name and address

# Missing address lines become empty strings so the concatenation never yields NaN.
df_all = df_all.fillna('')
df_all['full_name'] = (
    df_all['establishment_name']
    + " " + df_all['establishment_address_line1']
    + " " + df_all['establishment_address_line2']
    + " " + df_all['establishment_address_line3']
    + " " + df_all['establishment_address_line4']
)

# Reset index

# reset_index returns a new frame; the result has to be assigned back, otherwise
# the gaps left by the row filters above stay in the index.
df_all = df_all.reset_index(drop=True)
df_all

In [101]:
# read uber data from CSV

df_uber = pd.read_csv("ubereats.csv")

# formatting data as required

# Removing duplicates and missing value treatment

df_uber = df_uber.drop_duplicates()
df_uber = df_uber.dropna()

# Removing Cuisine Prefix from Restaurant names

# The first entry of the comma-separated cuisine list is the text stripped from the name.
df_uber['first_cuisine'] = df_uber.cuisine.str.split(',').str.get(0)
# NOTE(review): str.replace removes every occurrence of the cuisine text, not
# only a leading one — confirm that is acceptable for names that legitimately
# contain the cuisine word.
df_uber['new_name'] = df_uber.apply(
    lambda row: str(row['name']).replace(str(row['first_cuisine']), ''),
    axis=1,
)

# Concatenating Restaurant name and address

df_uber['full_name'] = df_uber['new_name'] + " " + df_uber['address']

# Reset index

# reset_index returns a new frame; assign it back so the index is actually
# renumbered after the duplicate/NaN rows were dropped.
df_uber = df_uber.reset_index(drop=True)
df_uber

## Matching Algorithm to find closest matching restaurant

In [106]:
# A token is a maximal run of word characters, as matched by the regex \w+.
WORD = re.compile(r"\w+")


# function to convert text to vector

def text_to_vector(text):
    """Return a bag-of-words ``Counter`` for *text*.

    Tokens are the successive non-overlapping matches of ``WORD``. Matching is
    case-sensitive, so "Pizza" and "pizza" are counted as different tokens.
    """
    return Counter(match.group() for match in WORD.finditer(text))

# function to get cosine similarity between two token-count vectors

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two token-count mappings.

    Args:
        vec1: mapping of token -> count (e.g. the output of text_to_vector).
        vec2: mapping of token -> count.

    Returns:
        The dot product over the shared tokens divided by the product of the
        two Euclidean norms, or 0.0 when that product is zero (for example
        when either mapping is empty).
    """
    shared_tokens = set(vec1) & set(vec2)
    dot_product = sum(vec1[token] * vec2[token] for token in shared_tokens)

    norm1 = math.sqrt(sum(vec1[token] ** 2 for token in vec1))
    norm2 = math.sqrt(sum(vec2[token] ** 2 for token in vec2))
    magnitude = norm1 * norm2

    # Guard against dividing by zero when a vector has no tokens.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0
    
# function to get haversine distance between two points

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Both points are given as (longitude, latitude) in decimal degrees. Note
    the argument order: longitude comes before latitude.
    """
    # Work in radians throughout.
    lam1 = radians(lon1)
    phi1 = radians(lat1)
    lam2 = radians(lon2)
    phi2 = radians(lat2)

    # Haversine formula: hav is the squared half-chord between the points,
    # central_angle the angle they subtend at the centre of the sphere.
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * asin(sqrt(hav))

    # 6367 km is the Earth radius used by this notebook.
    earth_radius_km = 6367
    return earth_radius_km * central_angle

# Matching algorithm: for each uber restaurant, the closest match in the
# universal dataset is the establishment with the most similar name and the
# shortest distance based on latitude and longitude. get_score combines both
# into a single number; min_score_est picks the establishment minimising it.

def get_score(orig_vect,comp_vect,lon1, lat1, lon2, lat2):
    """Return the match score between two restaurants (lower is better).

    Args:
        orig_vect: token-count vector of the first restaurant's name.
        comp_vect: token-count vector of the second restaurant's name.
        lon1, lat1: coordinates of the first restaurant, in decimal degrees.
        lon2, lat2: coordinates of the second restaurant, in decimal degrees.

    Returns:
        ``(1.0000001 - cosine similarity) ** 2 * haversine distance``. The
        small offset keeps the name term above zero for identical names, so
        the distance still contributes to the score.
    """
    name_similarity = get_cosine(orig_vect, comp_vect)
    separation_km = haversine(lon1, lat1, lon2, lat2)

    name_penalty = (1.0000001 - name_similarity)**2
    return name_penalty * separation_km

def min_score_est(df_all,orig_vect,lon1, lat1):
    """Find the establishment in *df_all* that best matches one restaurant.

    Args:
        df_all: frame with 'all_vector', 'establishment_name',
            'establishment_lng' and 'establishment_lat' columns.
        orig_vect: token-count vector of the restaurant being matched.
        lon1, lat1: coordinates of the restaurant being matched.

    Returns:
        Tuple ``(name, latitude, longitude, score)`` of the row with the
        smallest get_score value; on ties the first such row in frame order.

    Side effects:
        Overwrites (or creates) the 'score' column on the caller's *df_all*.
    """
    df_all['score'] = df_all.apply(
        lambda x: get_score(orig_vect, x['all_vector'], lon1, lat1,
                            x['establishment_lng'], x['establishment_lat']),
        axis=1,
    )

    min_score = df_all['score'].min()
    # Select the minimising rows once and read all three fields from them,
    # instead of re-filtering the whole frame for every field.
    best_rows = df_all[df_all['score'] == min_score]
    matching_rest = best_rows['establishment_name'].values[0]
    matching_lat = best_rows['establishment_lat'].values[0]
    matching_lng = best_rows['establishment_lng'].values[0]
    return matching_rest,matching_lat,matching_lng,min_score

# creating vectorized columns in both uber and universal dataset

df_uber['uber_vector'] = df_uber['new_name'].astype(str).apply(text_to_vector)
df_all['all_vector'] = df_all['establishment_name'].astype(str).apply(text_to_vector)

# For every uber restaurant, look up its best-scoring establishment in df_all.
# The results are gathered in a list and written back as whole columns:
# chained assignment (df[col].iloc[i] = value) may write to a temporary copy,
# and pre-filling the columns with integer 0 gave them the wrong dtype for the
# names and float coordinates stored later.
match_columns = ['closest_match_restaurant', 'closest_match_lat',
                 'closest_match_lng', 'score']
uber_rows = zip(df_uber['uber_vector'], df_uber['longitude'], df_uber['latitude'])
matches = [
    min_score_est(df_all, orig_vect, lon, lat)
    for orig_vect, lon, lat in tqdm_notebook(uber_rows, total=len(df_uber))
]

# Reuse df_uber's index so each result lines up with the row it was computed for.
matches_df = pd.DataFrame(matches, columns=match_columns, index=df_uber.index)
for column in match_columns:
    df_uber[column] = matches_df[column]
    
    

    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13486.0), HTML(value='')))




## Final overall dataset with common restaurants removed

In [107]:
df_uber.to_csv('uber_restaurant_with_closest_match_1.csv')

# Setting threshold on score to ensure accuracy

# Lower scores mean a closer name and a shorter distance, so only matches at or
# below the threshold are treated as the same restaurant.
df_uber_match_list = df_uber[df_uber['score']<=0.025]

# Removing restaurants found in uber dataset

# The merge must use the thresholded matches. Merging against the full df_uber
# would also remove establishments that were only a poor "closest" match,
# which makes the threshold above pointless.
list_of_restaurants_not_on_uber = df_all.merge(
    df_uber_match_list,
    left_on=['establishment_name', 'establishment_lng', 'establishment_lat'],
    right_on=['closest_match_restaurant', 'closest_match_lng', 'closest_match_lat'],
    how='left',
    indicator=True,
)

# 'left_only' rows are establishments with no accepted uber match.
list_of_restaurants_not_on_uber = list_of_restaurants_not_on_uber[list_of_restaurants_not_on_uber['_merge'] == 'left_only']

list_of_restaurants_not_on_uber.to_csv('list_of_restaurants_not_on_uber.csv')
