In [11]:
from pymongo import MongoClient
import numpy as np
import gpxpy.geo
from tqdm import tqdm
from collections import defaultdict
from sklearn.cluster import KMeans

db = MongoClient().yelp

def get_radius_for_city(city):
    
    def get_all_reviews_by_user_for_city(city):
        bizes = list(db.businesses.find({"city": city}, { "business_id": 1, "latitude": 1, "longitude": 1}))
        user_reviews = defaultdict(list)
        for biz in tqdm(bizes):
            rel_review = db.reviews.find({"business_id": biz['business_id']})
            for review in list(rel_review):
                user_reviews[review['user_id']].append((biz["latitude"], biz["longitude"]))
        return user_reviews    
    
    def get_average_lat_and_long(set_lat_by_long):
        set_lat_by_long = np.asarray(set_lat_by_long).copy()
        return np.average(set_lat_by_long, axis=0)

    def get_max_distance_from_mid(set_lat_by_long):
        set_lat_by_long = np.asarray(set_lat_by_long)
        mid = get_average_lat_and_long(set_lat_by_long)
        return np.max([gpxpy.geo.haversine_distance(mid[0], mid[1], lat, lon)/1609.34 for lat, lon in set_lat_by_long])
    
    def get_clusters(data, n_clusters=2):
        kmeans = KMeans(n_clusters=2).fit(data)
        centers = kmeans.cluster_centers_
        to_return = [[] for i in range(len(centers))]
        
        for x in data:
            dist = np.linalg.norm(centers - x, 1, axis=1)
            index = np.argmin(dist)
            to_return[index].append(x)
            
        return sorted([np.vstack(i) for i in to_return], key=lambda x: len(x))[::-1]
    
    city_reviews = get_all_reviews_by_user_for_city(city)
    city_radi = []
    review_weights = [] # the more reviews that a user has given the more confident we are that they are part of the competitive region
    for i, val in enumerate(city_reviews.values()):
        if len(val) > 1:
            dis_from_mid = get_max_distance_from_mid(val)
            k = 2
            while dis_from_mid > 100:
                val = get_clusters(val, n_clusters=k)[0]
                k += 1
                dis_from_mid = get_max_distance_from_mid(val)
            city_radi.append(dis_from_mid)
            review_weights.append(len(val))
            
    final = np.dot(np.array(review_weights).T, np.array(city_radi))/np.sum(review_weights)
    db.cityDistanceMetric.insert_one({"city" : city, "radius" : final})
    return city, final

In [12]:
cities = db.businesses.find().distinct('city')

for city in cities:
    get_radius_for_city(city)

  0%|          | 0/6254714 [00:00<?, ?it/s]


KeyError: 'latitude'