In [83]:
from pymongo import MongoClient
import numpy as np
import gpxpy.geo
from tqdm import tqdm
from collections import defaultdict
client = MongoClient()
db = client.yelp
from sklearn.cluster import KMeans
    
def get_radius_for_city(city, industry):
    def get_all_reviews_by_user_for_city(city):
        bizes = list(db.businesses.find({"city": city, 'categories': {"$in": [industry]}}, { "business_id": 1, "latitude": 1, "longitude": 1}))
        user_reviews = defaultdict(list)
        for biz in bizes:
            rel_review = db.reviews.find({"business_id": biz['business_id']})
            for review in list(rel_review):
                user_reviews[review['user_id']].append((biz["latitude"], biz["longitude"]))
        
        return user_reviews    
    
    def get_average_lat_and_long(set_lat_by_long):
        set_lat_by_long = np.asarray(set_lat_by_long).copy()
        return np.average(set_lat_by_long, axis=0)

    def get_max_distance_from_mid(set_lat_by_long):
        set_lat_by_long = np.asarray(set_lat_by_long)
        mid = get_average_lat_and_long(set_lat_by_long)
        return np.max([gpxpy.geo.haversine_distance(mid[0], mid[1], lat, lon)/1609.34 for lat, lon in set_lat_by_long])
    
    def get_clusters(data, n_clusters=2):
        kmeans = KMeans(n_clusters=2).fit(data)
        centers = kmeans.cluster_centers_
        to_return = [[] for i in range(len(centers))]
        for x in data:
            dist = np.linalg.norm(centers - x, 1, axis=1)
            index = np.argmin(dist)
            to_return[index].append(x)
        return sorted([np.vstack(i) for i in to_return], key=lambda x: len(x))[::-1]
    
    city_reviews = get_all_reviews_by_user_for_city(city)
    if len(city_reviews) == 0:
        return None
    city_radi = []
    review_weights = [] # the more reviews that a user has given the more confident we are that they are part of the competitive region
    for i, val in enumerate(city_reviews.values()):
        if len(val) > 1:
            dis_from_mid = get_max_distance_from_mid(val)
            k = 2
            while dis_from_mid > 100:
                val = get_clusters(val, n_clusters=k)[0]
                k += 1
                dis_from_mid = get_max_distance_from_mid(val)
            city_radi.append(dis_from_mid)
            review_weights.append(len(val))
            
    if len(np.array(review_weights).T) == 0:
        return None
    final = np.dot(np.array(review_weights).T, np.array(city_radi))/np.sum(review_weights)
    return city, final, industry

In [91]:
def put_stuff_in_database():
    cities = db.businesses.find().distinct('city')
    industries = db.businesses.find().distinct('categories')
    count = 0
    for city in cities:
    # print industries
        for industry in industries:
            if industry == None:
                continue
            items = get_radius_for_city(city, industry)
            if items == None:
                continue

            db.findIndustryDist.insert_one({"city": items[0], "radius": items[1], "industry": items[2]})
        count += 1
        
        if count == 100:
            break
    
put_stuff_in_database()

In [62]:
industries = db.businesses.find().distinct('categories')
for industry in industries:
    if industry == None:
        continue
    stuff = list(db.businesses.find({"city": 'Tempe', 'industries': {"$in": [industry]}}, { "business_id": 1, "latitude": 1, "longitude": 1, "categories": 1}))
    if len(stuff) == 0:
#         print industry
        continue
    print industry
    print stuff
    break

In [66]:
db.businesses.find_one({"city": 'Tempe', "categories": {"$in": ["Tobacco Shops"]}}, { "business_id": 1, "latitude": 1, "longitude": 1, "categories": 1})

{u'_id': ObjectId('58c06b45b33b0a8d2afc16fe'),
 u'business_id': u'0DI8Dt2PJp07XkVvIElIcQ',
 u'categories': [u'Tobacco Shops', u'Nightlife', u'Vape Shops', u'Shopping'],
 u'latitude': 33.3782141,
 u'longitude': -111.936102}

In [102]:
data = list(db.findIndustryDist.find())

for x in data:
    print x

{u'city': u'Ahwatukee', u'industry': u'Health & Medical', u'_id': ObjectId('58d54248236f4409f78e7d23'), u'radius': 0.7820309019857332}
{u'city': u'Ahwatukee', u'industry': u'Italian', u'_id': ObjectId('58d54248236f4409f78e7d24'), u'radius': 0.5929347086385134}
{u'city': u'Ahwatukee', u'industry': u'Pizza', u'_id': ObjectId('58d54248236f4409f78e7d25'), u'radius': 0.5929347086385134}
{u'city': u'Ahwatukee', u'industry': u'Restaurants', u'_id': ObjectId('58d54248236f4409f78e7d26'), u'radius': 0.7335777560665556}
{u'city': u'Ahwatukee', u'industry': u'Sandwiches', u'_id': ObjectId('58d54248236f4409f78e7d27'), u'radius': 0.5929347086385134}
{u'city': u'Aichwald', u'industry': u'Restaurants', u'_id': ObjectId('58d54249236f4409f78e7d28'), u'radius': 0.5979559072183469}
{u'city': u'Ajax', u'industry': u'Active Life', u'_id': ObjectId('58d54249236f4409f78e7d29'), u'radius': 1.528200593608912}
{u'city': u'Ajax', u'industry': u'American (Traditional)', u'_id': ObjectId('58d54249236f4409f78e7d2a')

In [98]:
db.findIndustryDist.remove()

  if __name__ == '__main__':


{u'n': 1029, u'ok': 1.0}

In [100]:
import radiiByIndustryByCity
put_stuff_in_database()