In [1]:
from pymongo import MongoClient
import numpy as np
import pandas as pd


In [2]:
client = MongoClient('mongodb://localhost/companies')
db = client.companies
col = db.geo_one
col.find({})

<pymongo.cursor.Cursor at 0x7f3635fa02b0>

# Sample Data

In [3]:
list(col.find({"venue_location.GeoPoint":  {
                                        "$near": { 
                                            "$geometry": {
                                                  "type": "Point" ,
                                                  "coordinates": [ -95.712891, 37.09024 ]
                                            },
                                        }
                                        }}, {"venue_location":1}).limit(2))

[{'_id': ObjectId('5e9dae164fae396022816343'),
  'venue_location': {'PointCategory': 'business office',
   'GeoPoint': {'type': 'Point', 'coordinates': [-95.712891, 37.09024]}}},
 {'_id': ObjectId('5e9dae41f98fc5f1ff8e32ff'),
  'venue_location': {'name': 'Starbucks',
   'GeoPoint': {'type': 'Point',
    'coordinates': [-95.7081298828125, 37.09497833251953]},
   'PointCategory': {'source': '4square location',
    'categories_names': 'Coffee Shop',
    'categories_raw': [{'id': '4bf58dd8d48988d1e0931735',
      'name': 'Coffee Shop',
      'pluralName': 'Coffee Shops',
      'shortName': 'Coffee Shop',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/coffeeshop_',
       'suffix': '.png'},
      'primary': True}],
    'formattedAddress': ['Amerika Birleşik Devletleri',
     'Kansas',
     'United States']}}}]

# # These are all the `categories_names` it has found on MongoDB

In [4]:
cur = col.find({},{"venue_location.PointCategory.categories_names":1})
categs = []
#for e in list(cur):
 #   (e['venue_location']['PointCategory']['categories_names'])
for e in list(cur):
    try: 
        categs.append(e['venue_location']['PointCategory']['categories_names'])
    except:
        pass
    

set(categs)

{'Advertising Agency',
 'Airport',
 'Airport Gate',
 'Airport Lounge',
 'Airport Service',
 'Airport Terminal',
 'Airport Tram',
 'American Restaurant',
 'Arcade',
 'Automotive Shop',
 'Bagel Shop',
 'Bakery',
 'Bank',
 'Bar',
 'Baseball Field',
 'Basketball Court',
 'Beach Bar',
 'Beer Bar',
 'Beer Garden',
 'Bookstore',
 'Bowling Alley',
 'Breakfast Spot',
 'Brewery',
 'Building',
 'Burger Joint',
 'Business Center',
 'Business Service',
 'Café',
 'Campaign Office',
 'Child Care Service',
 'Church',
 'City Hall',
 'Cocktail Bar',
 'Coffee Shop',
 'College Rec Center',
 'Conference Room',
 'Convention Center',
 'Corporate Cafeteria',
 'Corporate Coffee Shop',
 'Coworking Space',
 'Daycare',
 'Distillery',
 'Dive Bar',
 "Doctor's Office",
 'Dog Run',
 'Donut Shop',
 'Elementary School',
 'Event Space',
 'Financial or Legal Service',
 'Food Court',
 'Gastropub',
 'Gay Bar',
 'General College & University',
 'General Entertainment',
 'German Restaurant',
 'Go Kart Track',
 'Government Bu

# Use the `$near` operator to query the `categories_names` within the `office_coords` specified `radius`

In [5]:
# Count how many airports and heliports are near the queried office

In [6]:
# Query a cursor only with offices 
offices = list(col.find({"venue_location.PointCategory":"business office"},
                        {"venue_location.GeoPoint":1}))


In [7]:
def exploreNeighborhood(office_coords, specific_category, radius):
    """
    Query an specific category name
    INPUT 
     - A [long,latt] array
     - A single categoy to use as filter @ the `venue_location.PointCategory.categories_names` key
     - A Radius
    OUTPUT
     - A sum of how many venues fit the specific category given that `coords` and `radius`
    """
    cursor = col.find({"venue_location.GeoPoint":  {
                                        "$near": { 
                                            "$geometry": {
                                                  "type": "Point" ,
                                                # Array or coords here
                                                  "coordinates": office_coords
                                            },
                                            # Specify the maximum radius of the query
                                            "$maxDistance":radius,
                                        }
                            },
                        # Specify the correct 'categories_names' values
                        'venue_location.PointCategory.categories_names': specific_category,
                        },
                        # Project only the `categories_names` values
                        {"venue_location.PointCategory.categories_names":1}
                           )
    
    #print(cursor)
    # Loop the resulting cursor, and try to return the category found
    try:
        for venue in cursor:
            yield venue['venue_location']['PointCategory']['categories_names']
    except:
        return []


specific_category = 'Elementary School'
list(exploreNeighborhood([ -95.712891, 37.09024 ], specific_category, 10000))

['Elementary School']

# Query all `specific_categories` from a `filter_category`, using an arbitraty `array coordinates` variable

In [8]:
def evaluateNeighborhood(office_coords, filter_categories, radius):
    """
    INPUT:
       - An array with coordinates
    OUTPUT:
       - A new Array with the number of occurrences of such `specific_category` within the radius
    """
    for specific_category in filter_categories:
        query = list(exploreNeighborhood(office_coords, specific_category, radius))
        #print(f" ~ evaluateNeighborhood found: {query.count(specific_category)} '{specific_category}'")
        yield query.count(specific_category)
        
        #print(venues)
        #for specific_category in filter_categories:
         #       if venues.count(specific_category) > 0:


kids_venues = ['Daycare', 'Preschool', 'Elementary School']
filter_categories = kids_venues
list(evaluateNeighborhood([ -95.712891, 37.09024 ], filter_categories, 100000))

[0, 1, 8]

# Turn all the coordinates in the database into rows in a panda dataframe

In [9]:
def findOfficeCoords(document):
    """
    INPUT: 
     - A MongoDB object which has a GeoPoint
    OUTPUT:
     - The Coordinates for the object's GeoPoint
    """
    return document['venue_location']['GeoPoint']['coordinates']

In [10]:
# Take a list of offices which made some good money
candidate_locations = list(col.find({"funding_rounds.raised_amount": {"$gt":1000000}},
                                    {"venue_location.GeoPoint":1}))

# Example of one of the results of the query
print(candidate_locations[0]['venue_location']['GeoPoint']['coordinates'])

# Use a mapping function to re-shape the results of our previous query 
def gc(cursor):
    for document in cursor:
        #yield( str(document['venue_location']['GeoPoint']['coordinates']))     # For a `str` of a `array_coords`
        yield( document['venue_location']['GeoPoint']['coordinates'] )         # For an actual `array` of coors

[-73.985506, 40.757929]


# Make a decent dataframe to work with

At this point, the keys should be changed to include the correctkeys

In [11]:
df = pd.DataFrame(gc(
    candidate_locations)
                 ).drop_duplicates(
    ).reset_index(
    ).drop(columns='index')
dfx = df.copy()
dfx.head()

Unnamed: 0,0,1
0,-73.985506,40.757929
1,-111.9035,33.8171
2,-122.323895,37.566879
3,-122.398599,37.798853
4,-118.445243,34.047312


# Try to count venues of certain categories around a GeoPoint

In [12]:
# Loop the specific categories
for specific_category in filter_categories:
    
    # Make a place-holder column for them
    dfx[specific_category] = np.arange(len(dfx[0]))
    
    # loop that column's row, and sum how many of venues have a `categories_names` in the `specific_categories`
    for i in np.arange(len(dfx[0])):
        dfx[specific_category][i] = sum(
            list(evaluateNeighborhood([dfx[0][i], dfx[1][i]], [specific_category], 3000)))
        #  print(list(evaluateNeighborhood([dfx[0][0], dfx[1][0]], filter_categories, 100000)))
dfx.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,Daycare,Preschool,Elementary School
0,-73.985506,40.757929,7,0,7
1,-111.9035,33.8171,0,0,0
2,-122.323895,37.566879,0,10,0
3,-122.398599,37.798853,8,6,0
4,-118.445243,34.047312,0,0,0


# The `BigPicture`

Integrate all possible `filter_categories`, alog with their `radius` into a single dict

In [13]:
# Define the specific_categories each in a groupped Array
coffee_venues = ['Coffee Shop', 'Café']
transport_venues = ['Heliport', "Airport Terminal", "Airport"]
nightlife_venues = ['Bar', 'Beer Garden']
kids_venues = ['Daycare', 'Preschool', 'Elementary School']

# Then set the `Big Picture`: Which `categories` are matched to the maximum radius
BigPicture = {'coffee_venues':    (['Café', 'Coffee Shop', 'Bagel Shop', 'Bakery',
                                   'Corporate Cafeteria', 'Corporate Coffee Shop'], 200),
              
              'airport_venues':   (['Airport', 'Airport Terminal'] ,        10000),
              
              'nightlife_venues': (['Bar', 'Beach Bar',  'Beer Bar',  'Beer Garden', 'Bowling Alley',
                                    'Brewery', 'Burger Joint', 'Cocktail Bar', 'Arcade', 'Distillery',
                                    'Dive Bar','Whisky Bar' 'Sake Bar','Pub', 'Piercing Parlor',
                                    'Other Nightlife', 'Movie Theater', 'Night Market', 'Nightclub',
                                    'Lounge', 'Gastropub', 'Tiki Bar'] ,      1000),
              
              'kids_venues':      (['Daycare', 'School', 'Preschool', 'Pool',
                                    'Pool Hall', 'Playground', 'Park', 'High School',
                                    'Elementary School', 'Museum']  ,        1000),
              
              'basketball_court': (['Basketball Court'], 1500),
              
              'business_perks':   (['Travel Lounge', 'Tech Startup',  'Meeting Room',
                                     'Event Space', 'Business Center', 'Business Service', 
                                     'Conference Room',  'Convention Center',  'Coworking Space',
                                      'Advertising Agency', 'Office'], 1500 ),
              
              'lifestyle_perks':  (['Pizza Place', 'Sandwich Place',  'Donut Shop', 'Juice Bar', 
                                    'Salon / Barbershop' ] , 500)
             }

In [14]:
# Loop the specific categories
dfz = df.copy()


# Aggregate the occurrences of each filter_categories

In [15]:
# What are our BigPicture keys called?
keys = {'basketball_court', 'coffee_venues', 'airport_venues',
        'kids_venues', 'nightlife_venues',  'business_perks','lifestyle_perks'} 

for key in keys:
    #Open the door, unpack your bag
    filter_categories = BigPicture[key][0]
    radius = BigPicture[key][1]
    
    for specific_category in filter_categories:

        # Make a place-holder column for the filter_categories bin
        dfz[key] = np.arange(len(dfz[0]))

        # loop that column's row, and sum how many of venues have a `categories_names` in the `specific_categories`
        for i in np.arange(len(dfz[0])):
            dfz[key][i] = sum(
                list(evaluateNeighborhood([dfz[0][i], dfz[1][i]], filter_categories, radius)))
            #  print(list(evaluateNeighborhood([dfx[0][0], dfx[1][0]], filter_categories, 100000)))
dfz

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,0,1,nightlife_venues,coffee_venues,kids_venues,business_perks,airport_venues,basketball_court,lifestyle_perks
0,-73.985506,40.757929,0,0,0,42,14,0,0
1,-111.9035,33.8171,0,0,0,0,3,0,0
2,-122.323895,37.566879,3,0,5,0,41,0,0
3,-122.398599,37.798853,0,0,0,30,0,0,0
4,-118.445243,34.047312,0,0,3,3,0,0,0
5,-122.161523,37.446823,3,0,6,15,2,0,0
6,-118.405418,34.069849,3,0,0,0,0,0,3
7,-122.32547,37.564538,3,0,5,0,41,0,0
8,34.7595,32.0554,0,0,0,0,0,0,0
9,-118.243425,34.052187,3,0,0,0,0,0,0


# Our top 5 options 🏁️
Use the `pd.DataFrame.sort_values()` method to specify minimum requirements in the best order possible

In [16]:
sol = dfz.sort_values(by = ['airport_venues', 'coffee_venues', 'kids_venues',
                     'basketball_court','business_perks'], ascending=False).head()

In [17]:
sol.to_csv('OUTPUT/my-top-locations.csv')
sol

Unnamed: 0,0,1,nightlife_venues,coffee_venues,kids_venues,business_perks,airport_venues,basketball_court,lifestyle_perks
2,-122.323895,37.566879,3,0,5,0,41,0,0
7,-122.32547,37.564538,3,0,5,0,41,0,0
19,-122.384349,37.665648,0,0,0,0,40,0,0
0,-73.985506,40.757929,0,0,0,42,14,0,0
1,-111.9035,33.8171,0,0,0,0,3,0,0


Just by looking at our top 5 selections, we can see that there's a pattern around the -110/30 and the -120/40 coordinates

# Playground:

### With the top results, assign a `top` value to return the mean coordinate point of the desired ranking.

### Set `top=1` to receive the best-ranked point.


In [18]:
import folium
top = 3

# We take the mean of the `top` results:
location = [sol[1].head(top).mean(),sol[0].head(top).mean()]

m = folium.Map(location=location, zoom_start=12              )

folium.Marker(location, 
              popup=f'<i>This was the result of your ponderation. You chose <b>the mean of the top {top} results</b>.</i>', 
              tooltip='Click me!'
             ).add_to(m)
m

In [19]:
 # The mean latitude and longitude of our solution

In [20]:
#Dont look below... this is old code that i dont want to get rid of just yet.

In [21]:
0/0
for specific_category in filter_categories:

        # Make a place-holder column for the filter_categories bin
        dfz[key] = np.arange(len(dfz[0]))

        # loop that column's row, and sum how many of venues have a `categories_names` in the `specific_categories`
        for i in np.arange(len(dfz[0])):
            dfz[key][i] = sum(
                list(evaluateNeighborhood([dfz[0][i], dfz[1][i]], filter_categories, 10000)))
            #  print(list(evaluateNeighborhood([dfx[0][0], dfx[1][0]], filter_categories, 100000)))
dfz

ZeroDivisionError: division by zero

In [None]:
0/0

In [None]:
# Add the necessary columns
keys = {'Airport', 'Coffee', 'Basketball'}    # These should be the `filter_categories`
for key in keys:
    dfx[key] = df.copy()
dfx.head(3)

In [None]:
#Avoid double querying by creating a list of coordinates which have already been evaluated
"""
evaluated = []                
for document in offices:      
    office_coords = document['venue_location']['GeoPoint']['coordinates']
    if office_coords in evaluated:
        pass
    else:
        print(f"\nEvaluating this office location: {office_coords}")
        evaluateNeighborhood(office_coords, filter_categories, 10000)
        evaluated.append(office_coords)
        
"""

In [None]:
# INPUT a cursor
# OUTPUT : choose a GeoPoint or a `coords_array`
#getcoords = lambda cur : cur['venue_location']                              # For the complete GeoPoint
#getcoords = lambda cur : cur['venue_location']['GeoPoint']['coordinates']    # For the coords_array
candidate_locations2 = map(gc, candidate_locations)


candidate_locations3 = []
for coords_array in list(candidate_locations2):
    candidate_locations3.append(str(e))
    #candidate_locations3.append(e)

    
df = pd.DataFrame(list(candidate_locations3))
df.head()
# This results in a good dataframe we can query on, using pymongo

In [None]:
# A list of places where we can build on
candidate_locations = list(col.find({"funding_rounds.raised_amount": {"$gt":1000000}},
              {"venue_location.GeoPoint":1}))
#getcoords = lambda cur : cur['venue_location']['GeoPoint']['coordinates']
getcoords = lambda cur : cur['venue_location']#['GeoPoint']['coordinates']
candidate_locations2 = map(getcoords, candidate_locations)

candidate_locations3 = []
for e in list(candidate_locations2):
    #candidate_locations3.append(str(e))
    candidate_locations3.append(e)

    
df = pd.DataFrame(list(candidate_locations3))
df.head(3)
# This results in a good dataframe we can query on, using pymongo

In [None]:
#Cleaning the data, some of the locations were duplicated.
df = df.drop_duplicates()
df = df.reset_index()
df = df.drop(columns='index')
df.head(3)

In [None]:
#Copy the dataframe
df2 = df.copy()

In [None]:
# Make multiple columns with the coordinates as their values
keys = {'Airport', 'Coffee', 'Basketball'}
for key in keys:
    df2[key] = df.copy()
df2.head(3)

In [None]:
#Loop the coordinates
for key in keys:
    for i in range(len(df2.GeoPoint[:3])):
        print(df2[key][i])

# Refactor MongoDB functions to pe applied on pandas columns

In [None]:
def venue_counter(BigPicture):
    for filter_categories, radius in BigPicture:
        for specific_category in filter_categories:
            print(specific_category, radius)

venue_counter(BigPicture)    
# Accessed each specific_category and it's maximum radius allowed

In [None]:
def venue_counter2(llarray, BigPicture):
    print(f"Querying this coordinates {llarray}")
    
    for filter_categories, radius in BigPicture:    
        print(f"Querying this filter_categories {filter_categories}")        
        matches = list(col.find({"venue_location.GeoPoint":  {
                                            "$near": { 
                                                "$geometry": 
                                                     # {"type": "Point" ,
                                                    #  "coordinates":
                                                    llarray    #}
                                                ,
                                                # Specify the maximum radius of the query
                                                "$maxDistance":radius,
                                            }
                                },
                            # Specify the correct 'categories_names' values
                            'venue_location.PointCategory.categories_names': {"$in": filter_categories},
                            },
                            # Project only the `categories_names` values
                            {"venue_location.PointCategory.categories_names":1}
                        ))
        
        venue_list = []
        for venue in matches:
        #    for specific_category in filter_categories:
            venue_list.append(venue['venue_location']['PointCategory']['categories_names'])
            #yield venue['venue_location']['PointCategory']['categories_names']
            
        yield venue_list
     #for specific_category in filter_categories:
      #      print(specific_category, radius)

#Loop the coordinates
for key in keys:
    for i in range(len(df2['GeoPoint'][:2])):
        print(df2[key][i])
        
list(venue_counter2(df2['Airport'][8], BigPicture))

In [None]:
0/0

In [None]:
def exploreNeighborhood(office_coords, filter_categories, radius):

    # Query and make a list of matches. 
    # This will include nearby venues 
    #      - which are part of the `filter_categories` list
    #      - which are in the specified radius
    matches = list(col.find({"venue_location.GeoPoint":  {
                                        "$near": { 
                                            "$geometry": {
                                                  "type": "Point" ,
                                                  "coordinates": office_coords
                                            },
                                            # Specify the maximum radius of the query
                                            "$maxDistance":radius,
                                        }
                            },
                        # Specify the correct 'categories_names' values
                        'venue_location.PointCategory.categories_names': {"$in": filter_categories},
                        },
                        # Project only the `categories_names` values
                        {"venue_location.PointCategory.categories_names":1}))
    
    print(matches)
    # Loop the resulting cursor, and try to return the category found
    for venue in matches:
            yield venue['venue_location']['PointCategory']['categories_names']


In [None]:
fn = lambda office_coords : evaluateNeighborhood(office_coords, BigPicture)
df2.Airport.map(fn)

In [None]:
# Avoid double querying by creating a list of coordinates which have already been evaluated
evaluated = []                
#for document in offices:      
 #   office_coords = document['venue_location']['GeoPoint']['coordinates']
    #if office_coords in evaluated:
    #    pass
    #else:
        print(f"\nEvaluating this office location: {office_coords}")
        evaluateNeighborhood(office_coords, BigPicture)
        evaluated.append(office_coords)

In [None]:
0/0


In [None]:
# Queary a cursor only with offices 
offices = list(col.find({"venue_location.PointCategory":"business office"},
                        {"venue_location.GeoPoint":1}).limit(10))

# Define the specific categories in an Array
filter_categories = ['Coffee Shop', 'Heliport', "Airport Terminal"]

def evaluateOfficeNeighborhood(document, filter_categories, radius):
    # Take office-document's coordinates
    print(f"\nEvaluating this office location: {document['venue_location']['GeoPoint']['coordinates']}")
    office_coords = document['venue_location']['GeoPoint']['coordinates']

    # Perform a query that finds nearby venues which are part of the `filter_categories`
    matches = list(col.find({"venue_location.GeoPoint":  {
                                        "$near": { 
                                            "$geometry": {
                                                  "type": "Point" ,
                                                  "coordinates": office_coords
                                            },
                                            "$maxDistance":radius,
                                           
                                        }
                                        },
                        'venue_location.PointCategory.categories_names': {"$in": filter_categories},
                        },
                        {"venue_location.PointCategory.categories_names":1}))
    
    # Loop the resulting cursor, and try to return the category found
    for venue in matches:
            yield venue['venue_location']['PointCategory']['categories_names']
            
for document in offices:
    venues = list(evaluateOfficeNeighborhood(document, filter_categories, 50000))
    #print(venues)
    
    for var in filter_categories:
        if venues.count(var) > 0:
            print(f" There are {venues.count(var)} '{var}'")