In [3]:
import urllib.request as urllib2
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
import numpy as np

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

In [4]:
url = r'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page,'html.parser')
table = soup.find("table",{"class":"wikitable sortable"})
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)

In [5]:
l[1]

['1\n',
 'New York[d]\n',
 '\xa0New York\n',
 '8,398,748\n',
 '8,175,133\n',
 '+2.74%\n',
 '301.5\xa0sq\xa0mi\n',
 '780.9\xa0km2\n',
 '28,317/sq\xa0mi\n',
 '10,933/km2\n',
 '40°39′49″N 73°56′19″W\ufeff / \ufeff40.6635°N 73.9387°W\ufeff / 40.6635; -73.9387\ufeff (1 New York City)\n']

In [6]:
data = pd.DataFrame(l)
data = data.iloc[1:,1]
data['clean'] = data.apply(lambda x: x.split('[', 1)[0])

data['clean'] = data['clean'].str.strip()
cities = [x+', US' for x in data['clean'].values[1:]]
cities[:10]

['Los Angeles, US',
 'Chicago, US',
 'Houston, US',
 'Phoenix, US',
 'Philadelphia, US',
 'San Antonio, US',
 'San Diego, US',
 'Dallas, US',
 'San Jose, US',
 'Austin, US']

In [7]:
def lat_lng(address):
    geolocator = Nominatim(user_agent="explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return (latitude, longitude)

In [8]:
data = pd.DataFrame(columns = ['City', 'Latitude', 'Longitude'])

for city in cities:
    print(city)
    try:
        (lat, lng) = lat_lng(city)
        data = data.append({'City':city, 'Latitude':lat, 'Longitude':lng}, ignore_index=True)
    except:
        pass

Los Angeles, US
Chicago, US
Houston, US
Phoenix, US
Philadelphia, US
San Antonio, US
San Diego, US
Dallas, US
San Jose, US
Austin, US
Jacksonville, US
Fort Worth, US
Columbus, US
San Francisco, US
Charlotte, US
Indianapolis, US
Seattle, US
Denver, US
Washington, US
Boston, US
El Paso, US
Detroit, US
Nashville, US
Portland, US
Memphis, US
Oklahoma City, US
Las Vegas, US
Louisville, US
Baltimore, US
Milwaukee, US
Albuquerque, US
Tucson, US
Fresno, US
Mesa, US
Sacramento, US
Atlanta, US
Kansas City, US
Colorado Springs, US
Miami, US
Raleigh, US
Omaha, US
Long Beach, US
Virginia Beach, US
Oakland, US
Minneapolis, US
Tulsa, US
Arlington, US
Tampa, US
New Orleans, US
Wichita, US
Cleveland, US
Bakersfield, US
Aurora, US
Anaheim, US
Honolulu, US
Santa Ana, US
Riverside, US
Corpus Christi, US
Lexington, US
Stockton, US
Henderson, US
Saint Paul, US
St. Louis, US
Cincinnati, US
Pittsburgh, US
Greensboro, US
Anchorage, US
Plano, US
Lincoln, US
Orlando, US
Irvine, US
Newark, US
Toledo, US
Durham, U

In [9]:

data


Unnamed: 0,City,Latitude,Longitude
0,"Los Angeles, US",34.053691,-118.242767
1,"Chicago, US",41.875562,-87.624421
2,"Houston, US",29.758938,-95.367697
3,"Phoenix, US",33.448587,-112.077346
4,"Philadelphia, US",39.952724,-75.163526
5,"San Antonio, US",29.424600,-98.495141
6,"San Diego, US",32.717421,-117.162771
7,"Dallas, US",32.776272,-96.796856
8,"San Jose, US",37.336191,-121.890583
9,"Austin, US",30.271129,-97.743700


In [10]:
CLIENT_ID = '30AKSP1XNS1EGWZY2B2BFSYNMHK0AALN3VHRIZAMLX4UBORN'
CLIENT_SECRET = 'WCO30OZLB2X5LGDFU21A1OHTYEG225TC5MSNH2ZAD3QPN5H2' 
VERSION = '20180605'

In [11]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
venues = getNearbyVenues(names=data['City'],
                                   latitudes=data['Latitude'],
                                   longitudes=data['Longitude'])

In [13]:
venues.head()

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Los Angeles, US",34.053691,-118.242767,Grand Park,34.055034,-118.245179,Park
1,"Los Angeles, US",34.053691,-118.242767,Badmaash,34.051342,-118.244571,Indian Restaurant
2,"Los Angeles, US",34.053691,-118.242767,Renegade Craft Fair,34.054445,-118.244471,Arts & Crafts Store
3,"Los Angeles, US",34.053691,-118.242767,Redbird,34.050666,-118.244068,American Restaurant
4,"Los Angeles, US",34.053691,-118.242767,CVS pharmacy,34.053426,-118.242107,Pharmacy


In [14]:
print('There are {} uniques categories.'.format(len(venues['Venue Category'].unique())))

There are 404 uniques categories.


In [15]:
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
onehot['City'] = venues['City'] 

fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

onehot.head()

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,...,Vietnamese Restaurant,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,"Los Angeles, US",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Los Angeles, US",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Los Angeles, US",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Los Angeles, US",0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"Los Angeles, US",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
grouped = onehot.groupby('City').mean().reset_index()
grouped

Unnamed: 0,City,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Antique Shop,...,Vietnamese Restaurant,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,"Abilene, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.100000,0.000000,...,0.000000,0.0,0.0,0.000000,0.100000,0.000000,0.0,0.000000,0.000000,0.000000
1,"Akron, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.026316,0.000000,...,0.000000,0.0,0.0,0.026316,0.026316,0.000000,0.0,0.000000,0.000000,0.000000
2,"Albuquerque, US",0.017241,0.000000,0.0,0.0,0.00000,0.0,0.0,0.017241,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,"Alexandria, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.333333,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,"Allentown, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
5,"Amarillo, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.041667,0.000000,0.0,0.000000,0.000000,0.000000
6,"Anaheim, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.016667,0.000000,...,0.016667,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
7,"Anchorage, US",0.000000,0.042105,0.0,0.0,0.00000,0.0,0.0,0.021053,0.000000,...,0.000000,0.0,0.0,0.000000,0.021053,0.000000,0.0,0.000000,0.000000,0.000000
8,"Ann Arbor, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
9,"Arlington, US",0.000000,0.000000,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000


In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))


venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['City'] = grouped['City']

for ind in np.arange(grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Abilene, US",Pub,Museum,Seafood Restaurant,Candy Store,Deli / Bodega,Mexican Restaurant,American Restaurant,Art Gallery,Wine Bar,Miscellaneous Shop
1,"Akron, US",Bank,Bar,Art Gallery,Sandwich Place,Coffee Shop,Music Venue,Mediterranean Restaurant,Jazz Club,Brewery,Fast Food Restaurant
2,"Albuquerque, US",Brewery,Coffee Shop,Bar,Restaurant,Music Venue,Sandwich Place,Theater,Hotel,Asian Restaurant,Pizza Place
3,"Alexandria, US",Park,American Restaurant,Yoga Studio,Fast Food Restaurant,English Restaurant,Entertainment Service,Ethiopian Restaurant,Event Space,Exhibit,Fabric Shop
4,"Allentown, US",Sandwich Place,New American Restaurant,Coffee Shop,History Museum,Pharmacy,Convenience Store,Brewery,Gastropub,Donut Shop,Gaming Cafe


In [19]:
kclusters = 10

grouped_clustering = grouped.drop('City', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)
kmeans.labels_[0:10]

array([1, 1, 1, 2, 1, 1, 0, 1, 2, 8], dtype=int32)

In [20]:
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

merged = data
merged = merged.join(venues_sorted.set_index('City'), on='City')

merged.head()

Unnamed: 0,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Los Angeles, US",34.053691,-118.242767,1.0,Sushi Restaurant,Speakeasy,Noodle House,American Restaurant,Coffee Shop,Museum,Breakfast Spot,Theater,Sandwich Place,Sculpture Garden
1,"Chicago, US",41.875562,-87.624421,1.0,Coffee Shop,Sandwich Place,Bookstore,Bakery,Pizza Place,Donut Shop,Pub,Museum,Hotel,Fountain
2,"Houston, US",29.758938,-95.367697,1.0,Hotel,Park,Burger Joint,Sandwich Place,Coffee Shop,Theater,Concert Hall,Italian Restaurant,Performing Arts Venue,Deli / Bodega
3,"Phoenix, US",33.448587,-112.077346,1.0,American Restaurant,Coffee Shop,Pizza Place,Music Venue,Hotel,Mexican Restaurant,Juice Bar,Pub,Salon / Barbershop,Cocktail Bar
4,"Philadelphia, US",39.952724,-75.163526,1.0,Hotel,Coffee Shop,Clothing Store,Bakery,Chinese Restaurant,American Restaurant,Salad Place,Pizza Place,Seafood Restaurant,Arts & Crafts Store


In [21]:
map_clusters = folium.Map(location=[41.479014,-101.9245357], zoom_start=4)


x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['City'], merged['Cluster Labels']):
    try :
        label = folium.Popup(str(poi) + ' Cluster ' + str(int(cluster)), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster)-1],
            fill=True,
            fill_color=rainbow[int(cluster)-1],
            fill_opacity=0.7).add_to(map_clusters)
    except:
        pass
       
map_clusters