# Coursera Data Science Capstone

### Phase 1 -- Import necessary libraries

In [1]:
import numpy as np 

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import requests
from pandas.io.json import json_normalize


import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium

!pip install lxml
import lxml

import math
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    numpy-1.18.1               |   py36h95a1406_0         5.2 MB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        

### Phase 2 -- Import HTML table of largest US cities and convert to Pandas Dataframe

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population"
table = pd.read_html(URL)
Cities = pd.DataFrame(table[4])

In [3]:
Cities.head()
del Cities["2010Census"]
del Cities["Change"]
del Cities["2016 land area.1"]
del Cities["2016 population density.1"]

In [4]:
#Need to strip extraneous characters off the city names and filter only the top
#50 results

new_col=[]
  
for row in Cities["City"]:
    mod = row.split('[', 1)[0].lower().rstrip()
    new_col.append(mod)

Cities["City2"] = new_col
del Cities["City"]
Top_Cities = Cities.head(50)

In [None]:
Top_Cities

### Phase 3 -- Get true latitude, longitude values for each city and filter cities East of St. Louis

In [5]:
geolocator = Nominatim(user_agent="us_explorer3")
lat = []
lng = []

for cit,ste in zip(Top_Cities["City2"],Top_Cities["State[c]"]):
    address = cit + ", " + ste
    print(address)
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    lat.append(latitude)
    lng.append(longitude)


new york, New York
los angeles, California
chicago, Illinois
houston, Texas
phoenix, Arizona
philadelphia, Pennsylvania
san antonio, Texas
san diego, California
dallas, Texas
san jose, California
austin, Texas
jacksonville, Florida
fort worth, Texas
columbus, Ohio
san francisco, California
charlotte, North Carolina
indianapolis, Indiana
seattle, Washington
denver, Colorado
washington, District of Columbia
boston, Massachusetts
el paso, Texas
detroit, Michigan
nashville, Tennessee
portland, Oregon
memphis, Tennessee
oklahoma city, Oklahoma
las vegas, Nevada
louisville, Kentucky
baltimore, Maryland
milwaukee, Wisconsin
albuquerque, New Mexico
tucson, Arizona
fresno, California
mesa, Arizona
sacramento, California
atlanta, Georgia
kansas city, Missouri
colorado springs, Colorado
miami, Florida
raleigh, North Carolina
omaha, Nebraska
long beach, California
virginia beach, Virginia
oakland, California
minneapolis, Minnesota
tulsa, Oklahoma
arlington, Texas
tampa, Florida
new orleans, Louisi

In [6]:
Top_Cities["Latitude"]=lat
Top_Cities["Longitude"]=lng


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
#Filter Cities in Dataframe that are east of St. Louis
STL_LNG = -90.19789
Eastern_Cities = Top_Cities[Top_Cities['Longitude'] > STL_LNG].reset_index(drop=True)

In [None]:
Eastern_Cities

### Phase 4 -- Define access credentials for FourSquare

In [8]:
CLIENT_ID = 'HXMZ0YDCJX0BFJVHVYWXLO55KG3P5EKU1BDOUR41TGKDZ51T' # your Foursquare ID
CLIENT_SECRET = '3AZ40DHZYT3OKS12VF4X0KUBVS5JU1BUXLXXNGUQULE2U5GU' # your Foursquare Secret
VERSION = '20191230' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HXMZ0YDCJX0BFJVHVYWXLO55KG3P5EKU1BDOUR41TGKDZ51T
CLIENT_SECRET:3AZ40DHZYT3OKS12VF4X0KUBVS5JU1BUXLXXNGUQULE2U5GU


In [9]:
#Strip out decimals and "sq mile" from land area
area=[]
for row in Eastern_Cities['2016 land area']:
    mod = row.split('.', 1)[0].lower().rstrip()
    mod = float(mod)
    area.append(mod)

Eastern_Cities['2016 land area'] = area


In [10]:
#Find the radius in miles and convert it to meters
def SqMileRadius(area):
    r = (math.sqrt(area/math.pi))*1609
    return r

In [11]:
#Declare variables that will be used in every call to FourSquare
LIMIT = 300

#Used for URL to filter venues with the "Chiropractor" CategoryID.  More
#can be found here: https://developer.foursquare.com/docs/resources/categories
search_string = '52e81612bcbc57f1066b7a3a'

### Phase 5 -- Iterate through each city and get the count of the number of venues for the category defined above

In [12]:
#Define a function to iterate through a list of cities
def find_venues (city_lat, city_lng, radius):
    #Create request URL
    url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        city_lat,
        city_lng,
        radius,
        LIMIT,
        search_string)
    
    
    #Grab results as a JSON from FourSquare
    results = requests.get(url).json()
    
    #Flatten JSON to create dataframe of results
    venues = results['response']['venues']
    venueDF = json_normalize(venues)
    
    #Get the count of venues matching the category ID and return it
    venue_count = len(venueDF.index)
    
    return venue_count

In [13]:
#Create a column of total count of venues for each city in the Eastern_Cities
#Dataframe
venue_column = []
for lt,lg,sqmi in zip(Eastern_Cities['Latitude'],Eastern_Cities['Longitude'],Eastern_Cities['2016 land area']):
    rad = SqMileRadius(sqmi)
    vcount = find_venues(lt,lg,rad)
    venue_column.append(vcount)
    


In [14]:
#Append venue counts to original Eastern_Cities dataframe
Eastern_Cities['Venue Count'] = venue_column
Eastern_Cities.head()

Unnamed: 0,2018rank,State[c],2018estimate,2016 land area,2016 population density,Location,City2,Latitude,Longitude,Venue Count
0,1,New York,8398748,301.0,"28,317/sq mi",40°39′49″N 73°56′19″W﻿ / ﻿40.6635°N 73.9387°W,new york,40.712728,-74.006015,50
1,3,Illinois,2705994,227.0,"11,900/sq mi",41°50′15″N 87°40′54″W﻿ / ﻿41.8376°N 87.6818°W,chicago,41.875562,-87.624421,50
2,6,Pennsylvania,1584138,134.0,"11,683/sq mi",40°00′34″N 75°08′00″W﻿ / ﻿40.0094°N 75.1333°W,philadelphia,39.952724,-75.163526,38
3,12,Florida,903889,747.0,"1,178/sq mi",30°20′13″N 81°39′42″W﻿ / ﻿30.3369°N 81.6616°W,jacksonville,30.332184,-81.655651,45
4,14,Ohio,892533,218.0,"3,936/sq mi",39°59′07″N 82°59′05″W﻿ / ﻿39.9852°N 82.9848°W,columbus,39.96226,-83.000707,41


### Phase 6 -- Cleanup dataframe

In [15]:
del Eastern_Cities["2018rank"]
del Eastern_Cities["Location"]
Eastern_Cities.rename(columns={'City2': 'City', 'State[c]': 'State', '2018estimate':'Population'}, inplace=True)


In [16]:
Eastern_Cities = Eastern_Cities[['City','State','Population','Latitude','Longitude','Venue Count','2016 land area','2016 population density']]

In [18]:
Eastern_Cities

Unnamed: 0,City,State,Population,Latitude,Longitude,Venue Count,2016 land area,2016 population density
0,new york,New York,8398748,40.712728,-74.006015,50,301.0,"28,317/sq mi"
1,chicago,Illinois,2705994,41.875562,-87.624421,50,227.0,"11,900/sq mi"
2,philadelphia,Pennsylvania,1584138,39.952724,-75.163526,38,134.0,"11,683/sq mi"
3,jacksonville,Florida,903889,30.332184,-81.655651,45,747.0,"1,178/sq mi"
4,columbus,Ohio,892533,39.96226,-83.000707,41,218.0,"3,936/sq mi"
5,charlotte,North Carolina,872498,35.227087,-80.843127,44,305.0,"2,757/sq mi"
6,indianapolis,Indiana,867125,39.768333,-86.15835,40,361.0,"2,366/sq mi"
7,washington,District of Columbia,702455,38.894893,-77.036553,44,61.0,"11,148/sq mi"
8,boston,Massachusetts,694583,42.360253,-71.058291,45,48.0,"13,938/sq mi"
9,detroit,Michigan,672662,42.331551,-83.04664,32,138.0,"4,847/sq mi"


### Phase 7 -- Mark cities on map

In [19]:
# create map of the Eastern US using latitude and longitude values
us_lat = 36.1658333
us_lng = -86.7844444

map_us = folium.Map(location=[us_lat, us_lng], zoom_start=4)

# add markers to map
for lat, lng, cnt, pop, city, state in zip(Eastern_Cities['Latitude'], Eastern_Cities['Longitude'], Eastern_Cities['Venue Count'],Eastern_Cities['Population'],Eastern_Cities['City'],Eastern_Cities['State']):
    label = '{}, {}: Total Venues {}, Total Population {}'.format(city, state, cnt, pop)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_us)  
    
map_us