## Description of the Data:
#### The following data is required to answer the issues of the problem:
1. List of boroughs and neighborhoods of the two cities with their geodata (latitude and longitude)
2. List of restaurants and shops with their address location in these two cities.
3. List of the places of interest for tourist with their location and communication areas.
4. Venues for each neighbourhood.

In [12]:
import pandas as pd
import urllib
import urllib.request
from bs4 import BeautifulSoup

#### Get info related to Delhi, India

In [24]:
#Get districts in Delhi
url ='https://www.mapsofindia.com/pincode/india/delhi/'
mycolumns = ['District', 'State']
df_del_districts= pd.DataFrame(columns=mycolumns)

soup = BeautifulSoup(
       urllib.request.urlopen( url ).read(),
       'html.parser'
       )

mytable = soup.findAll('table')[0]
rows = mytable.find_all('tr')

for row in rows:
    line = ''
    cells = row.find_all('td')
    
    # no cells in header (<th>!!)
    if cells:
        A = cells[0].get_text().strip()
        B = cells[1].get_text().strip()
        df_del_districts.loc[len(df_del_districts)] = [ A, B]
        
df_del_districts = df_del_districts[df_del_districts['District'] != 'District']
print(df_del_districts.shape)
df_del_districts.head()

Unnamed: 0,District,State
1,Central Delhi,Delhi
2,East Delhi,Delhi
3,New Delhi,Delhi
4,North Delhi,Delhi
5,North West delhi,Delhi


In [49]:
#Get all location info for all districts in Delhi
mycolumns = ['Location', 'Pincode', 'State', 'District']
df_del_neighborhoods= pd.DataFrame(columns=mycolumns)
url_main ='https://www.mapsofindia.com/pincode/india/delhi/'

for dist in df_del_districts['District']:
    url =url_main + dist.replace(" ", "-") + "/"
    #print(url)
    soup = BeautifulSoup(
           urllib.request.urlopen( url ).read(),
           'html.parser'
           )

    mytable = soup.findAll('table')[0]
    rows = mytable.find_all('tr')

    for row in rows:
        line = ''
        cells = row.find_all('td')

        # no cells in header (<th>!!)
        if cells:
            A = cells[0].get_text().strip()
            B = cells[1].get_text().strip()
            C = cells[2].get_text().strip()
            D = cells[3].get_text().strip()
            df_del_neighborhoods.loc[len(df_del_neighborhoods)] = [ A, B, C, D]

df_del_neighborhoods = df_del_neighborhoods[df_del_neighborhoods['Location'] != 'Location']
print(df_del_neighborhoods.shape)
df_del_neighborhoods.to_csv("Delhi_Neighborhoods.csv")
df_del_neighborhoods.head()

(530, 4)


In [54]:
#Group same pincode as one record
filepath = "/resources/labs/DP0701EN/Delhi_Neighborhoods.csv"
df_del_neighborhoods = pd.read_csv(filepath)
df_del_neighborhoods=df_del_neighborhoods.groupby(['Pincode', 'District']).agg(lambda x:','.join(set(x)))
df_del_neighborhoods.reset_index(drop=False, inplace=True)
print(df_del_neighborhoods.shape)
df_del_neighborhoods.head()

Unnamed: 0,Pincode,District,Location,State
0,110001,Central Delhi,"Baroda House,Krishi Bhawan,Janpath,Constitutio...",Delhi
1,110001,New Delhi,"New Delhi,New Delhi.",Delhi
2,110002,Central Delhi,"I.P.estate,Indraprastha,Gandhi Smarak nidhi,Ra...",Delhi
3,110003,Central Delhi,"Delhi High court,Pandara Road",Delhi
4,110003,South Delhi,"C G o complex,Aliganj,Golf Links,Safdarjung Ai...",Delhi


In [58]:
from geopy.geocoders import Nominatim
address = '11001, New Delhi, IN'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

address = '11001,Central Delhi, IN'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))


The geograpical coordinate of 11001, New Delhi, IN are 28.5896967, 77.1689101.
The geograpical coordinate of 11001,Central Delhi, IN are 28.65035795, 77.1846307277688.


#### Get information realated to Kolkata, India

In [46]:
#Get neighborhoods in Kolkata
mycolumns = ['Location', 'Pincode', 'State', 'District']
df_kol_neighborhoods= pd.DataFrame(columns=mycolumns)
url ='https://www.mapsofindia.com/pincode/india/west-bengal/kolkata/'

soup = BeautifulSoup(
       urllib.request.urlopen( url ).read(),
       'html.parser'
       )

mytable = soup.findAll('table')[0]
rows = mytable.find_all('tr')

for row in rows:
    line = ''
    cells = row.find_all('td')
    
    # no cells in header (<th>!!)
    if cells:
        A = cells[0].get_text().strip()
        B = cells[1].get_text().strip()
        C = cells[2].get_text().strip()
        D = cells[3].get_text().strip()
        df_kol_neighborhoods.loc[len(df_kol_neighborhoods)] = [ A, B, C, D]

df_kol_neighborhoods = df_kol_neighborhoods[df_kol_neighborhoods['Location'] != 'Location']
print(df_kol_neighborhoods.shape)
df_kol_neighborhoods.to_csv("Kolkata_Neighborhoods.csv")
df_kol_neighborhoods.head()

(178, 4)


Unnamed: 0,Location,Pincode,State,District
1,A.J.c.bose road,700020,West Bengal,Kolkata
2,Abinash Chaowdhury lane,700046,West Bengal,Kolkata
3,Alipore,700027,West Bengal,Kolkata
4,Alipore Bodyguard line,700027,West Bengal,Kolkata
5,Alipore Civil court,700027,West Bengal,Kolkata


In [55]:
#Group same pincode as one record
filepath = "/resources/labs/DP0701EN/Kolkata_Neighborhoods.csv"
df_kol_neighborhoods = pd.read_csv(filepath)
df_kol_neighborhoods=df_kol_neighborhoods.groupby(['Pincode', 'District']).agg(lambda x:','.join(set(x)))
df_kol_neighborhoods.reset_index(drop=False, inplace=True)
print(df_kol_neighborhoods.shape)
df_kol_neighborhoods.head()

Unnamed: 0,Pincode,District,Location,State
0,700001,Kolkata,"Kolkatta.,Writer's Building,Council House stre...",West Bengal
1,700007,Kolkata,"Barabazar,College Square,Strand Road",West Bengal
2,700008,Kolkata,Barisha,West Bengal
3,700009,Kolkata,"Parsibagan,Raja Ram mohan sarani",West Bengal
4,700010,Kolkata,"Joramandir,Beleghata,Subhash Sarabor",West Bengal


In [57]:
address = '700001, Kolkata, IN'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

address = '700007,Kolkata, IN'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))


The geograpical coordinate of 700001, Kolkata, IN are 22.5677459, 88.3476023.
The geograpical coordinate of 700007,Kolkata, IN are 22.5567375, 88.3504629.


In [104]:
#now, using Geocoder and Google API, we get the Latitude and Longitude of each Pincode and District
#!pip install geocoder
import geocoder
GOOGLE_API_KEY='AIzaSyAQWqMTOcyLBRDR2skO4F_5QEWzNDOlUHw'

#function to get latitude and longitude
def get_latlng(postalcode, district):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, {}, IN'.format(postalcode, district), key=GOOGLE_API_KEY)
        lat_lng_coords = g.latlng
    return lat_lng_coords

#Get latitude and logitude into dataframe
df_kol_coords = pd.DataFrame(columns=['Pincode', 'District', 'Latitude', 'Longitude'])
for postalcode, district in zip(df_kol_neighborhoods['Pincode'],df_kol_neighborhoods['District']):   
    coords = get_latlng(postalcode, district)
    df_kol_coords.loc[len(df_kol_coords)] = [postalcode, district, coords[0], coords[1]]

print(df_kol_coords.shape)
df_kol_coords.head()

(62, 4)


Unnamed: 0,Pincode,District,Latitude,Longitude
0,700001,Kolkata,22.573613,88.348341
1,700007,Kolkata,22.582557,88.361703
2,700008,Kolkata,22.481168,88.313455
3,700009,Kolkata,22.576675,88.373545
4,700010,Kolkata,22.563025,88.396257


In [105]:
df_kol_neighborhoods = df_kol_neighborhoods.merge(df_kol_coords, how='left', left_on=['Pincode', 'District'], right_on=['Pincode', 'District'])
df_kol_neighborhoods.head()

Unnamed: 0,Pincode,District,Location,State,Latitude,Longitude
0,700001,Kolkata,"Kolkatta.,Writer's Building,Council House stre...",West Bengal,22.573613,88.348341
1,700007,Kolkata,"Barabazar,College Square,Strand Road",West Bengal,22.582557,88.361703
2,700008,Kolkata,Barisha,West Bengal,22.481168,88.313455
3,700009,Kolkata,"Parsibagan,Raja Ram mohan sarani",West Bengal,22.576675,88.373545
4,700010,Kolkata,"Joramandir,Beleghata,Subhash Sarabor",West Bengal,22.563025,88.396257


In [106]:
print(df_kol_neighborhoods.shape)
df_kol_neighborhoods.to_csv("Kolkata_Neighborhoods_Coords.csv")

(62, 6)


In [107]:
#Get latitude and logitude into dataframe
df_del_coords = pd.DataFrame(columns=['Pincode', 'District', 'Latitude', 'Longitude'])
for postalcode, district in zip(df_del_neighborhoods['Pincode'],df_del_neighborhoods['District']):   
    coords = get_latlng(postalcode, district)
    df_del_coords.loc[len(df_del_coords)] = [postalcode, district, coords[0], coords[1]]

print(df_del_coords.shape)
df_del_coords.head()

(98, 4)


Unnamed: 0,Pincode,District,Latitude,Longitude
0,110001,Central Delhi,28.632743,77.219597
1,110001,New Delhi,28.632743,77.219597
2,110002,Central Delhi,28.63522,77.246886
3,110003,Central Delhi,28.591647,77.231786
4,110003,South Delhi,28.57096,77.224224


In [108]:
df_del_neighborhoods = df_del_neighborhoods.merge(df_del_coords, how='left', left_on=['Pincode', 'District'], right_on=['Pincode', 'District'])
df_del_neighborhoods.head()

Unnamed: 0,Pincode,District,Location,State,Latitude,Longitude
0,110001,Central Delhi,"Baroda House,Krishi Bhawan,Janpath,Constitutio...",Delhi,28.632743,77.219597
1,110001,New Delhi,"New Delhi,New Delhi.",Delhi,28.632743,77.219597
2,110002,Central Delhi,"I.P.estate,Indraprastha,Gandhi Smarak nidhi,Ra...",Delhi,28.63522,77.246886
3,110003,Central Delhi,"Delhi High court,Pandara Road",Delhi,28.591647,77.231786
4,110003,South Delhi,"C G o complex,Aliganj,Golf Links,Safdarjung Ai...",Delhi,28.57096,77.224224


In [110]:
print(df_del_neighborhoods.shape)
df_del_neighborhoods.to_csv("Delhi_Neighborhoods_Coords.csv")

(98, 6)


#### Create a map of Delhi with neighborhoods superimposed on top.

In [114]:
from geopy.geocoders import Nominatim
import folium

address = 'Delhi, IN'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of New York using latitude and longitude values
map_del = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_del_neighborhoods['Latitude'], df_del_neighborhoods['Longitude'], df_del_neighborhoods['District'], df_del_neighborhoods['Pincode']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_del)  
    
map_del

#### Create a map of Kolkata with neighborhoods superimposed on top.

In [115]:
from geopy.geocoders import Nominatim
import folium

address = 'Kolkata, IN'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of New York using latitude and longitude values
map_kol = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_kol_neighborhoods['Latitude'], df_kol_neighborhoods['Longitude'], df_kol_neighborhoods['District'], df_kol_neighborhoods['Pincode']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_kol)  
    
map_kol

#### The data will be used as follows:
1. Use Foursquare and geopy data to map top venues for Delhi and Kolkata neighborhoods and clustered in groups.
2. Use Foursquare and geopy data to map the location of different hot places, separately and on top of the above clustered map in order to be able to identify the venues and amenities near each metro station, or explore each subway location separately.
3. Analysis of these data and find out some best places to start.
4. Create a map around a radius each neighborhoods to visualise the data for better understanding.


#### Explore the first neighborhood in Delhi dataframe

In [121]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

#Define Foursquare Credentials and Version
CLIENT_ID = 'Q4C2Y4VTVUVQRHVUHWTPEYPN3AI42UEFGTDZILUA2FN201VY' # your Foursquare ID
CLIENT_SECRET = 'G3QBEK1G2W3TSMDCNZF0WM3RNOOP2C5KTPQAZDX5R1G5CS3X' # your Foursquare Secret
VERSION = '20190615'

#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = df_del_neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_del_neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = df_del_neighborhoods.loc[0, 'Pincode'], df_del_neighborhoods.loc[0, 'District'] # neighborhood postal code, district

#get the top 100 venues that are within a radius of 500 meters
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

#Send the GET request and examine the resutls
results = requests.get(url).json()

#borrow the get_category_type function from the Foursquare lab.
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#clean the json and structure it into a pandas dataframe
venues = results['response']['groups'][0]['items']    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
print('{} venues were returned by Foursquare for {}.'.format(nearby_venues.shape[0], neighborhood_name))
nearby_venues.head()

61 venues were returned by Foursquare for (110001, 'Central Delhi').


Unnamed: 0,name,categories,lat,lng
0,Connaught Place | कनॉट प्लेस (Connaught Place),Plaza,28.632731,77.220018
1,Wenger's,Bakery,28.633412,77.218292
2,Starbucks Coffee,Coffee Shop,28.632011,77.217731
3,Khan Chacha | खान चाचा | خان چاچا,Indian Restaurant,28.634202,77.22078
4,Naturals Ice Cream,Ice Cream Shop,28.634455,77.222139


#### Explore the first neighborhood in Kolkata dataframe

In [122]:
#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = df_kol_neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_kol_neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = df_kol_neighborhoods.loc[0, 'Pincode'], df_kol_neighborhoods.loc[0, 'District'] # neighborhood postal code, district

#get the top 100 venues that are within a radius of 500 meters
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

#Send the GET request and examine the resutls
results = requests.get(url).json()

#borrow the get_category_type function from the Foursquare lab.
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#clean the json and structure it into a pandas dataframe
venues = results['response']['groups'][0]['items']    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
print('{} venues were returned by Foursquare for {}.'.format(nearby_venues.shape[0], neighborhood_name))
nearby_venues.head()

4 venues were returned by Foursquare for (700001, 'Kolkata').


Unnamed: 0,name,categories,lat,lng
0,Millennium Park,Park,22.572481,88.344517
1,Kona Dukan Tea Stock Exchange,Tea Room,22.574053,88.349822
2,Lal Dighi,Lake,22.571873,88.349154
3,Fairlie Ghat,Boat or Ferry,22.575727,88.345134
