# Capstone project


## Importing the required libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bsoup
import re
import folium
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ny_explorer")
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

## Scrapping the data

In [2]:
url = "https://geographic.org/streetview/india/maharashtra/konkan/mumbai.html"
code = requests.get(url)

### Extracting data

page=bsoup(code.text,"html5lib")
zips = []
for tag in page.find_all("li"):
    zips.append([" ".join(tag.text.split()[:-3]),tag.text.split()[-3], tag.text.split()[-2].split(",")[0], tag.text.split()[-1]])

zip_df = pd.DataFrame(zips, columns = ["Neighborhood", "Zip", "Latitude", "Longitude"])

In [3]:
zip_df

Unnamed: 0,Neighborhood,Zip,Latitude,Longitude
0,Aareymilk Colony,400065,19.162898,72.88367
1,Agripada,400011,18.975302,72.824897
2,Airport,400099,19.095696,72.855633
3,A I Staff Colony,400029,19.176062,72.944793
4,Ambewadi,400004,18.955627,72.821715
...,...,...,...,...
234,Worli,400018,19.000633,72.816812
235,Worli Colony,400030,19.006054,72.821421
236,Worli Naka,400018,18.984683,72.819052
237,Worli Police Camp,400030,19.005591,72.815207


In [4]:
zip_df.shape

(239, 4)

### Coordinates of Mumbai

In [5]:
address = 'Mumbai'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Mumbai are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Mumbai are 19.0759899, 72.8773928.


## Getting venue data

In [6]:
CLIENT_ID = 'client-id' # your Foursquare ID # removed after processing
CLIENT_SECRET = 'client-secret' # your Foursquare Secret # removed after processing
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [8]:
mumbai_venues = getNearbyVenues(names = zip_df['Neighborhood'], latitudes = zip_df['Latitude'], longitudes = zip_df['Longitude'])

Aareymilk Colony
Agripada
Airport
A I Staff Colony
Ambewadi
Andheri
Andheri East
Andheri Railway Station
Antop Hill
Anushakti Nagar
Asvini
Audit Bhavan
Azad Nagar
Bandra
Bandra West
Bangur Nagar
BARC
Barve Nagar
Bazargate
Best Staff Colony
BEST STaff Quarters
Bhandup Complex
Bhandup East
Bhandup Ind. Estate
Bhandup West
Bharat Nagar
Bhawani Shankar
Bhawani Shankar Rd
B.N. Bhavan
Borivali
Borivali East
Borivali West
B.P.Lane
B P T Colony
Central Building
Century Mill
C G S Colony
Chakala Midc
Chamarbaug
Charkop
Charni Road
Chaupati
Chembur
Chembur Extension
Chembur Rs
Chinchbunder
Chinchpokli
Chunabhatti
Churchgate
Colaba
Colaba Bazar
Cotton Exchange
Cumballa Hill
Cumballa Sea Face
Dadar
Dadar Colony
Dahisar
Dahisar RS
Danda
Daulat Nagar
Delisle Road
Dharavi
Dharavi Road
D.M. Colony
Dockyard Road
Dr Deshmukh Marg
Falkland Road
FCI
Ghatkopar West
Girgaon
Gokhale Road
Goregaon
Goregaon East
Goregaon RS
Govandi
Government Colony
Gowalia Tank
Grant Road
Haffkin Institute
Haines Road
Haji Al

In [9]:
mumbai_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aareymilk Colony,19.162898,72.88367,Film City,19.160960,72.886177,Event Space
1,Aareymilk Colony,19.162898,72.88367,Film City Studio No. 7,19.160833,72.886534,Dance Studio
2,Aareymilk Colony,19.162898,72.88367,Cafe Mosaque,19.162695,72.887740,Café
3,Aareymilk Colony,19.162898,72.88367,Hill top,19.166893,72.881493,Mountain
4,Agripada,18.975302,72.824897,Celejor,18.975844,72.823679,Bakery
...,...,...,...,...,...,...,...
3051,Worli Sea Face,19.009143,72.815728,Royce Chocolates,19.005828,72.817707,Dessert Shop
3052,Worli Sea Face,19.009143,72.815728,Papaya,19.005762,72.817621,Sushi Restaurant
3053,Worli Sea Face,19.009143,72.815728,Palladium Ballroom,19.005695,72.817717,Hotel
3054,Worli Sea Face,19.009143,72.815728,Flora Pan Wala,19.005646,72.813764,Smoke Shop


## Transforming data

In [10]:
mum_one_hot = pd.get_dummies(mumbai_venues[['Venue Category']] , prefix="", prefix_sep="")
mum_one_hot['Neighborhood'] = mumbai_venues['Neighborhood']

In [11]:
mum_one_hot.columns

Index(['ATM', 'Accessories Store', 'Advertising Agency', 'Afghan Restaurant',
       'African Restaurant', 'Airport', 'Airport Food Court', 'Airport Lounge',
       'Airport Service', 'Airport Terminal',
       ...
       'Vegetarian / Vegan Restaurant', 'Vietnamese Restaurant',
       'Volleyball Court', 'Waterfront', 'Whisky Bar', 'Wine Bar', 'Wine Shop',
       'Women's Store', 'Yoga Studio', 'Zoo'],
      dtype='object', length=234)

In [12]:
mum_final = mum_one_hot[[mum_one_hot.columns[-1]] + list(mum_one_hot.columns[:-1])]

In [13]:
mum_final

Unnamed: 0,Zoo,ATM,Accessories Store,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Tunnel,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3051,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
mum_g = mum_final.groupby("Neighborhood").sum().reset_index()

In [15]:
mum_g['Total'] = 0
for i in range(len(mum_g)):
    mum_g['Total'][i] = sum(mum_g.iloc[i,1: ])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mum_g['Total'][i] = sum(mum_g.iloc[i,1: ])


In [16]:
mum_g = mum_g.merge(zip_df[["Latitude", "Longitude", "Neighborhood"]], on = ["Neighborhood"], how = "left")

In [17]:
mum_g

Unnamed: 0,Neighborhood,Zoo,ATM,Accessories Store,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio,Total,Latitude,Longitude
0,A I Staff Colony,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,19.176062,72.944793
1,Aareymilk Colony,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,19.162898,72.88367
2,Agripada,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,18.975302,72.824897
3,Airport,0,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,48,19.095696,72.855633
4,Ambewadi,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10,18.955627,72.821715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,Worli,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16,19.000633,72.816812
221,Worli Colony,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,19.006054,72.821421
222,Worli Naka,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9,18.984683,72.819052
223,Worli Police Camp,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,20,19.005591,72.815207


## Mapping the results

In [25]:
map_mumbai = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood, size in zip(mum_g['Latitude'], mum_g['Longitude'],  mum_g['Neighborhood'], mum_g['Total']):
    if size <= 10:
        label = '{}'.format(neighborhood)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='red',
            fill=True,
            fill_color='#dc143c',
            fill_opacity=0.5,
            parse_html=False).add_to(map_mumbai)
    elif size <= 25:
        label = '{}'.format(neighborhood)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.5,
            parse_html=False).add_to(map_mumbai)
    else:
        label = '{}'.format(neighborhood)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='green',
            fill=True,
            fill_color='#7cfc00',
            fill_opacity=0.5,
            parse_html=False).add_to(map_mumbai)
    
map_mumbai

## No of underdeveloped areas

Defining underdeveloped area as the area having less than 10 venues:

In [22]:
sum(mum_g['Total'] <= 10)

129

In [27]:
list(mum_g[mum_g['Total'] <= 10]['Neighborhood'])

['A I Staff Colony',
 'Aareymilk Colony',
 'Agripada',
 'Ambewadi',
 'Andheri',
 'Andheri East',
 'Audit Bhavan',
 'B P T Colony',
 'B.N. Bhavan',
 'BARC',
 'BEST STaff Quarters',
 'Bandra',
 'Bangur Nagar',
 'Barve Nagar',
 'Bhandup East',
 'Bhandup Ind. Estate',
 'Bhandup West',
 'Bharat Nagar',
 'Bhawani Shankar',
 'Bhawani Shankar Rd',
 'Borivali East',
 'C G S Colony',
 'Charkop',
 'Charni Road',
 'Chembur',
 'Chembur Rs',
 'Chinchbunder',
 'Chinchpokli',
 'Cotton Exchange',
 'D.M. Colony',
 'Dahisar',
 'Dahisar RS',
 'Dharavi',
 'Dharavi Road',
 'Dockyard Road',
 'FCI',
 'Falkland Road',
 'Gokhale Road',
 'Goregaon',
 'Govandi',
 'Government Colony',
 'H.M.P. School',
 'Haffkin Institute',
 'Haines Road',
 'Holiday Camp',
 'Ins Hamla',
 'J.B. Nagar',
 'J.J.Hospital',
 'J.M. Road',
 'Jacob Circle',
 'Jogeshwari East',
 'Jogeshwari West',
 'Kalachowki',
 'Kamathipura',
 'Kandivali East',
 'Ketkipada',
 'Kharodi',
 'Kidwai Nagar',
 'Kurla',
 'Kurla North',
 'Lal Baug',
 'Liberty Gar

## No of developing areas

Defining developing area as the area having more than 10 venues and less than 25 venues:

In [23]:
sum(mum_g['Total']<=25) - sum(mum_g['Total']<=10)

61

In [30]:
list(mum_g[mum_g['Total'] <= 25][mum_g['Total'] > 10]['Neighborhood'])

  list(mum_g[mum_g['Total'] <= 25][mum_g['Total'] > 10]['Neighborhood'])


['Andheri Railway Station',
 'B.P.Lane',
 'Borivali',
 'Borivali West',
 'Century Mill',
 'Chakala Midc',
 'Chamarbaug',
 'Chunabhatti',
 'Colaba',
 'Colaba Bazar',
 'Cumballa Hill',
 'Dadar',
 'Dadar Colony',
 'Danda',
 'Daulat Nagar',
 'Dr Deshmukh Marg',
 'Ghatkopar West',
 'Goregaon East',
 'Goregaon RS',
 'Gowalia Tank',
 'Grant Road',
 'Haji Ali',
 'Hanuman Road',
 'IRLA',
 'Juhu',
 'Kalbadevi',
 'Kandivali West',
 'Kapad Bazar',
 'Kherwadi',
 'M.P.T.',
 'Magthane',
 'Mandapeshwar',
 'Mandvi',
 'Masjid',
 'Mori Road',
 'Mulund Dd Road',
 'Mulund East',
 'Mulund West',
 'Mumbai Central',
 'N.S.Patkar Marg',
 'NITIE',
 'Nehru Road',
 'New Prabhadevi Road',
 'Null Bazar',
 'Parel Naka',
 'Parel Rly Work Shop',
 'Prabhadevi',
 'Rajawadi',
 'Raoli Camp',
 'S V Marg',
 'S.B. Road',
 'Sakinaka',
 'Santacruz Central',
 'Sindhi Society',
 'Town Hall',
 'Tulsiwadi',
 'V.W.T.C.',
 'Vileparle Railway Station',
 'Worli',
 'Worli Police Camp',
 'Worli Sea Face']

## No of developed areas

Defining developed area as the area having more than 25 venues:

In [24]:
sum(mum_g['Total']>25)

35

In [31]:
list(mum_g[mum_g['Total'] > 25]['Neighborhood'])

['Airport',
 'Azad Nagar',
 'Bandra West',
 'Bazargate',
 'Central Building',
 'Chaupati',
 'Churchgate',
 'Cumballa Sea Face',
 'Delisle Road',
 'Girgaon',
 'High Court Building',
 'International Airport',
 'Khar Colony',
 'Khar Delivery',
 'M A Marg',
 'Mahim',
 'Mahim Bazar',
 'Mantralaya',
 'Marine Lines',
 'Matunga',
 'Mumbai',
 'Nariman Point',
 'New Yogakshema',
 'Opera House',
 'Orlem',
 'Psm Colony',
 'Ranade Road',
 'Santacruz',
 'Santacruz',
 'Secretariate',
 'Shivaji Park',
 'Stock Exchange',
 'Tajmahal',
 'V.P. Road',
 'Vileeparle']