# Battle of neighborhoods - getting data and coding

In [None]:
#Import stats packages
import numpy as np 
import pandas as pd 
from sklearn.cluster import KMeans

#Import HTML/JSON
import requests 
from bs4 import BeautifulSoup
import json 
from pandas.io.json import json_normalize 

#Plot
import matplotlib.cm as cm
import matplotlib.colors as colors


#Geo
from geopy.geocoders import Nominatim 
import geocoder
import folium

## Getting Data and Merging with coordinates

In [None]:
# Getting data: scrape the list of Chennai neighbourhoods from Wikipedia.
URL = "https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Chennai"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
r = requests.get(URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html5lib')
# Keep the text of every <li> element from index 11 onward.
# NOTE(review): the offset 11 presumably skips list items that precede the
# neighbourhood entries; it is tied to the page layout, so re-check it if the
# page changes.
neighborhood = [row.text for row in soup.find_all('li')[11:]]
    

In [None]:
# Wrap the scraped names in a single-column frame, then load the prepared
# table that carries the neighbourhoods together with their coordinates.
df = pd.DataFrame(neighborhood, columns=["Neighborhood"])
df2 = pd.read_csv('df2.csv')

## Map of Neighborhoods

In [104]:
# Create a map of Chennai centred on the given latitude and longitude values.
map_ch = folium.Map(location=[13.0801721, 80.2838331], zoom_start=11)

# Add one circle marker per row of df2. The loop variable is deliberately not
# called `neighborhood`: that name holds the scraped list of names, and
# rebinding it here would leave a plain string behind and break any later
# re-run of the cell that builds `df` from it.
for lat, lng, name in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood']):
    # parse_html=True makes folium treat the text as plain text, not raw HTML.
    popup = folium.Popup(str(name), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ch)

# Last expression of the cell: renders the map.
map_ch

## Merging Data with the Foursquare API

In [None]:
# Using code from coursera API classes/ref github
# NOTE(review): CLIENT_ID, CLIENT_SECRET and VERSION are not defined in any
# cell visible here; they must be set before this cell runs (preferably read
# from environment variables rather than hard-coded in the notebook).

radius = 1000
LIMIT = 100
venues = []
# The loop variable is called `name`, not `neighborhood`, so the scraped list
# stored under `neighborhood` is not overwritten by a single string.
for lat, lng, name in zip(df2['Latitude'], df2['Longitude'], df2['Neighborhood']):

    # Foursquare "explore" endpoint. Passing `params` lets requests URL-encode
    # every value instead of pasting them into the URL by hand.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(lat, lng),
            "radius": radius,
            "limit": LIMIT,
        },
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail with a clear HTTP error (bad credentials, quota, ...) rather than a
    # KeyError further down when the expected keys are missing.
    response.raise_for_status()
    results = response.json()

    # A response without any group contributes no venues for this row.
    groups = results.get("response", {}).get("groups") or []
    items = groups[0].get("items", []) if groups else []
    for item in items:
        venue = item['venue']
        # Some venues may come back without a category; record None for them
        # instead of raising IndexError on categories[0].
        categories = venue.get('categories') or []
        venues.append((
            name,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [None]:
# Build the venues table with its column names supplied at construction time.
# Assigning `df3.columns` afterwards raises a length-mismatch error when
# `venues` is empty; this form yields a correctly labelled empty frame instead.
df3 = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'Name', 'Lat', 'Long', 'Cat'],
)

In [None]:
# (rows, columns) of the venues table; each collected venue tuple is one row.
df3.shape

## Analyzing Data

In [106]:
# Count the non-null entries per column for each neighbourhood and preview
# the first five groups.
df3.groupby("Neighborhood").count().head()

Unnamed: 0_level_0,Latitude,Longitude,Name,Lat,Long,Cat
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alapakkam,8,8,8,8,8,8
Alwarthirunagar,15,15,15,15,15,15
Ambattur,12,12,12,12,12,12
Aminjikarai,18,18,18,18,18,18
Anna Nagar,70,70,70,70,70,70


In [105]:
# Show the distinct values of the 'Cat' column, i.e. the venue category names
# collected from the API responses.
df3['Cat'].unique()

array(['Indian Restaurant', 'Multiplex', 'Bus Station',
       'Department Store', 'Electronics Store', 'Fried Chicken Joint',
       'Dessert Shop', 'Accessories Store', 'Bakery', 'Ice Cream Shop',
       'ATM', 'Train Station', 'Boutique', 'Indie Movie Theater',
       'Vegetarian / Vegan Restaurant', 'Chinese Restaurant',
       'Farmers Market', 'Currency Exchange', 'Beach', 'Harbor / Marina',
       'Mobile Phone Shop', 'Platform', 'Coffee Shop', 'Scenic Lookout',
       'Park', "Women's Store", 'Food & Drink Shop', 'Farm', 'Bookstore',
       'Bike Rental / Bike Share', 'Restaurant', 'Video Store', 'Market',
       'Convenience Store', 'Snack Place', 'Gym', 'Food Truck',
       'Fast Food Restaurant', 'Health & Beauty Service',
       'Business Service', 'Miscellaneous Shop',
       'South Indian Restaurant', 'Pharmacy', 'IT Services', 'Bus Stop',
       'Museum', 'Hotel', 'Italian Restaurant', 'Pizza Place', 'Café',
       'Sandwich Place', 'Diner', 'Juice Bar', 'Yoga Studio',
 

## Clustering Model

In [135]:
# Load the feature table produced after preprocessing and transforming.
df4 = pd.read_csv('tran.csv')
# Drop the name column by keyword. The positional axis form `drop([...], 1)`
# was deprecated and is rejected by pandas 2.x.
X = df4.drop(columns=["Neighborhood"])


clusterNum = 5
# random_state is fixed so the cluster ids (and therefore the map colours and
# the per-cluster tables below) are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12, random_state=0)
k_means.fit(X)
# One integer label per row of X, in row order.
labels = k_means.labels_
print(labels)

[4 0 0 4 3 0 0 0 4 4 4 0 4 4 4 4 4 4 4 4 4 0 4 0 4 4 4 4 4 0 0 4 1 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 0 0 4 4 0 4 4 4 4 4 0 4 4 4 4 4 0 4 4 0 4 4 2 4 4 4
 0 4 4 0 4 0 2 4 4 4 4 4 4 4 2 0 4 4 4 4 4]


In [148]:
# Add clustering labels as the first column of df4.
# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when run a second time; overwrite the column in that case.
if 'Cluster Labels' in df4.columns:
    df4['Cluster Labels'] = labels
else:
    df4.insert(0, 'Cluster Labels', labels)
df5 = pd.read_csv('meta.csv')

In [149]:
# Join df5 (loaded from meta.csv) with the labelled feature table on the
# shared 'Neighborhood' column.
ch_merged = df5.merge(df4, on='Neighborhood')

### Visual View of the clusters

In [150]:
# Create the base map.
map_clusters = folium.Map(location=[13.0801721, 80.2838331], zoom_start=11)

# Colour scheme: one hex colour per cluster, evenly spaced along the rainbow
# colormap. (The earlier `x` / `ys` arrays were only used for their length,
# which equals clusterNum, and `markers_colors` was never read.)
colors_array = cm.rainbow(np.linspace(0, 1, clusterNum))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per row of the merged table.
for lat, lon, poi, cluster in zip(ch_merged['Latitude'], ch_merged['Longitude'], ch_merged['Neighborhood'], ch_merged['Cluster Labels']):
    # KMeans labels start at 0, so the label indexes the palette directly.
    # `cluster - 1` sent cluster 0 to index -1, i.e. the last colour.
    marker_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map.
map_clusters

In [152]:
# Per-cluster average of every numeric column. numeric_only=True keeps text
# columns such as 'Neighborhood' out of the aggregation; pandas 2.x no longer
# drops them automatically and raises a TypeError instead.
ch_merged.groupby('Cluster Labels').mean(numeric_only=True)

Unnamed: 0_level_0,Latitude,Longitude,Accessories_Store,American_Restaurant,Amphitheater,Andhra_Restaurant,Antique_Shop,Arcade,Asian_Restaurant,ATM,...,Tea_Room,Tennis_Stadium,Thai_Restaurant,Theme_Park,Train_Station,Vegetarian___Vegan_Restaurant,Video_Game_Store,Video_Store,Women_s_Store,Yoga_Studio
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13.064119,80.206926,0.0,0.0,0.0,0.05,0.05,0.05,0.4,0.0,...,0.05,0.0,0.0,0.05,0.3,0.9,0.0,0.1,0.0,0.0
1,13.05251,80.24176,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13.073453,80.220493,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0.0,0.0,0.333333
3,13.08359,80.2102,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
4,13.110497,80.200305,0.028571,0.0,0.0,0.0,0.0,0.0,0.042857,0.3,...,0.0,0.0,0.014286,0.0,0.214286,0.057143,0.014286,0.028571,0.014286,0.0


## Final Cluster


In [172]:
# Load the table explicitly: with the read commented out, `cl4` only existed
# if it was left over in the kernel, so this cell failed on a fresh run.
# NOTE(review): 'ana.csv' is taken from the previously commented-out line;
# confirm it is the intended input file.
cl4 = pd.read_csv('ana.csv')
# Average the numeric columns per category; numeric_only=True avoids a
# TypeError on any text column under pandas 2.x.
cl4.groupby('Cat').mean(numeric_only=True)

Unnamed: 0_level_0,count
Cat,Unnamed: 1_level_1
American Restaurant,1
Asian Restaurant,1
BBQ Joint,1
Bakery,1
Bistro,1
Bookstore,1
Burger Joint,1
Cafe,2
Chinese Restaurant,5
Clothing Store,3
