# IBM Developer Skills Network

## Capstone project for applied data science

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json  # handle json file

from geopy.geocoders import Nominatim  # convert address to lat and long
import requests  # handle request url
from pandas.io.json import json_normalize  # transform json to dataframe

import matplotlib.cm as cm  # color map handling
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium  # map rendering



## Load the Seoul rent file

In [3]:
path = 'https://raw.githubusercontent.com/sonpn82/Coursera_Capstone/master/Seoul_rent_1000.csv'

# The file is read with the cp949 codec (the building names shown further down are Korean text).
seoul_data = pd.read_csv(path, encoding='cp949')

# Keep a 200-row random subset. The seed is fixed so that Restart & Run All
# selects the same rows every time and the outputs below stay reproducible.
seoul_data = seoul_data.sample(200, random_state=42)
seoul_data.shape

(200, 19)

In [4]:
seoul_data.dtypes

CityCode          int64
Exit              int64
CodeT             int64
City             object
District         object
Ward             object
RoadCode        float64
Road             object
UnderGround       int64
BuildingNo        int64
BuildingNoEx      int64
BuildingName     object
PostalCode        int64
UseType          object
Separate          int64
WardEx           object
RentPrice         int64
Latitude        float64
Longitude       float64
dtype: object

In [5]:
## Define Foursquare Credentials and Version
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences (the resulting path is unchanged).
# NOTE(review): machine-specific absolute path; keeping the token file outside
# the repository is intentional, but the location must exist on the running machine.
path = r'G:\Google Drive\FoursquareToken.csv'
df_token = pd.read_csv(path)

# The credentials are taken from the second column of the first three rows.
# Explicit (row, column) positional indexing avoids the deprecated
# integer-key lookup on a label-indexed Series.
CLIENT_ID = df_token.iloc[0, 1]
CLIENT_SECRET = df_token.iloc[1, 1]
ACCESS_TOKEN = df_token.iloc[2, 1]
VERSION = '20210526'
LIMIT = 100 # default 4square API limit value

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):  # get venues for all neighborhoods
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One entry per neighborhood; they are consumed in parallel with zip().
    radius : int, default 1000
        Search radius passed to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status for any location.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Fail with a clear HTTP error instead of a KeyError on the missing payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without any category cannot be classified later on; skip it
                # rather than aborting the whole download with an IndexError.
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [8]:
# NOTE(review): the 'Longitude' column is passed as latitudes and 'Latitude' as
# longitudes. The recorded output shows "Neighborhood Latitude" values around 37.57,
# so the CSV's column labels appear to be swapped and this call compensates for it
# -- TODO confirm against the source file before changing the argument order.
seoul_venues = getNearbyVenues(names=seoul_data['BuildingName'],
                               latitudes=seoul_data['Longitude'],
                               longitudes=seoul_data['Latitude'])              

In [9]:
seoul_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,롯데캐슬 천지인,37.573936,127.016338,EVEREST (에베레스트),37.572567,127.012588,Indian Restaurant
1,롯데캐슬 천지인,37.573936,127.016338,Dongdaemoon Stationery and Toy Market (동대문문구완구거리),37.571528,127.013856,Toy / Game Store
2,롯데캐슬 천지인,37.573936,127.016338,동대문 양육관점,37.572204,127.012623,Chinese Restaurant
3,롯데캐슬 천지인,37.573936,127.016338,Yukjeon Sikdang (육전식당),37.574425,127.024394,BBQ Joint
4,롯데캐슬 천지인,37.573936,127.016338,창신동 매운족발,37.573021,127.011588,Korean Restaurant


In [63]:
seoul_venues.shape

(9495, 7)

In [14]:
# Export the distinct venue categories, one per row. They are sorted so the
# exported file is identical between runs (iterating a set gives an arbitrary order).
# NOTE(review): absolute, machine-specific output path -- adjust before running elsewhere.
venue_categories = sorted(seoul_venues['Venue Category'].dropna().unique())
pd.DataFrame(venue_categories).to_csv(r'C:\Users\hp\Downloads\Seoul_venues_cat2.csv')

## Group venues into 7 general categories

In [42]:
## Load the CSV file that contains the 7 general categories
path = 'https://raw.githubusercontent.com/sonpn82/Coursera_Capstone/master/Seoul_venues_cat.csv'
# Read the table and replace every missing cell with the placeholder 1 in one step
seoul_venues_cat = pd.read_csv(path).fillna(1)
seoul_venues_cat.head()

Unnamed: 0,Food and drink,Sports and leisure,Transport,Sightseeing and culture,Service and store,Entertainment,Lodging
0,African Restaurant,Athletics & Sports,Airport,Art Gallery,Antique Shop,Arcade,Boarding House
1,American Restaurant,Badminton Court,Airport Lounge,Art Museum,Auto Dealership,Bath House,Hostel
2,Asian Restaurant,Baseball Field,Airport Service,Arts & Crafts Store,Auto Workshop,Concert Hall,Hotel
3,Bakery,Baseball Stadium,Airport Terminal,Botanical Garden,Bagel Shop,General Entertainment,Residential Building (Apartment / Condo)
4,Bar,Basketball Court,Bridge,Cemetery,Board Shop,Indie Movie Theater,Resort


In [43]:
# Add a new column
# 0 is a placeholder meaning "no general category assigned". Rows that keep it
# end up in a one-hot column named '0', which the one-hot encoding cell removes.
seoul_venues['General Category'] = 0
seoul_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,General Category
0,롯데캐슬 천지인,37.573936,127.016338,EVEREST (에베레스트),37.572567,127.012588,Indian Restaurant,0
1,롯데캐슬 천지인,37.573936,127.016338,Dongdaemoon Stationery and Toy Market (동대문문구완구거리),37.571528,127.013856,Toy / Game Store,0
2,롯데캐슬 천지인,37.573936,127.016338,동대문 양육관점,37.572204,127.012623,Chinese Restaurant,0
3,롯데캐슬 천지인,37.573936,127.016338,Yukjeon Sikdang (육전식당),37.574425,127.024394,BBQ Joint,0
4,롯데캐슬 천지인,37.573936,127.016338,창신동 매운족발,37.573021,127.011588,Korean Restaurant,0


In [44]:
# Translate every venue's specific category into its general category.
# seoul_venues_cat holds one column per general category whose cells list the
# specific categories belonging to it. Build a lookup table once instead of
# testing each row against each column with scalar .loc reads and writes.
category_lookup = {}
for j, general_category in enumerate(seoul_venues_cat.columns.values):
    for specific_category in seoul_venues_cat.iloc[:, j].values:
        # A later column overwrites an earlier one, so when a specific category
        # is listed under several general categories the last column wins.
        category_lookup[specific_category] = general_category

matched = seoul_venues['Venue Category'].map(category_lookup)

# Rows without a match keep whatever 'General Category' already holds
# (the 0 placeholder). Assigning the whole column at once also avoids writing
# strings element-by-element into an integer column.
seoul_venues['General Category'] = matched.where(matched.notna(), seoul_venues['General Category'])

seoul_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,General Category
0,롯데캐슬 천지인,37.573936,127.016338,EVEREST (에베레스트),37.572567,127.012588,Indian Restaurant,Food and drink
1,롯데캐슬 천지인,37.573936,127.016338,Dongdaemoon Stationery and Toy Market (동대문문구완구거리),37.571528,127.013856,Toy / Game Store,Entertainment
2,롯데캐슬 천지인,37.573936,127.016338,동대문 양육관점,37.572204,127.012623,Chinese Restaurant,Food and drink
3,롯데캐슬 천지인,37.573936,127.016338,Yukjeon Sikdang (육전식당),37.574425,127.024394,BBQ Joint,Food and drink
4,롯데캐슬 천지인,37.573936,127.016338,창신동 매운족발,37.573021,127.011588,Korean Restaurant,Food and drink


In [45]:
print(set(seoul_venues['General Category']))

{'Food and drink', 'Transport', 'Service and store', 'Entertainment', 'Sightseeing and culture', 'Sports and leisure', 'Lodging'}


# Analyze each neighborhood

In [53]:
seoul_onehot = pd.get_dummies(seoul_venues[['General Category']], prefix="", prefix_sep="")

# Venues that matched no general category still carry the placeholder 0, which
# get_dummies turns into a column named '0'. Drop it when present; errors='ignore'
# makes this a no-op otherwise, without a bare except hiding unrelated failures.
seoul_onehot = seoul_onehot.drop(columns='0', errors='ignore')

# add neighborhood column back to dataframe (as the first column)
seoul_onehot.insert(0, 'Neighborhood', seoul_venues['Neighborhood'], True)
print(seoul_onehot.shape)
print(seoul_onehot.head())

(9484, 8)
  Neighborhood  Entertainment  Food and drink  Lodging  Service and store  \
0     롯데캐슬 천지인              0               1        0                  0   
1     롯데캐슬 천지인              1               0        0                  0   
2     롯데캐슬 천지인              0               1        0                  0   
3     롯데캐슬 천지인              0               1        0                  0   
4     롯데캐슬 천지인              0               1        0                  0   

   Sightseeing and culture  Sports and leisure  Transport  
0                        0                   0          0  
1                        0                   0          0  
2                        0                   0          0  
3                        0                   0          0  
4                        0                   0          0  


## Group the neighborhood

In [60]:
# Number of venues per general category, totalled for each neighborhood
seoul_grouped_sum = seoul_onehot.groupby('Neighborhood', as_index=False).sum()
print(seoul_grouped_sum.shape)
seoul_grouped_sum.head(10)

(178, 8)


Unnamed: 0,Neighborhood,Entertainment,Food and drink,Lodging,Service and store,Sightseeing and culture,Sports and leisure,Transport
0,강남빌,0,20,1,0,0,0,7
1,경신빌라,1,10,0,0,4,0,0
2,경원주택,0,16,1,0,3,0,0
3,경희궁파크,7,64,1,3,24,1,0
4,고려빌라,2,9,0,1,3,4,2
5,골드캐슬,1,26,0,0,2,0,2
6,공원빌라,0,93,0,1,5,1,0
7,광일기공,2,57,1,5,1,2,3
8,구일빌라,1,69,3,1,26,0,0
9,궁전빌라,1,79,0,3,2,2,1


In [59]:
# Share of each general category among a neighborhood's venues (mean of the one-hot flags)
seoul_grouped_mean = seoul_onehot.groupby('Neighborhood', as_index=False).mean()
print(seoul_grouped_mean.shape)
seoul_grouped_mean.head(10)

(178, 8)


Unnamed: 0,Neighborhood,Entertainment,Food and drink,Lodging,Service and store,Sightseeing and culture,Sports and leisure,Transport
0,강남빌,0.0,0.714286,0.035714,0.0,0.0,0.0,0.25
1,경신빌라,0.066667,0.666667,0.0,0.0,0.266667,0.0,0.0
2,경원주택,0.0,0.8,0.05,0.0,0.15,0.0,0.0
3,경희궁파크,0.07,0.64,0.01,0.03,0.24,0.01,0.0
4,고려빌라,0.095238,0.428571,0.0,0.047619,0.142857,0.190476,0.095238
5,골드캐슬,0.032258,0.83871,0.0,0.0,0.064516,0.0,0.064516
6,공원빌라,0.0,0.93,0.0,0.01,0.05,0.01,0.0
7,광일기공,0.028169,0.802817,0.014085,0.070423,0.014085,0.028169,0.042254
8,구일빌라,0.01,0.69,0.03,0.01,0.26,0.0,0.0
9,궁전빌라,0.011364,0.897727,0.0,0.034091,0.022727,0.022727,0.011364


## Create a new dataframe and display the top 7 general categories

In [61]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining entries are ranked
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [62]:
num_top_venues = 7
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # ranks 1-3 use their own ordinal suffix, every later rank uses 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
# Both the neighborhood names and the rankings are taken from seoul_grouped_mean
# (the only grouped frames defined in this notebook are seoul_grouped_sum and
# seoul_grouped_mean), so every row stays aligned with its neighborhood.
ranked_categories = [
    return_most_common_venues(seoul_grouped_mean.iloc[ind, :], num_top_venues)
    for ind in range(seoul_grouped_mean.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    ranked_categories, columns=columns[1:], index=seoul_grouped_mean.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', seoul_grouped_mean['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue
0,강남빌,Food and drink,Transport,Lodging,Entertainment,Service and store,Sightseeing and culture,Sports and leisure
1,경신빌라,Food and drink,Sightseeing and culture,Entertainment,Lodging,Service and store,Sports and leisure,Transport
2,경원주택,Food and drink,Sightseeing and culture,Lodging,Entertainment,Service and store,Sports and leisure,Transport
3,경희궁파크,Food and drink,Sightseeing and culture,Entertainment,Service and store,Lodging,Sports and leisure,Transport
4,고려빌라,Food and drink,Sports and leisure,Sightseeing and culture,Entertainment,Transport,Service and store,Lodging


# Clustering neighborhood