# IBM Developer Skills Network

## Capstone project for applied data science

- The problem to be solved in this project is to find a place to rent in Seoul, capital city of South Korea.

- The selection criteria are the rent price, the desired location, and the transportation, entertainment, services, food and sport centers available nearby.
- A database of all buildings, apartments and houses in Seoul can be found at the link below: https://www.juso.go.kr/addrlink/addressBuildDevNew.do?menu=match

- This database contains the detailed location of every building, apartment and house in Seoul, together with X, Y coordinates.
- Because the full data set is very large, only 1000 randomly extracted rows are used in this analysis (the notebook then samples 200 of those rows).

- K-Means clustering and the minimum Minkowski distance will be used to group buildings and apartments and to match them to a customer profile.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json  # handle json file

from geopy.geocoders import Nominatim  # convert address to lat and long
import requests  # handle request url
from pandas.io.json import json_normalize  # transform json to dataframe

import matplotlib.cm as cm  # color map handling
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
import folium  # map rendering

## Load the Seoul rent file

In [None]:
path = 'https://raw.githubusercontent.com/sonpn82/Coursera_Capstone/master/Seoul_rent_1000.csv'

seoul_data = pd.read_csv(path, encoding='cp949')
# Fixed seed: the clusters derived later depend on which rows are drawn here,
# so an unseeded sample would give different results on every run.
seoul_data = seoul_data.sample(n=200, random_state=42)
seoul_data.shape

In [None]:
# Show the dtype pandas assigned to each column of seoul_data
seoul_data.dtypes

In [None]:
## Remove the columns that are not used anywhere in the analysis
unused_columns = [
    'CityCode', 'Exit', 'CodeT', 'RoadCode', 'UnderGround',
    'BuildingNo', 'BuildingNoEx', 'UseType', 'Separate', 'WardEx',
]
seoul_data.drop(columns=unused_columns, inplace=True)
seoul_data.head()

In [None]:
# Build the neighborhood key as "<BuildingName>_<PostalCode>"
postal_code_text = seoul_data['PostalCode'].astype(str)
seoul_data['Neighborhood'] = seoul_data['BuildingName'].add('_').add(postal_code_text)
seoul_data.head()

In [None]:
# Count the distinct neighborhood keys (compare with the row count to check uniqueness)
len(set(seoul_data['Neighborhood']))

In [None]:
# Plot every sampled building to see how they are spread across Seoul
import folium

latitude = 37.5665
longitude = 126.9780
map_seoul = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per building; the popup shows the building name
building_names = seoul_data['BuildingName'].astype(str)
for lat, lng, building_name in zip(seoul_data['Latitude'], seoul_data['Longitude'], building_names):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(building_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_seoul)
map_seoul

In [None]:
## Define Foursquare Credentials and Version
# Raw string: the backslashes are Windows path separators, not escape sequences
path = r'G:\Google Drive\FoursquareToken.csv'
df_token = pd.read_csv(path)

# The second column of rows 0, 1 and 2 holds the id, the secret and the token.
# Single positional lookup (row, column) instead of chained iloc[...][...].
CLIENT_ID = df_token.iloc[0, 1]
CLIENT_SECRET = df_token.iloc[1, 1]
ACCESS_TOKEN = df_token.iloc[2, 1]
VERSION = '20210526'
LIMIT = 100 # default 4square API limit value

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):  # get venues for all neighborhoods
    """Collect the venues Foursquare reports around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Neighborhood labels and their coordinates, iterated in lockstep.
    radius : int, optional
        Value sent as the ``radius`` query parameter
        (presumably metres -- confirm against the Foursquare API docs).

    Returns
    -------
    pandas.DataFrame
        One row per (neighborhood, venue) pair with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty, but keeps these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # a timeout keeps one stalled request from hanging the whole loop
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # a venue without any category cannot be used downstream
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # passing the columns here keeps the header even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Fetch the nearby venues of every sampled building
seoul_venues = getNearbyVenues(
    seoul_data['Neighborhood'],
    seoul_data['Latitude'],
    seoul_data['Longitude'],
)

In [None]:
# Preview the first rows of the venue table returned by getNearbyVenues
seoul_venues.head()

In [None]:
# (rows, columns) of the venue table
seoul_venues.shape

In [None]:
# Number of distinct venue categories found across all neighborhoods
unique_category_count = len(seoul_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(unique_category_count))

## Group venue into 7 general categories

In [None]:
## Load the csv file that lists the venue categories of each general category
path = 'https://raw.githubusercontent.com/sonpn82/Coursera_Capstone/master/Seoul_venues_cat.csv'
# NA cells are replaced with 1 right after loading
seoul_venues_cat = pd.read_csv(path).fillna(1)
seoul_venues_cat.head()

In [None]:
# Add a new column
# 0 is the "no general category" marker for every row at this point
seoul_venues['General Category'] = 0
seoul_venues.head()

In [None]:
# Map every venue category to its general category.
# Each column of seoul_venues_cat is one general category and its cells are
# the venue categories that belong to it.
category_lookup = {}
for general_category, venue_categories in seoul_venues_cat.items():
    for venue_category in venue_categories:
        # a category listed in several columns takes the last column
        category_lookup[venue_category] = general_category

# Assign the whole column at once: no per-cell .loc writes, no reliance on a
# 0..n-1 index, and no strings written one by one into an integer column.
# Venue categories without a match keep the marker value 0.
seoul_venues['General Category'] = [
    category_lookup.get(venue_category, 0)
    for venue_category in seoul_venues['Venue Category']
]

seoul_venues.head()

In [None]:
# Keep only the venues that received a general category (0 means none was found)
has_general_category = seoul_venues['General Category'] != 0
seoul_venues = seoul_venues[has_general_category]

print(set(seoul_venues['General Category']))

# Analyze each neighborhood

In [None]:
# One indicator column per general category, named after the category itself
category_frame = seoul_venues[['General Category']]
seoul_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Put the neighborhood of each venue in front of the indicator columns
seoul_onehot.insert(
    loc=0,
    column='Neighborhood',
    value=seoul_venues['Neighborhood'],
    allow_duplicates=True,
)
print(seoul_onehot.shape)
seoul_onehot.head()

## Group the neighborhood

In [None]:
# Number of venues of each general category per neighborhood
category_totals = seoul_onehot.groupby('Neighborhood').sum()
seoul_grouped_sum = category_totals.reset_index()
print(seoul_grouped_sum.shape)
seoul_grouped_sum.head(10)

In [None]:
# Share of each general category among the venues of a neighborhood
category_shares = seoul_onehot.groupby('Neighborhood').mean()
seoul_grouped_mean = category_shares.reset_index()
print(seoul_grouped_mean.shape)
seoul_grouped_mean.head(10)

## Create a new dataframe and display the top 7 general categories

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (callers pass rows whose first
    field is the neighborhood name); the remaining entries are ranked in
    descending order and the index labels of the leading ones are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[:num_top_venues]

In [None]:
import numpy as np

num_top_venues = 7
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 use 'st' / 'nd' / 'rd', every later rank uses 'th'
    # (explicit test instead of catching the IndexError with a bare except)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = seoul_grouped_mean['Neighborhood']

# fill each row with the general categories ranked by their share
for ind in np.arange(seoul_grouped_mean.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(seoul_grouped_mean.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

# Clustering neighborhood

In [None]:
# Set the df for modeling: every column except the neighborhood name.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
seoul_grouped_clustering = seoul_grouped_mean.drop(columns='Neighborhood')

In [None]:
# Settings shared by every KMeans fit of the search below
kmeans_kwargs = dict(init='random', n_init=12, max_iter=300, random_state=42)

# Inertia (sum of squared errors) of the fitted model for k = 2 .. 10
sse = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(seoul_grouped_clustering)
    sse.append(kmeans.inertia_)

## Visualize the K-Means SSE for each number of clusters k

In [None]:
# Elbow plot: SSE against the number of clusters
candidate_ks = range(2, 11)
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# From above chart, k=6 seems to give reasonable accuracy
# Build model again using k=6
kclusters = 6
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts
seoul_grouped_clustering = seoul_grouped_mean.drop(columns='Neighborhood')

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(seoul_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

## Add label to neighborhood dataframe

In [None]:
# Put the fitted cluster label of each neighborhood in the first column
neighborhoods_venues_sorted.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)

In [None]:
# Attach the cluster label and the ranked categories to every building of seoul_data.
# The join is a left join on the neighborhood name, so buildings without a
# matching neighborhood get NaN in the added columns.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
seoul_merged = seoul_data.join(venues_by_neighborhood, on='Neighborhood')

seoul_merged.head() 

## Visualize the resulting cluster

In [None]:
latitude = 37.5665
longitude = 126.9780

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Buildings whose neighborhood received no cluster label carry NaN after the
# join. NaN also turns the whole column into floats, which cannot index the
# colour list, so drop those rows and cast the labels back to int.
clustered = seoul_merged.dropna(subset=['Cluster Labels'])
cluster_ids = clustered['Cluster Labels'].astype(int)

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], cluster_ids):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # labels run from 0 to kclusters-1, so they index the colour list directly
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine clusters

## Cluster 1 - Service hub and sight-seeing lovers
### You can easily find all kinds of services and stores near here!
### There are also a lot of sight-seeing & culture sites which you can visit!

In [None]:
# Buildings with cluster label 0: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 0
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

## Cluster 2 - Sport enthusiasts, pro or amateur!
### Heaven for sport lovers

In [None]:
# Buildings with cluster label 1: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 1
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

## Cluster 3 - Walk around & play!
### Sight-seeing, culture activities, historical site, entertainment all in one!

In [None]:
# Buildings with cluster label 2: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 2
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

## Cluster 4 - All-rounder
### A lot of sight-seeing places + convenient transportation + sport centers + service centers: you can find them all here!

In [None]:
# Buildings with cluster label 3: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 3
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

## Cluster 5 - Sport lovers and sightseeing goers
### A lot of sport centers & sightseeing area close by!

In [None]:
# Buildings with cluster label 4: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 4
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

## Cluster 6 - Transportation hub and a convenient life
### Bus stations, train stations and metro close by
### Also a lot of services and stores ==> what a convenient life!

In [None]:
# Buildings with cluster label 5: show column 1 plus every column from position 5 onward
in_cluster = seoul_merged['Cluster Labels'] == 5
shown_columns = seoul_merged.columns[[1] + list(range(5, seoul_merged.shape[1]))]
seoul_merged.loc[in_cluster, shown_columns]

# Example for choosing a living place in Seoul

In [None]:
# Example 1
# A sport lover with a budget of 2000$/month who wants to live in 강북구
cus_group = 2      # cluster number as used in the headings (1-based)
cus_rent = 2000    # maximum rent, $/month
cus_loc = '강북구'  # desired district

# Cluster labels are 0-based, hence cus_group - 1
same_cluster = seoul_merged['Cluster Labels'] == (cus_group - 1)
affordable = seoul_merged['RentPrice'] <= cus_rent
in_district = seoul_merged['District'] == cus_loc
cus_data = seoul_merged[same_cluster & affordable & in_district]

print('Number of place = ',len(cus_data))
cus_data.head(100)

## Show these places on the map

In [None]:
# Latitude values of the places selected for the customer
cus_data['Latitude']

In [None]:
# Draw the places selected for the customer on a map of Seoul
import folium

latitude, longitude = 37.5665, 126.9780
map_cus = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per selected building; the popup shows the building name
marker_rows = zip(cus_data['Latitude'], cus_data['Longitude'], cus_data['BuildingName'].astype(str))
for lat, lng, building_name in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(building_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_cus)
map_cus

In [None]:
# Example 2
# A customer who loves sight-seeing & entertainment, with a budget of 1500$/month, who wants to live in 강서구
cus_group = 3      # cluster number as used in the headings (1-based)
cus_rent = 1500    # maximum rent, $/month
cus_loc = '강서구'  # desired district

# Cluster labels are 0-based, hence cus_group - 1
same_cluster = seoul_merged['Cluster Labels'] == (cus_group - 1)
affordable = seoul_merged['RentPrice'] <= cus_rent
in_district = seoul_merged['District'] == cus_loc
cus_data = seoul_merged[same_cluster & affordable & in_district]

print('Number of place = ',len(cus_data))
cus_data.head(100)

In [None]:
# Draw the places selected for the second example on a map of Seoul
import folium

latitude, longitude = 37.5665, 126.9780
map_cus = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per selected building; the popup shows the building name
marker_rows = zip(cus_data['Latitude'], cus_data['Longitude'], cus_data['BuildingName'].astype(str))
for lat, lng, building_name in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(building_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_cus)
map_cus

# Customized profile selection

## For customers who do not fall into any of 6 above groups

## A customized profile will be built

In [None]:
# Example customer manual profile 1
# Interest level per general category; the levels are divided by their total
# before use, so only their relative sizes matter.

lv_food = 100              # food and drink
lv_sightSeeing = 100       # sight-seeing and culture
lv_service = 80            # service and shop
lv_sport = 40              # sport and leisure
lv_entertain = 20          # entertainment
lv_lodging = 10            # lodging
lv_transport = 80          # transportation

cus_rent = 2000    # max 2000$/month

In [None]:
# Turn the interest levels into shares that add up to 1.
# NOTE(review): the order below must match the column order of the frame the
# profile is compared with (presumably the general categories in alphabetical
# order) -- confirm against its columns.
levels = [lv_entertain, lv_food, lv_lodging, lv_service, lv_sightSeeing, lv_sport, lv_transport]
lv_total = sum(levels)

# Customer profile setting
cus_profile = [level / lv_total for level in levels]
cus_profile

In [None]:
# Compare with seoul_grouped_mean to get a best matched
# Only the category shares are compared with the customer profile. 'Mindis' is
# dropped as well (when it exists) so that re-running this cell after a
# distance has been stored does not add an extra feature column.
seoul_grouped_dis = seoul_grouped_mean.drop(columns=['Neighborhood', 'Mindis'], errors='ignore')
seoul_grouped_dis.head()

## Calculate Minkowski distance to find similarity in profile

In [None]:
from scipy.spatial import distance

# Minkowski distance of order 1 between each neighborhood's category shares
# and the customer profile. The whole column is assigned at once, so no float
# is written cell by cell into an integer column and no 0..n-1 index is assumed.
seoul_grouped_mean['Mindis'] = [
    distance.minkowski(neighborhood_profile, cus_profile, 1)
    for neighborhood_profile in seoul_grouped_dis.to_numpy()
]

seoul_mean_merged = seoul_data

# merge seoul_mean with seoul_data to add distance for each neighborhood
seoul_mean_merged = seoul_mean_merged.join(seoul_grouped_mean.set_index('Neighborhood'), on='Neighborhood')

# sort value to get the lowest minkowski distance 
seoul_mean_sorted = seoul_mean_merged.sort_values(by = 'Mindis')

# get only row with rent price less than customer given value
cus_data = seoul_mean_sorted[seoul_mean_sorted['RentPrice']<=cus_rent]

print('Number of place = ',len(cus_data))
# keep the 10 closest matches
cus_data = cus_data.head(10)
cus_data.head()

## Show top 10 places on the map

In [None]:
# Draw the best-matching places for the custom profile on a map of Seoul
import folium

latitude, longitude = 37.5665, 126.9780
map_cus = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per selected building; the popup shows the building name
marker_rows = zip(cus_data['Latitude'], cus_data['Longitude'], cus_data['BuildingName'].astype(str))
for lat, lng, building_name in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(building_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_cus)
map_cus

In [None]:
# Example customer manual profile 2
# Interest level per general category; the levels are divided by their total
# before use, so only their relative sizes matter.

lv_food = 100              # food and drink
lv_sightSeeing = 20       # sight-seeing and culture
lv_service = 20            # service and shop
lv_sport = 100              # sport and leisure
lv_entertain = 100          # entertainment
lv_lodging = 100            # lodging
lv_transport = 20          # transportation

cus_rent = 2000    # max 2000$/month

In [None]:
# Turn the interest levels into shares that add up to 1.
# NOTE(review): the order below must match the column order of the frame the
# profile is compared with (presumably the general categories in alphabetical
# order) -- confirm against its columns.
levels = [lv_entertain, lv_food, lv_lodging, lv_service, lv_sightSeeing, lv_sport, lv_transport]
lv_total = sum(levels)

# Customer profile setting
cus_profile = [level / lv_total for level in levels]
cus_profile

In [None]:
from scipy.spatial import distance

# Minkowski distance of order 1 between each neighborhood's category shares
# and the customer profile. The whole column is assigned at once, so no float
# is written cell by cell into an integer column and no 0..n-1 index is assumed.
seoul_grouped_mean['Mindis'] = [
    distance.minkowski(neighborhood_profile, cus_profile, 1)
    for neighborhood_profile in seoul_grouped_dis.to_numpy()
]

seoul_mean_merged = seoul_data

# merge seoul_mean with seoul_data to add distance for each neighborhood
seoul_mean_merged = seoul_mean_merged.join(seoul_grouped_mean.set_index('Neighborhood'), on='Neighborhood')

# sort value to get the lowest minkowski distance 
seoul_mean_sorted = seoul_mean_merged.sort_values(by = 'Mindis')

# get only row with rent price less than customer given value
cus_data = seoul_mean_sorted[seoul_mean_sorted['RentPrice']<=cus_rent]

print('Number of place = ',len(cus_data))
# keep the 10 closest matches
cus_data = cus_data.head(10)
cus_data.head()

## Show the top 10 places on the map

In [None]:
# Draw the best-matching places for the second custom profile on a map of Seoul
import folium

latitude, longitude = 37.5665, 126.9780
map_cus = folium.Map(location=[latitude, longitude], zoom_start=11)

# one blue circle per selected building; the popup shows the building name
marker_rows = zip(cus_data['Latitude'], cus_data['Longitude'], cus_data['BuildingName'].astype(str))
for lat, lng, building_name in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(building_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_cus)
map_cus