Running this whole workbook will give you the full 2019 dataset. But it might take weeks for a full run ;) <br>
Try reducing the number of datetime query

## Libraries

In [1]:
import requests
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from datetime import date, timedelta
from shapely import wkt
from shapely.ops import nearest_points
import geopandas as gpd
import copy

## Datetime to query

In [2]:
# Generating list of dates 2019
sdate = date(2019,1,1)
#edate = date(2019,1,2)
edate = date(2020,1,1)  
dates_2019 = pd.date_range(sdate,edate-timedelta(days=1),freq='d')
dates_2019 = [str(x)[:10] for x in dates_2019]

# Generating list of hours 00 to 23
#military_time = ['00','01']
military_time = np.arange(0,24,1)
military_time = [('0'+str(x))[-2:] for x in military_time]

# 15-min interval
#minutes = ['00']
minutes = ['00', '15','30', '45']

all_queries = []
for i in dates_2019:
    for hour in military_time:
        for minute in minutes:
            all_queries.append(i + 'T' + hour + ":" + minute + ":00")

# eg of query: "2019-01-01T20:00:00"

## Get weather data

In [3]:
def get_weather_data(query:str):
    '''Returns dictionary of JSON objects for weather data given a datetime string
    Each entry in the dictionary is the JSON response for each of the API Endpoints of
     ["air-temperature","rainfall","relative-humidity"]    
    '''
    data_sets = ["air-temperature","rainfall","relative-humidity"]
    results={}
    for measure in data_sets:
        URL = "https://api.data.gov.sg/v1/environment/"+measure
        params={'date_time':query}
        r=requests.get(URL,params=params)
        results[measure]=r.json()
    return results

In [4]:
# Querying each timestamp and writing it to individual dataset (for humidity, rainfall, temp)
df_temp = pd.DataFrame()
df_rain = pd.DataFrame()
df_humid = pd.DataFrame()

df_temp_stn = pd.DataFrame()
df_rain_stn = pd.DataFrame()
df_humid_stn = pd.DataFrame()

for query in all_queries:
    # Query data
    weather_data = get_weather_data(query)
################################            
    # For item df (temp)
    temp_stn = np.array([i['station_id'] for i in weather_data['air-temperature']['items'][0]['readings']])
    temp_val = np.array([i['value'] for i in weather_data['air-temperature']['items'][0]['readings']])
    itr_temp = pd.DataFrame({'timestamp': [query for xx in range(len(temp_stn))],
                             'station_id': temp_stn, 'value': temp_val})
    df_temp = pd.concat([df_temp, itr_temp]) # concat to item df

    # For metadata df (temp)
    temp_stn = np.array([i['id'] for i in weather_data['air-temperature']['metadata']['stations']]) # id
    temp_lat = np.array([i['location']['latitude'] for i in weather_data['air-temperature']['metadata']['stations']]) # lat
    temp_lon = np.array([i['location']['longitude'] for i in weather_data['air-temperature']['metadata']['stations']]) # lon
    itr_temp_meta = pd.DataFrame({'timestamp': [query for xx in range(len(temp_stn))],
                             'station_id': temp_stn, 'latitude': temp_lat, 'longitude': temp_lon})
    df_temp_stn = pd.concat([df_temp_stn, itr_temp_meta]) # concat to meta to df
################################     
    # For item df (rain)
    rain_stn = np.array([i['station_id'] for i in weather_data['rainfall']['items'][0]['readings']])
    rain_val = np.array([i['value'] for i in weather_data['rainfall']['items'][0]['readings']])
    itr_rain = pd.DataFrame({'timestamp': [query for xx in range(len(rain_stn))],
                             'station_id': rain_stn, 'value': rain_val})
    df_rain = pd.concat([df_rain, itr_rain]) # concat to item df

    # For metadata df (temp)
    rain_stn = np.array([i['id'] for i in weather_data['rainfall']['metadata']['stations']]) # id
    rain_lat = np.array([i['location']['latitude'] for i in weather_data['rainfall']['metadata']['stations']]) # lat
    rain_lon = np.array([i['location']['longitude'] for i in weather_data['rainfall']['metadata']['stations']]) # lon
    itr_rain_meta = pd.DataFrame({'timestamp': [query for xx in range(len(rain_stn))],
                             'station_id': rain_stn, 'latitude': rain_lat, 'longitude': rain_lon})
    df_rain_stn = pd.concat([df_rain_stn, itr_rain_meta]) # concat to meta to df
################################    
    # For item df (humid)
    humid_stn = np.array([i['station_id'] for i in weather_data['relative-humidity']['items'][0]['readings']])
    humid_val = np.array([i['value'] for i in weather_data['relative-humidity']['items'][0]['readings']])
    itr_humid = pd.DataFrame({'timestamp': [query for xx in range(len(humid_stn))],
                             'station_id': humid_stn, 'value': humid_val})
    df_humid = pd.concat([df_humid, itr_humid]) # concat to item df

    # For metadata df (humid)
    humid_stn = np.array([i['id'] for i in weather_data['relative-humidity']['metadata']['stations']]) # id
    humid_lat = np.array([i['location']['latitude'] for i in weather_data['relative-humidity']['metadata']['stations']]) # lat
    humid_lon = np.array([i['location']['longitude'] for i in weather_data['relative-humidity']['metadata']['stations']]) # lon
    itr_humid_meta = pd.DataFrame({'timestamp': [query for xx in range(len(humid_stn))],
                             'station_id': humid_stn, 'latitude': humid_lat, 'longitude': humid_lon})
    df_humid_stn = pd.concat([df_humid_stn, itr_humid_meta]) # concat to meta to df


## Read SG grid shape file

In [5]:
# Read SG grid shape file - 47*27
grids = gpd.read_file('SG_grid/SG_grids.shp')
grids['centroid'] = grids['geometry'].apply(lambda x: x.centroid) # grids get centroid
grids.head()

Unnamed: 0,grid_num,geometry,centroid
0,1.0,"POLYGON ((103.60000 1.47500, 103.61000 1.47500...",POINT (103.60500 1.47006)
1,2.0,"POLYGON ((103.61000 1.47500, 103.62000 1.47500...",POINT (103.61500 1.47006)
2,3.0,"POLYGON ((103.62000 1.47500, 103.63000 1.47500...",POINT (103.62500 1.47006)
3,4.0,"POLYGON ((103.63000 1.47500, 103.64000 1.47500...",POINT (103.63500 1.47006)
4,5.0,"POLYGON ((103.64000 1.47500, 103.65000 1.47500...",POINT (103.64500 1.47006)


## For each timestamp, match weather data and num avail taxi to each grid

In [6]:
def convert_gpd(df):
    coord = list(zip(df['longitude'], df['latitude']))
    iterim = df
    iterim["Coordinates"] = [f'POINT({str(i[0])} {str(i[1])})' for i in coord]
    
    iterim['geometry'] = iterim.Coordinates.apply(wkt.loads)
    gdf_stn = gpd.GeoDataFrame(iterim, geometry='geometry')
    gdf_stn.drop('Coordinates', inplace=True, axis=1)
    gdf_stn.reset_index(inplace=True)
    
    return gdf_stn

def near(point, gdf, pts):
    # find the nearest point and return the corresponding Place value
    nearest = gdf.geometry == nearest_points(point, pts)[1]
    return gdf[nearest].station_id.to_numpy()[0]

## Get taxi avail data
def get_taxi_data(query:str):
    '''Returns the coordinates of all taxis via the LTA API endpoint for a given datetime string
    '''
    URL = "https://api.data.gov.sg/v1/transport/taxi-availability"
    params={'date_time':query}
    r=requests.get(URL,params=params)
    return r.json()

def taxi_convert_gpd(taxi_data):
    taxi_list = taxi_data['features'][0]['geometry']["coordinates"]
    df2 = pd.DataFrame({'timestamp': [query for x in range(len(taxi_list))]})
    df2["Coordinates"] = [f'POINT({str(i[0])} {str(i[1])})' for i in taxi_list]

    df2['geometry'] = df2.Coordinates.apply(wkt.loads)
    gdf2 = gpd.GeoDataFrame(df2, geometry='geometry')
    gdf2.drop('Coordinates', inplace=True, axis=1)
    gdf2.reset_index(inplace=True)
    return gdf2

In [7]:
# For each timestamp, get weather data for each grid
final_df = pd.DataFrame()

for timestamp in all_queries:
    ## GET WEATHER DATA
    # subset for just 1 timestamp (FOR RAIN)
    ss_rain_val = df_rain[df_rain["timestamp"] == timestamp]
    ss_rain_stn = df_rain_stn[df_rain_stn['timestamp'] == timestamp]
    
    gdf_rain_stn = convert_gpd(ss_rain_stn)
    ## Getting nearest rain station for each grid
    pts3 = gdf_rain_stn.geometry.unary_union
    grids_final = copy.deepcopy(grids)
    grids_final['station_id'] = grids_final.apply(lambda row: near(row.geometry, gdf_rain_stn, pts3), axis=1)
    grids_final = grids_final.merge(ss_rain_val, on = ['station_id'], how='inner')
    grids_final.rename(columns = {'value':'rainfall'}, inplace = True)
    grids_final = grids_final.drop(['station_id', 'timestamp'], axis=1)
    
    #############################################
    # subset for just 1 timestamp (FOR TEMP)
    ss_temp_val = df_temp[df_temp["timestamp"] == timestamp]
    ss_temp_stn = df_temp_stn[df_temp_stn['timestamp'] == timestamp]
    
    gdf_temp_stn = convert_gpd(ss_temp_stn)
    ## Getting nearest rain station for each grid
    pts3 = gdf_temp_stn.geometry.unary_union
    grids_final['station_id'] = grids_final.apply(lambda row: near(row.geometry, gdf_temp_stn, pts3), axis=1)
    grids_final = grids_final.merge(ss_temp_val, on = ['station_id'], how='inner')
    grids_final.rename(columns = {'value':'air_temp'}, inplace = True)
    grids_final = grids_final.drop(['station_id', 'timestamp'], axis=1)
    
    #############################################
    # subset for just 1 timestamp (FOR HUMID)
    ss_humid_val = df_humid[df_humid["timestamp"] == timestamp]
    ss_humid_stn = df_humid_stn[df_humid_stn['timestamp'] == timestamp]
    
    gdf_humid_stn = convert_gpd(ss_humid_stn)
    ## Getting nearest rain station for each grid
    pts3 = gdf_humid_stn.geometry.unary_union
    grids_final['station_id'] = grids_final.apply(lambda row: near(row.geometry, gdf_humid_stn, pts3), axis=1)
    grids_final = grids_final.merge(ss_humid_val, on = ['station_id'], how='inner')
    grids_final.rename(columns = {'value':'humidity'}, inplace = True)
    grids_final = grids_final.drop(['station_id'], axis=1)
    
    
    ## GET TAXI AVAIL DATA
    taxi_data=get_taxi_data(query=timestamp)
    taxi_gdf = taxi_convert_gpd(taxi_data)
    ## Count how many taxis in each grid 
    grids_final["num_taxi"] = 0
    for pt in range(len(taxi_gdf)): # for each taxi
        geom = taxi_gdf['geometry'][pt]
        for plg in range(len(grids_final)):
            poly = grids_final['geometry'][plg]
            if poly.contains(geom):
                grids_final.loc[plg, "num_taxi"] += 1
                break
                
    # APPEND DATA FROM THIS TIMESTAMP TO final_df
    final_df = pd.concat([final_df, grids_final])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the

In [10]:
final_df

Unnamed: 0,grid_num,geometry,centroid,rainfall,air_temp,timestamp,humidity,num_taxi
0,1.0,"POLYGON ((103.60000 1.47500, 103.61000 1.47500...",POINT (103.60500 1.47006),0,25.3,2019-01-01T00:00:00,88.6,0
1,2.0,"POLYGON ((103.61000 1.47500, 103.62000 1.47500...",POINT (103.61500 1.47006),0,25.3,2019-01-01T00:00:00,88.6,0
2,46.0,"POLYGON ((103.60000 1.46511, 103.61000 1.46511...",POINT (103.60500 1.46017),0,25.3,2019-01-01T00:00:00,88.6,0
3,47.0,"POLYGON ((103.61000 1.46511, 103.62000 1.46511...",POINT (103.61500 1.46017),0,25.3,2019-01-01T00:00:00,88.6,0
4,48.0,"POLYGON ((103.62000 1.46511, 103.63000 1.46511...",POINT (103.62500 1.46017),0,25.3,2019-01-01T00:00:00,88.6,0
...,...,...,...,...,...,...,...,...
1210,1201.0,"POLYGON ((103.90000 1.21789, 103.91000 1.21789...",POINT (103.90500 1.21294),0,26.9,2019-01-01T01:00:00,87.9,0
1211,1202.0,"POLYGON ((103.91000 1.21789, 103.92000 1.21789...",POINT (103.91500 1.21294),0,26.9,2019-01-01T01:00:00,87.9,0
1212,1203.0,"POLYGON ((103.92000 1.21789, 103.93000 1.21789...",POINT (103.92500 1.21294),0,26.9,2019-01-01T01:00:00,87.9,0
1213,1204.0,"POLYGON ((103.93000 1.21789, 103.94000 1.21789...",POINT (103.93500 1.21294),0,26.9,2019-01-01T01:00:00,87.9,0
