## Libraries

In [1]:
import requests
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from datetime import date, timedelta
from shapely import wkt
from shapely.ops import nearest_points
import geopandas as gpd
import copy
from tqdm import tqdm
import joblib
import re
import math
from collections import Counter

## Loading DataFrames 
TODO: change the naming convention.

In [2]:
# Month identifier used as the filename prefix of every input loaded below.
month = "11"

# Weather inputs, read as (readings, station table) pairs, one pair per variable.
df_temp = pd.read_csv(f"2019 weather data/{month}_df_temp.csv")
df_temp_stn = pd.read_csv(f"2019 weather data/{month}_df_temp_stn.csv")

df_rain = pd.read_csv(f"2019 weather data/{month}_df_rain.csv")
df_rain_stn = pd.read_csv(f"2019 weather data/{month}_df_rain_stn.csv")

df_humid = pd.read_csv(f"2019 weather data/{month}_df_humid.csv")
df_humid_stn = pd.read_csv(f"2019 weather data/{month}_df_humid_stn.csv")

# Taxi availability for the month. Despite the name, `df` is indexed by
# timestamp further down (df[timestamp]['features'][0]...), i.e. it is used as
# a mapping of API responses rather than as a DataFrame.
df = joblib.load(f"2019 taxi avail/{month}_taxi_avail.joblib")

### Read grid shape file

In [3]:
# Load the SG grid cells (47*27 according to the original note) and store each
# cell's centroid next to its polygon.
grids = gpd.read_file('SG_grid/SG_grids.shp').assign(
    centroid=lambda frame: frame['geometry'].apply(lambda cell: cell.centroid)
)
grids.head()

Unnamed: 0,grid_num,geometry,centroid
0,1.0,"POLYGON ((103.60000 1.47500, 103.61000 1.47500...",POINT (103.60500 1.47006)
1,2.0,"POLYGON ((103.61000 1.47500, 103.62000 1.47500...",POINT (103.61500 1.47006)
2,3.0,"POLYGON ((103.62000 1.47500, 103.63000 1.47500...",POINT (103.62500 1.47006)
3,4.0,"POLYGON ((103.63000 1.47500, 103.64000 1.47500...",POINT (103.63500 1.47006)
4,5.0,"POLYGON ((103.64000 1.47500, 103.65000 1.47500...",POINT (103.64500 1.47006)


### Helper functions

In [4]:
def convert_gpd(df):
    """Build a GeoDataFrame of points from a frame with lon/lat columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'longitude' and 'latitude' columns.

    Returns
    -------
    geopandas.GeoDataFrame
        The input columns plus a 'geometry' column of points. The index is
        reset, so the previous index is kept as a leading 'index' column.

    Notes
    -----
    The input is copied first. The previous version aliased ``df`` and wrote
    the helper columns straight into the caller's frame (typically a filtered
    slice), which mutated the caller's data.
    """
    stations = df.copy()

    # Points are still built through WKT text, exactly as before, so the
    # resulting geometries are unchanged; only the scratch column is gone.
    stations['geometry'] = [
        wkt.loads(f'POINT({str(lon)} {str(lat)})')
        for lon, lat in zip(stations['longitude'], stations['latitude'])
    ]

    gdf_stn = gpd.GeoDataFrame(stations, geometry='geometry')
    return gdf_stn.reset_index()

def near(point, gdf, pts):
    """Return the station_id of the row in `gdf` whose geometry is closest to `point`.

    `pts` is the collection of candidate geometries handed to
    shapely's nearest_points (callers pass the union of gdf.geometry).
    If several rows share the closest geometry, the first one is returned.
    """
    closest_geom = nearest_points(point, pts)[1]
    is_closest = gdf.geometry == closest_geom
    matching_ids = gdf.loc[is_closest, 'station_id'].to_numpy()
    return matching_ids[0]

## Get taxi avail data
def get_taxi_data(query:str, timeout:float=30):
    '''Fetch taxi availability from the data.gov.sg endpoint for a datetime string.

    Parameters
    ----------
    query : str
        Sent to the API as the `date_time` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of handing an
        error payload to the caller as if it were data.
    requests.Timeout
        If no response arrives within `timeout` seconds.
    '''
    URL = "https://api.data.gov.sg/v1/transport/taxi-availability"
    params = {'date_time': query}
    r = requests.get(URL, params=params, timeout=timeout)
    r.raise_for_status()
    return r.json()

def taxi_convert_gpd(taxi_data, query=None):
    """Turn one taxi-availability response into a GeoDataFrame of taxi points.

    Parameters
    ----------
    taxi_data : dict
        Response whose coordinates are read from
        taxi_data['features'][0]['geometry']['coordinates'].
    query : optional
        Value written into the 'timestamp' column of every row. The previous
        version read an undeclared name `query`, which raised NameError unless
        a notebook-level variable of that name happened to exist. When omitted,
        that notebook-level `query` is still used if present (else None), so
        existing single-argument calls keep working.

    Returns
    -------
    geopandas.GeoDataFrame
        Columns 'index', 'timestamp' and 'geometry', one row per coordinate pair.
    """
    if query is None:
        query = globals().get('query')

    taxi_list = taxi_data['features'][0]['geometry']["coordinates"]
    df2 = pd.DataFrame({'timestamp': [query] * len(taxi_list)})
    df2['geometry'] = [wkt.loads(f'POINT({str(i[0])} {str(i[1])})') for i in taxi_list]

    gdf2 = gpd.GeoDataFrame(df2, geometry='geometry')
    return gdf2.reset_index()

## Merging all data into a single DataFrame for modelling

In [6]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Constants of the arithmetic lon/lat -> grid_num mapping applied to taxi positions.
# NOTE(review): the values are kept exactly as they were. Two things to confirm
# against SG_grids.shp: (1) the grid cell is commented as 47*27 while the row
# stride here is 45; (2) a point inside the first displayed cell (grid_num 1,
# lon 103.60-103.61 just below lat 1.475) evaluates to 46 with this formula.
GRID_LON_ORIGIN = 103.6
GRID_LON_STEP = 0.01
GRID_LAT_ORIGIN = 1.208
GRID_LAT_STEP = 0.009888890000000039
GRID_ROW_COUNT = 27
GRID_ROW_STRIDE = 45


def _attach_station_reading(grid_frame, readings, stations, timestamp, column_name,
                            keep_timestamp=False):
    """Add one weather variable to each grid cell from its nearest station.

    Filters `readings` and `stations` to `timestamp`, assigns every row of
    `grid_frame` the station_id of the nearest station, inner-joins the
    readings on station_id and renames their 'value' column to `column_name`.
    The helper 'station_id' column is dropped afterwards; 'timestamp' is
    dropped too unless `keep_timestamp` is True.
    """
    readings_now = readings[readings["timestamp"] == timestamp]
    stations_now = stations[stations['timestamp'] == timestamp]

    gdf_stations = convert_gpd(stations_now)
    station_points = gdf_stations.geometry.unary_union
    grid_frame['station_id'] = grid_frame.apply(
        lambda row: near(row.geometry, gdf_stations, station_points), axis=1
    )
    merged = (
        grid_frame
        .merge(readings_now, on=['station_id'], how='inner')
        .rename(columns={'value': column_name})
    )
    helper_columns = ['station_id'] if keep_timestamp else ['station_id', 'timestamp']
    return merged.drop(helper_columns, axis=1)


def _taxi_grid_counts(coordinates):
    """Count taxis per grid_num for a sequence of (lon, lat) positions."""
    return Counter(
        math.ceil((pos[0] - GRID_LON_ORIGIN) / GRID_LON_STEP)
        + math.ceil(GRID_ROW_COUNT - (pos[1] - GRID_LAT_ORIGIN) / GRID_LAT_STEP) * GRID_ROW_STRIDE
        for pos in coordinates
    )


# For each timestamp, get weather data for each grid
all_queries = df_temp['timestamp'].unique()

# Per-timestamp results are collected and concatenated once after the loop;
# concatenating onto a growing frame inside the loop re-copies all earlier rows.
frames = []
# (timestamp, reason) for every timestamp whose taxi step failed.
skipped_timestamps = []

for timestamp in tqdm(all_queries):
    ## GET WEATHER DATA
    grids_final = grids.copy()
    grids_final = _attach_station_reading(grids_final, df_rain, df_rain_stn, timestamp, 'rainfall')
    grids_final = _attach_station_reading(grids_final, df_temp, df_temp_stn, timestamp, 'air_temp')
    # The last weather merge keeps 'timestamp' so it ends up in the final frame.
    grids_final = _attach_station_reading(grids_final, df_humid, df_humid_stn, timestamp,
                                          'humidity', keep_timestamp=True)

    ## GET TAXI AVAIL DATA
    try:
        taxi_positions = df[timestamp]['features'][0]['geometry']['coordinates']
        counts = _taxi_grid_counts(taxi_positions)

        # Getting taxi_count for relevant grid_num
        df_timestamp = pd.DataFrame({
            'timestamp': [timestamp] * len(counts),
            'grid_num': list(counts.keys()),
            'taxi_count': list(counts.values()),
        })

        frames.append(pd.merge(grids_final, df_timestamp, how='left'))
    except Exception as exc:
        # Still best-effort: a bad timestamp is skipped rather than aborting the
        # run, but it is recorded, and Ctrl-C is no longer swallowed as it was
        # by the bare `except`.
        skipped_timestamps.append((timestamp, repr(exc)))

final_df = pd.concat(frames) if frames else pd.DataFrame()

if skipped_timestamps:
    print(f"Skipped {len(skipped_timestamps)} of {len(all_queries)} timestamps; "
          "see skipped_timestamps for details")

100%|████████████████████████████████████████████████████████████████████████████████| 415/415 [47:42<00:00,  6.90s/it]


In [7]:
# Persist the merged frame for this month.
# NOTE(review): the "2069_2484" suffix is hard-coded; presumably it marks the
# range of timestamps covered by this run — confirm and update it per run.
joblib.dump(final_df, f"{month}_fulldata_2069_2484.joblib")

['11_fulldata_2069_2484.joblib']

## Ignore anything below this

In [17]:
#02,04,12, part 11
# Appends one saved monthly frame to df_full.
# NOTE(review): presumably re-run by hand once per file in the list above;
# re-running with the same file appends its rows a second time.
# NOTE(review): `df` is also the name of the taxi-availability mapping loaded
# at the top of the notebook, which this assignment replaces.

df = joblib.load("monthly_data/12_fulldata.joblib")
# Start from an empty frame on a fresh kernel; previously df_full was only
# initialised in a commented-out line, so the first run raised NameError.
if "df_full" not in globals():
    df_full = pd.DataFrame()
df_full = pd.concat([df_full, df])

In [18]:
# Write the accumulated frame to CSV (the index is written as the first column).
df_full.to_csv("02_04_12_part11.csv")

In [73]:
# Trial run on a single timestamp: taxi positions -> grid numbers -> counts.
one_list = df['2019-01-01T00:00:00']['features'][0]['geometry']['coordinates']

## Grid number of every taxi position (each position is indexed as [lon, lat])
test = list(map(
    lambda pos: math.ceil((pos[0]-103.6)/0.01) + math.ceil(27 - (pos[1] -1.208)/0.009888890000000039)*45,
    one_list,
))

# Number of taxis per grid number
c = Counter(test)

In [72]:
# One row per grid number seen in the counter, with its taxi count.
df_timestamp = pd.DataFrame({
    'grid_num': list(c),
    'taxi_count': list(c.values()),
})
df_timestamp

Unnamed: 0,grid_num,taxi_count
0,768,1
1,769,1
2,814,2
3,680,1
4,770,1
...,...,...
338,579,118
339,580,19
340,445,1
341,490,1


In [78]:
# Scratch check: create an empty frame and display its type.
full_df = pd.DataFrame()
type(full_df)

pandas.core.frame.DataFrame

In [80]:
## testing for loop through all keys:
# Per-key frames are collected and concatenated once at the end; growing
# full_df with pd.concat inside the loop re-copies all earlier rows each time.
# NOTE(review): the earlier output showed grid_num/taxi_count as floats,
# presumably an upcast from concatenating onto the empty seed frame; without
# that seed the integer dtypes should be kept — confirm nothing relies on floats.
per_key_frames = []

for key in tqdm(df.keys()):
    one_list = df[key]['features'][0]['geometry']['coordinates']

    ## Getting list of grid num
    test = [math.ceil((i[0]-103.6)/0.01) + math.ceil(27 - (i[1] -1.208)/0.009888890000000039)*45 for i in one_list]

    # getting dictionary of items
    c = Counter(test)

    # Getting taxi_count for relevant grid_num
    df_timestamp = pd.DataFrame({'timestamp': [key] * len(c),
                                 'grid_num': list(c.keys()),
                                 'taxi_count': list(c.values())})

    per_key_frames.append(df_timestamp)

full_df = pd.concat(per_key_frames) if per_key_frames else pd.DataFrame()

100%|██████████████████████████████████████████████████████████████████████████████| 2976/2976 [00:45<00:00, 65.00it/s]


In [81]:
# Total number of (timestamp, grid_num) rows accumulated in full_df.
len(full_df)

1030219

In [82]:
# Preview the first rows of the accumulated per-timestamp taxi counts.
full_df.head()

Unnamed: 0,timestamp,grid_num,taxi_count
0,2019-01-01T00:00:00,768.0,1.0
1,2019-01-01T00:00:00,769.0,1.0
2,2019-01-01T00:00:00,814.0,2.0
3,2019-01-01T00:00:00,680.0,1.0
4,2019-01-01T00:00:00,770.0,1.0
