# Preprocessing
Cleans the `JSON` weather and taxi-availability files retrieved from the NEA and LTA sites.<br>

## Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gcsfs

from scipy.spatial import distance
from datetime import date, timedelta
import re
import math
from collections import Counter

import copy
from tqdm import tqdm
import itertools
import joblib

import geopandas as gpd
import fiona
from shapely import geometry
from shapely import wkt
from shapely.geometry import Polygon

from src import jsonParser
from src import assignment

import warnings
warnings.filterwarnings('ignore')

# `os` is required for os.getcwd() and is not part of the import list above,
# so it is imported here to keep the cell runnable on a fresh kernel.
import os

# Working directory of the kernel; shown as the cell output.
current_directory = os.getcwd()
current_directory

'/home/ianchongweiming/smu-cs611-mleng-project'

## 2. Processing weather and taxi tabular files 
Merge the data together into 1 dataframe<br>
This is only for ONE timestamp. Timestamps should be cleaned so that the minute is either 00 or 30 and the second is 00.<br>
Eg 2019-01-01T12:30:00 <br>

Weather files: <br>
1. rainfall w stations value     ---   timestamp | station_id | value
2. rainfall w stations latlon    ---  timestamp | station_id | latitude | longitude
3. humidity w stations value     ---   timestamp | station_id | value
4. humidity w stations latlon    ---   timestamp | station_id | latitude | longitude
5. temperature w stations value  ---  timestamp | station_id | value
6. temperature w stations latlon --- timestamp | station_id | latitude | longitude
<br>

Taxi files: <br>
7. taxi avail json file

In [3]:
# --- Google Cloud identifiers ---
project = 'ml-eng-cs611-group-project'
nea_bucket = 'ml-eng-cs611-group-project-nea'
taxi_bucket = 'ml-eng-cs611-group-project-taxis'
dataset_id = 'taxi_dataset'

# --- Weather measures ---
measures = ['rainfall', 'air-temperature', 'relative-humidity']
measure = 'rainfall'  # the single measure whose files are listed below

# List every object matching "<nea_bucket>/<measure>/*" through gcsfs.
fs = gcsfs.GCSFileSystem(project=project)
nea_filenames = fs.glob(f'{nea_bucket}/{measure}/*')

### Read grid file

In [4]:
grids_file = './updated codes/filter_grids_2/filter_grids_2.shp' ## To change directory

# Read the grid shapefile and attach each polygon's centroid (a shapely Point).
grids = gpd.read_file(grids_file)
grids['centroid'] = grids['geometry'].apply(lambda geom: geom.centroid)

# convert to dataframe
grids_df = pd.DataFrame(grids)

# Read the coordinates straight from the Point instead of parsing its text form,
# so the result does not depend on how the geometry is rendered as a string.
# The tuple is (x, y); for these coordinates that is (longitude, latitude) order,
# despite the column being called 'latlon'.
grids_df['latlon'] = grids_df['centroid'].apply(lambda point: (point.x, point.y))

# Keep the centroid column as text, as before.
grids_df['centroid'] = grids_df['centroid'].astype(str)

# Get unique grid_num
grid_nums = list(grids_df['grid_num'].unique())

In [5]:
# Display the grid table: grid_num, intersect, geometry, centroid (as text) and latlon columns.
grids_df

Unnamed: 0,grid_num,intersect,geometry,centroid,latlon
0,9.0,1,"POLYGON ((103.76364 1.47500, 103.78409 1.47500...",POINT (103.77386363636363 1.4647307692307692),"(103.77386363636363, 1.4647307692307692)"
1,10.0,1,"POLYGON ((103.78409 1.47500, 103.80455 1.47500...",POINT (103.79431818181817 1.464730769230769),"(103.79431818181817, 1.464730769230769)"
2,11.0,1,"POLYGON ((103.80455 1.47500, 103.82500 1.47500...",POINT (103.8147727272727 1.464730769230769),"(103.8147727272727, 1.464730769230769)"
3,12.0,1,"POLYGON ((103.82500 1.47500, 103.84545 1.47500...",POINT (103.83522727272724 1.464730769230769),"(103.83522727272724, 1.464730769230769)"
4,13.0,1,"POLYGON ((103.84545 1.47500, 103.86591 1.47500...",POINT (103.85568181818182 1.464730769230769),"(103.85568181818182, 1.464730769230769)"
...,...,...,...,...,...
176,255.0,1,"POLYGON ((103.84545 1.24908, 103.86591 1.24908...",POINT (103.85568181818182 1.2388076923076925),"(103.85568181818182, 1.2388076923076925)"
177,265.0,1,"POLYGON ((103.60000 1.22854, 103.62045 1.22854...",POINT (103.61022727272726 1.218269230769231),"(103.61022727272726, 1.218269230769231)"
178,266.0,1,"POLYGON ((103.62045 1.22854, 103.64091 1.22854...",POINT (103.6306818181818 1.2182692307692313),"(103.6306818181818, 1.2182692307692313)"
179,267.0,1,"POLYGON ((103.64091 1.22854, 103.66136 1.22854...",POINT (103.65113636363635 1.218269230769231),"(103.65113636363635, 1.218269230769231)"


### Load weather datasets

In [43]:
from datetime import datetime

def query_nea_metadata(measure:str, query_timestamp:str, project_id='ml-eng-cs611-group-project',dataset_id='taxi_dataset_reference'):
    '''Fetch NEA station metadata rows for one measure at one timestamp from BigQuery.

    Args:
        measure:            'rainfall', 'relative-humidity' or 'air-temperature'
        query_timestamp:    timestamp to match exactly, i.e. 2022-06-01 13:15:00
        project_id:         Google Cloud project_id
        dataset_id:         Google Cloud dataset_id holding the metadata tables
    Returns:
        pandas.DataFrame with the columns timestamp, station, latitude, longitude
    Raises:
        KeyError: if measure is not one of the three names listed above
    '''
    # Measure name -> metadata table name inside the dataset.
    metadata_tables = {
        'rainfall': 'rainfall-metadata',
        'relative-humidity': 'relative-humidity-metadata',
        'air-temperature': 'air-temperature-metadata',
    }
    table_name = metadata_tables[measure]

    query = f"""
    SELECT timestamp, station, latitude, longitude
    FROM `{dataset_id}.{table_name}`
    WHERE timestamp = '{query_timestamp}'
    """

    frame = pd.read_gbq(query, project_id=project_id)
    return frame


def query_nea_items(measure:str, query_timestamp:str, project_id='ml-eng-cs611-group-project',dataset_id='taxi_dataset_reference'):
    '''Fetch NEA station readings (items) for one measure at one timestamp from BigQuery.

    Args:
        measure:            'rainfall', 'relative-humidity' or 'air-temperature'
        query_timestamp:    timestamp to match exactly, i.e. 2022-06-01 13:15:00
        project_id:         Google Cloud project_id
        dataset_id:         Google Cloud dataset_id holding the items tables
    Returns:
        pandas.DataFrame with the columns timestamp, station_id, value
    Raises:
        KeyError: if measure is not one of the three names listed above
    '''
    # Measure name -> items table name inside the dataset.
    items_tables = {
        'rainfall': 'rainfall-items',
        'relative-humidity': 'relative-humidity-items',
        'air-temperature': 'air-temperature-items',
    }
    table_name = items_tables[measure]

    query = f"""
    SELECT timestamp, station_id, value
    FROM `{dataset_id}.{table_name}`
    WHERE timestamp = '{query_timestamp}'
    """

    frame = pd.read_gbq(query, project_id=project_id)
    return frame

def query_nea_view(measure:str, query_timestamp:str, project_id='ml-eng-cs611-group-project',dataset_id='taxi_dataset_views'):
    '''Fetch every column of the NEA BigQuery view for one measure at one timestamp.

    Args:
        measure:            'rainfall', 'relative-humidity' or 'air-temperature'
        query_timestamp:    timestamp to match exactly, i.e. 2022-06-01 13:15:00
        project_id:         Google Cloud project_id
        dataset_id:         Google Cloud dataset_id holding the views
    Returns:
        pandas.DataFrame with all columns of the selected view for that timestamp
    Raises:
        KeyError: if measure is not one of the three names listed above
    '''
    # Measure name -> view name inside the dataset.
    views = {
        'rainfall': 'view-rainfall',
        'relative-humidity': 'view-relative-humidity',
        'air-temperature': 'view-air-temperature',
    }
    view_name = views[measure]

    query = f"""
    SELECT *
    FROM `{dataset_id}.{view_name}`
    WHERE timestamp = '{query_timestamp}'
    """

    frame = pd.read_gbq(query, project_id=project_id)
    return frame

def assign_grids(grids_df, df_metadata, df_items, grid_nums):
    '''Assign to each grid the nearest station that actually reported a reading.

    Arguments
    grids_df: dataframe with one row per grid and a 'latlon' column holding the
        centroid as a 2-tuple; rows are read by position (iloc)
    df_metadata: dataframe with station metadata: 'station', 'latitude', 'longitude'.
        It is not modified.
    df_items: dataframe of readings with a 'station_id' column and, optionally, a
        'value' column; rows whose value is null are treated as "no reading"
    grid_nums: list of unique grid numbers; only its length is used, to decide how
        many rows of grids_df are processed

    Returns
    dict mapping the row position i in grids_df to the id of the closest station
    (Euclidean distance on the raw coordinates) that has a reading. A grid for which
    no station has a reading gets no entry.
    '''
    # Work on a copy so the caller's metadata frame keeps its columns and index.
    stations = df_metadata.copy()
    # Station coordinates as (longitude, latitude) tuples, the order used for the comparison
    # with the grid centroid.
    stations['latlon'] = list(zip(stations['longitude'], stations['latitude']))
    stations.index = stations['station']

    # Ids of the stations that really have a reading. Calling any() on a DataFrame
    # iterates its column names and is therefore always truthy, so membership is
    # tested against an explicit set instead.
    if 'value' in df_items.columns:
        reporting = df_items.loc[df_items['value'].notna(), 'station_id']
    else:
        reporting = df_items['station_id']
    stations_with_value = set(reporting)

    assignment = {}
    for i in range(len(grid_nums)):  # for each grid row
        grid_coordinates = grids_df.iloc[i]['latlon']
        distances = stations['latlon'].apply(
            lambda station_coordinates: distance.euclidean(station_coordinates, grid_coordinates)
        )

        # Walk the stations from nearest to farthest and keep the first one with a reading.
        for station in distances.sort_values().index:
            if station in stations_with_value:
                assignment[i] = station
                break

    return assignment

In [44]:
# Timestamp matched exactly by both the metadata and the readings queries.
query_timestamp = '2022-06-01 13:15:00'
measures = ['air-temperature', 'rainfall', 'relative-humidity']

# NOTE: this rebinds the name `assignment`, which the import cell bound to the
# `assignment` module from `src`.
# One entry per measure; each placeholder list is replaced by the dict that
# assign_grids returns for that measure.
assignment = {name: [] for name in measures}
for measure in measures:
    df_metadata = query_nea_metadata(measure=measure, query_timestamp=query_timestamp)
    df_items = query_nea_items(measure=measure, query_timestamp=query_timestamp)
    result = assign_grids(grids_df, df_metadata, df_items, grid_nums)
    assignment[measure] = result

# One column per measure, one row per grid position.
assignment_df = pd.DataFrame(assignment)
assignment_df

Unnamed: 0,air-temperature,rainfall,relative-humidity
0,S100,S104,S100
1,S100,S104,S100
2,S100,S227,S100
3,S100,S227,S100
4,S100,S209,S100
...,...,...,...
176,S108,S108,S108
177,S121,S115,S121
178,S121,S115,S121
179,S121,S115,S121


### Processing weather files

### Load taxi json file

In [41]:
# Path of the serialized taxi-availability snapshot (relative to the working directory).
taxi_file = 'taxi_avail.joblib' # change directory and extension of file accordingly
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
# The loaded object is later indexed as taxi['features'][0]['geometry']['coordinates'].
taxi = joblib.load(taxi_file) # change reading method according to the file type

FileNotFoundError: [Errno 2] No such file or directory: 'taxi_avail.joblib'

### Processing taxi file

In [None]:
# Grid layout used to turn a (longitude, latitude) pair into a grid_num.
# Grid numbers run row by row from the top: column index counted from GRID_MIN_LON,
# row index counted down from the top row.
GRID_MIN_LON = 103.6
GRID_MIN_LAT = 1.208
GRID_CELL_WIDTH = 0.020454545454545583
GRID_CELL_HEIGHT = 0.020538461538461547
GRID_N_COLS = 22
GRID_N_ROWS = 13

# Get list of taxi's coord (each entry is indexed as [0] = longitude, [1] = latitude)
one_list = taxi['features'][0]['geometry']['coordinates']

# Convert list to corr grid num of the coord
test = [
    math.ceil((coord[0] - GRID_MIN_LON) / GRID_CELL_WIDTH)
    + (GRID_N_ROWS - math.ceil((coord[1] - GRID_MIN_LAT) / GRID_CELL_HEIGHT)) * GRID_N_COLS
    for coord in one_list
]

# Number of taxis per grid number
c = Counter(test)

# Getting taxi_count for relevant grid_num.
# Explicit dtypes keep the merge key numeric even when the snapshot has no taxis.
df_taxicount = pd.DataFrame({
    'grid_num': pd.Series([float(grid) for grid in c.keys()], dtype='float64'),
    'taxi_count': pd.Series(list(c.values()), dtype='int64'),
})

# Get full list of grid_num as a dataframe:  grid_num | timestamp
# .copy() so the new column is written to an independent frame, not to a slice of `grids`.
all_grids = grids[['grid_num']].copy()
# NOTE(review): `ts` is not defined in any cell visible above this one; it must hold
# the timestamp of this taxi snapshot before the cell runs — confirm where it is set.
all_grids['timestamp'] = ts

# Merge all_grids and df_taxicount
taxi_clean = pd.merge(all_grids, df_taxicount, how='left', on='grid_num')
taxi_clean['taxi_count'] = taxi_clean['taxi_count'].fillna(0) #fill missing taxi_count = 0


## 3. Merging all the cleaned files

In [None]:
## Merge all together
merge_df = pd.merge(humid_clean, rain_clean)
merge_df = pd.merge(merge_df, temp_clean)
merge_df = pd.merge(merge_df, taxi_clean)
merge_df

## Feature engineering

In [None]:
# Parse the timestamp strings (e.g. 2019-01-01T12:30:00). pd.to_datetime returns an
# already-parsed datetime column unchanged, so this cell can be re-run safely.
merge_df['timestamp'] = pd.to_datetime(merge_df['timestamp'], format='%Y-%m-%dT%H:%M:%S')

# Calendar features taken from the parsed timestamp.
merge_df['hour'] = merge_df['timestamp'].dt.hour
merge_df['month'] = merge_df['timestamp'].dt.month
merge_df['day'] = merge_df['timestamp'].dt.weekday  # day of week, Monday=0 ... Sunday=6
merge_df['minute'] = merge_df['timestamp'].dt.minute

# Timestamps 30 and 60 minutes ahead, used to look up the prediction targets.
merge_df['time_30'] = merge_df['timestamp'] + pd.Timedelta(minutes=30)
merge_df['time_60'] = merge_df['timestamp'] + pd.Timedelta(hours=1)

In [None]:
## To get y_30 and y_60 targets
# for each timestamp, get 30min later
merge_df['y_30'] = np.nan
merge_df['y_60'] = np.nan

for i in tqdm(range(len(merge_df))): # for each row
    ts = merge_df.iloc[i]['time_30']
    gridnum = merge_df.iloc[i]['grid_num']
    
    merge_df.iloc[i, merge_df.columns.get_loc('y_30')] = merge_df[(merge_df['grid_num'] == gridnum) & 
                                                                       (merge_df['timestamp'] == ts)].reset_index()['taxi_count'][0]
    

for i in tqdm(range(len(merge_df))): # for each row
    ts = merge_df.iloc[i]['time_60']
    gridnum = merge_df.iloc[i]['grid_num']
    
    merge_df.iloc[i, merge_df.columns.get_loc('y_60')] = merge_df[(merge_df['grid_num'] == gridnum) & 
                                                                       (merge_df['timestamp'] == ts)].reset_index()['taxi_count'][0]
    

In [None]:
## merge_df is the final dataset, ready to be used for EDA/ training etc