# Preprocessing
Cleans the `JSON` weather and taxi-availability files retrieved from the NEA and LTA sites.<br>

## Libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gcsfs

from scipy.spatial import distance
from datetime import date, timedelta
import re
import math
from collections import Counter

import copy
from tqdm import tqdm
import itertools
import joblib

import geopandas as gpd
import fiona
from shapely import geometry
from shapely import wkt
from shapely.geometry import Polygon

from src import jsonParser
from src import assignment

import warnings
warnings.filterwarnings('ignore')

## 2. Processing weather and taxi tabular files 
Merge the data together into 1 dataframe<br>
This handles only ONE timestamp. Timestamps should already be cleaned so that the minutes are either 00 or 30 and the seconds are 00.<br>
Eg 2019-01-01T12:30:00 <br>

Weather files: <br>
1. rainfall w stations value     ---   timestamp | station_id | value
2. rainfall w stations latlon    ---  timestamp | station_id | latitude | longitude
3. humidity w stations value     ---   timestamp | station_id | value
4. humidity w stations latlon    ---   timestamp | station_id | latitude | longitude
5. temperature w stations value  ---  timestamp | station_id | value
6. temperature w stations latlon --- timestamp | station_id | latitude | longitude
<br>

Taxi files: <br>
7. taxi avail json file

In [5]:
# Google Cloud project name, bucket names and dataset id.
project = 'ml-eng-cs611-group-project'
nea_bucket = 'ml-eng-cs611-group-project-nea'
taxi_bucket = 'ml-eng-cs611-group-project-taxis'
dataset_id = 'taxi_dataset'

# Weather measures: the full list, plus the single measure globbed below.
measures = ['rainfall', 'air-temperature', 'relative-humidity']
measure = 'rainfall'

# GCS file-system handle, then every path matching <nea_bucket>/<measure>/*.
fs = gcsfs.GCSFileSystem(project=project)
nea_glob_pattern = '/'.join((nea_bucket, measure, "*"))
nea_filenames = fs.glob(nea_glob_pattern)

### Read grid file

In [6]:
grids_file = './updated codes/filter_grids_2/filter_grids_2.shp' ## To change directory

grids = gpd.read_file(grids_file)
grids['centroid'] = grids['geometry'].apply(lambda geom: geom.centroid) # get grids' centroid

# convert to dataframe
grids_df = pd.DataFrame(grids)

# Read the coordinates straight off the centroid Point instead of slicing its
# WKT text ("POINT (x y)"), which depends on the exact string formatting.
# NOTE(review): the tuple is (x, y), exactly as the WKT parsing produced it.
# For a lon/lat CRS that means (longitude, latitude) despite the column name
# `latlon` - confirm the station `latlon` values use the same order.
grids_df['latlon'] = grids_df['centroid'].apply(lambda pt: (pt.x, pt.y))

# Keep the centroid column as text, as before.
grids_df['centroid'] = grids_df['centroid'].astype(str)

# Get unique grid_num
grid_nums = list(grids_df['grid_num'].unique())

### Load weather datasets

In [7]:
# measure name -> {'items': ..., 'metadata': ...} as returned by jsonParser.
# Later cells pass 'items' to get_value as the readings dataframe and
# 'metadata' as the station dataframe.
nea_data = {}

# A dedicated loop name keeps the `measure` value set in the configuration
# cell from being overwritten as a side effect of running this cell.
for nea_measure in measures:
    filenames = fs.glob('/'.join([nea_bucket, nea_measure, "*"]))
    if not filenames:
        # Fail with the offending location instead of a bare IndexError.
        raise FileNotFoundError(f'no files found under {nea_bucket}/{nea_measure}/')

    file = filenames[0]  # only the first matching file is processed
    parser = jsonParser.jsonParser(fs)

    items = parser.get_items(file, nea_measure)
    metadata = parser.get_metadata(file, nea_measure)

    nea_data[nea_measure] = {'items': items, 'metadata': metadata}

### Processing weather files

In [9]:
## Function to get weather value for each grid at each timestamp. Returns a dataframe
def get_value(weather, df, df_stn, ts, grid_nums, grids_df):
    '''
    Give every grid the reading of its nearest weather station at one timestamp.

    Arguments
    weather: name of the output value column (temp/ rain/ humid)
    df: dataframe of readings with columns timestamp, station_id, value
    df_stn: dataframe of station locations with columns timestamp, station_id, latlon
    ts: timestamp to extract; compared with == against both timestamp columns
    grid_nums: list of unique grid numbers
    grids_df: dataframe with columns grid_num and latlon (grid centroid)

    Returns
    dataframe with columns grid_num | timestamp | <weather>, one row per entry
    of grid_nums, in the same order.

    Raises
    ValueError: no station location exists for ts
    KeyError: a grid number is missing from grids_df, or the nearest station
              has no reading at ts
    '''
    grid_nums = list(grid_nums)
    if not grid_nums:
        return pd.DataFrame({'grid_num': [], 'timestamp': [], f'{weather}': []})

    readings = df[df['timestamp'] == ts]
    stations = df_stn[df_stn['timestamp'] == ts]
    if stations.empty:
        raise ValueError(f'no station locations found for timestamp {ts!r}')

    # Look centroids up by grid number rather than by row position, so the
    # result stays correct when grids_df is ordered differently from grid_nums
    # or holds several rows per grid number (the first row wins).
    centroids = grids_df.drop_duplicates(subset='grid_num').set_index('grid_num')['latlon']
    unknown_grids = [g for g in grid_nums if g not in centroids.index]
    if unknown_grids:
        raise KeyError(f'grid numbers missing from grids_df: {unknown_grids}')

    grid_coords = np.array([centroids.loc[g] for g in grid_nums], dtype=float)
    stn_coords = np.array(stations['latlon'].tolist(), dtype=float)
    stn_ids = stations['station_id'].to_numpy()

    # Pairwise euclidean distances (grids x stations); argmin keeps the first
    # station on ties, matching a strict "<" scan over the stations.
    nearest_idx = distance.cdist(grid_coords, stn_coords, metric='euclidean').argmin(axis=1)
    nearest_ids = stn_ids[nearest_idx]

    # First reading per station at this timestamp.
    first_reading = readings.drop_duplicates(subset='station_id').set_index('station_id')['value']
    silent = [s for s in pd.unique(nearest_ids) if s not in first_reading.index]
    if silent:
        raise KeyError(f'no {weather} reading at {ts!r} for nearest station(s): {silent}')

    value_list = first_reading.reindex(nearest_ids).tolist()

    df_interim = pd.DataFrame({'grid_num': grid_nums,
                               'timestamp': [ts] * len(grid_nums),
                               f'{weather}': value_list})

    return df_interim


In [None]:
# Timestamp of the first air-temperature station record. It is read from
# nea_data because `df_temp_stn` is never defined in this notebook, so the
# cell failed with a NameError on a fresh kernel.
ts = nea_data['air-temperature']['metadata'].iloc[0]['timestamp'] ## To change if there are other ways to get the timestamp

# One frame per measure: grid_num | timestamp | <measure value>
temp_clean = get_value(
    'temp',
    nea_data['air-temperature']['items'],
    nea_data['air-temperature']['metadata'],
    ts, grid_nums, grids_df,
)
rain_clean = get_value(
    'rain',
    nea_data['rainfall']['items'],
    nea_data['rainfall']['metadata'],
    ts, grid_nums, grids_df,
)
humid_clean = get_value(
    'humid',
    nea_data['relative-humidity']['items'],
    nea_data['relative-humidity']['metadata'],
    ts, grid_nums, grids_df,
)

### Load taxi json file

In [None]:
# Load the taxi availability object from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code - only load files that come from a trusted source.
taxi_file = 'taxi_avail.joblib' # change directory and extension of file accordingly
taxi = joblib.load(taxi_file) # change reading method according to the file type

### Processing taxi file

In [None]:
# Constants of the coordinate -> grid number formula below.
# NOTE(review): these presumably describe the grid's x/y origin, cell size and
# a 22-column layout - confirm against the grid shapefile.
GRID_ORIGIN_X = 103.6
GRID_ORIGIN_Y = 1.208
GRID_CELL_WIDTH = 0.020454545454545583
GRID_CELL_HEIGHT = 0.020538461538461547
GRID_ROW_OFFSET = 13
GRID_COLUMNS = 22

# Get list of taxi's coord
one_list = taxi['features'][0]['geometry']['coordinates']
# Convert each coordinate to its corresponding grid number
test = [
    math.ceil((coord[0] - GRID_ORIGIN_X) / GRID_CELL_WIDTH)
    + (GRID_ROW_OFFSET - math.ceil((coord[1] - GRID_ORIGIN_Y) / GRID_CELL_HEIGHT)) * GRID_COLUMNS
    for coord in one_list
]

# Number of taxis per grid number
c = Counter(test)

# Getting taxi_count for relevant grid_num. Explicit dtypes keep the merge key
# numeric even when there are no taxis at all (empty Counter).
df_taxicount = pd.DataFrame({
    'grid_num': pd.Series(list(c.keys()), dtype='float64'),
    'taxi_count': pd.Series(list(c.values()), dtype='int64'),
})

# Get full list of grid_num as a dataframe:  grid_num | timestamp
# .assign builds a new frame, so no column is set on a slice of `grids`.
all_grids = grids[['grid_num']].assign(timestamp=ts)


# Merge all_grids and df_taxicount
taxi_clean = pd.merge(all_grids, df_taxicount, how='left', on='grid_num')
taxi_clean['taxi_count'] = taxi_clean['taxi_count'].fillna(0) #fill missing taxi_count = 0


## 3. Merging all the cleaned files

In [None]:
## Merge all together
merge_df = pd.merge(humid_clean, rain_clean)
merge_df = pd.merge(merge_df, temp_clean)
merge_df = pd.merge(merge_df, taxi_clean)
merge_df

## Feature engineering

In [None]:
# Parse the timestamp strings with pandas. The notebook imports only `date`
# and `timedelta` from datetime, so `datetime.strptime` raised a NameError;
# pd.to_datetime also leaves an already-parsed column untouched, which makes
# this cell safe to re-run.
merge_df['timestamp'] = pd.to_datetime(merge_df['timestamp'], format='%Y-%m-%dT%H:%M:%S')

# Calendar features
merge_df['hour'] = merge_df['timestamp'].dt.hour
merge_df['month'] = merge_df['timestamp'].dt.month
merge_df['day'] = merge_df['timestamp'].dt.weekday  # Monday=0 ... Sunday=6
merge_df['minute'] = merge_df['timestamp'].dt.minute

# Timestamps 30 and 60 minutes ahead, used to look up the targets
merge_df['time_30'] = merge_df['timestamp'] + pd.Timedelta(minutes=30)
merge_df['time_60'] = merge_df['timestamp'] + pd.Timedelta(hours=1)

In [None]:
## To get y_30 and y_60 targets
# y_30 / y_60 = taxi_count of the same grid 30 / 60 minutes later.
# A left self-merge replaces the row-by-row lookup: rows whose later timestamp
# is not in merge_df (always true for the latest timestamps) get NaN instead of
# raising a KeyError, and `ts` is no longer overwritten by the loop.
future_counts = (
    merge_df[['grid_num', 'timestamp', 'taxi_count']]
    .drop_duplicates(subset=['grid_num', 'timestamp'])  # first match wins, like the old lookup
)

for target_col, time_col in (('y_30', 'time_30'), ('y_60', 'time_60')):
    lookup = future_counts.rename(columns={'timestamp': time_col, 'taxi_count': target_col})
    # how='left' keeps one output row per merge_df row, in merge_df's order
    matched = merge_df[['grid_num', time_col]].merge(lookup, how='left', on=['grid_num', time_col])
    merge_df[target_col] = matched[target_col].to_numpy()
    

In [None]:
## merge_df is the final dataset, ready to be used for EDA/ training etc