# Google Cloud Storage Access Notebook

This notebook prototypes loading data into BigQuery via Google Cloud Storage.

Important! Clone the [project repository](https://github.com/spencermarley/smu-cs611-mleng-project.git) in order to access the parsing functions.

## Section 1 - Imports

In [1]:
import gcsfs
import sys
import os
import json
import math
import itertools
from collections import Counter
import re

import datetime
from datetime import date, timedelta

import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt
import seaborn as sns

from shapely import wkt
from shapely.ops import nearest_points

import warnings
from tqdm import tqdm
import joblib
warnings.filterwarnings('ignore')

from src import jsonParser
from src import assignment

## Section 2 - Bucket Connection Setup

In [2]:
# Notebook-wide configuration: GCP project, source buckets and BigQuery dataset.
project = 'ml-eng-cs611-group-project'
nea_bucket = 'ml-eng-cs611-group-project-nea'
taxi_bucket = 'ml-eng-cs611-group-project-taxis'
dataset_id='taxi_dataset'
# `measure` is the single measure used when listing NEA files;
# `measures` is the full list of measure names iterated when writing tables.
measure = 'rainfall'
measures = ['rainfall','air-temperature','relative-humidity']

# File-system handle bound to the project; later cells use it to glob bucket contents.
fs = gcsfs.GCSFileSystem(project=project)


In [4]:
def get_start_index(start_file:str,file_list:list):
    '''Scan file_list from the front and locate the first entry matching start_file.

    Args:
        start_file (str): Search string, compiled as a regular expression. Can be a
            date of form YYYY-MM-DD to find the first file at that date, or a
            filename to match a specific file.
        file_list (list): List of file paths from Google Cloud Storage.
    Returns:
        int: Index of the first entry that matches the search string, or 0 when
            no entry matches.
    '''
    pattern = re.compile(start_file)

    for position, path in enumerate(file_list):
        if pattern.search(path):
            print(f"Valid start file provided, starting batch loading at index {position}")
            return position

    # No match: fall back to the beginning of the list.
    return 0

def get_end_index(end_file:str,file_list:list):
    '''Scan file_list from the back and locate the last entry matching end_file.

    Args:
        end_file (str): Search string, compiled as a regular expression. Can be a
            date of form YYYY-MM-DD to find the last file at that date, or a
            filename to match a specific file.
        file_list (list): List of file paths from Google Cloud Storage.
    Returns:
        int: Index of the last entry that matches the search string, or
            len(file_list) when no entry matches.
    '''
    # NOTE(review): a match returns the index OF the matched file, while the
    # no-match default is len(file_list); a caller slicing file_list[start:end]
    # would therefore exclude the matched file - confirm this is intended.
    end_index=len(file_list)

    file_re = re.compile(end_file)
    # Track the position while walking backwards. Looking the path up again with
    # file_list.index() would return the FIRST equal entry, which is wrong when
    # the same path appears more than once.
    for position in range(len(file_list)-1,-1,-1):
        if file_re.search(file_list[position]):
            end_index=position
            print(f"Valid end date provided, end batch loading at index {end_index}")
            break

    return end_index

def load_nea_to_gbq(project:str,dataset_id:str,measure:str,filename:str,fs=None):
    '''Parse one NEA JSON file and append its items and metadata to BigQuery.

    Args:
        project (str): GCP project passed to to_gbq.
        dataset_id (str): BigQuery dataset that holds the destination tables.
        measure (str): NEA measure name. It is passed to the parser and selects
            the tables `<dataset_id>.<measure>-items` and
            `<dataset_id>.<measure>-metadata`.
        filename (str): Path of the JSON file handed to the parser.
        fs: File-system object forwarded to jsonParser.jsonParser. Defaults to
            None, matching load_taxi_to_gbq (the original `fs:None` was an
            annotation, which made the argument required).
    '''
    print(f"Current processing {filename}")
    parser = jsonParser.jsonParser(fs)

    items = parser.get_items(filename,measure)
    metadata = parser.get_metadata(filename,measure)

    item_table = dataset_id+'.'+measure+'-items'
    metadata_table = dataset_id+'.'+measure+'-metadata'

    # Both writes append, so re-running on the same file adds the rows again.
    print(f"Writing {measure} items to {item_table}")
    items.to_gbq(item_table,project,chunksize=None,if_exists='append')

    print(f"Writing {measure} metadata to {metadata_table}")
    metadata.to_gbq(metadata_table,project,chunksize=None,if_exists='append')

def load_taxi_to_gbq(project:str,dataset_id:str,filename:str,fs=None):
    '''Parse one taxi JSON file and append it to the taxi-availability table.

    Args:
        project (str): GCP project passed to to_gbq.
        dataset_id (str): BigQuery dataset that holds the `taxi-availability` table.
        filename (str): Path of the JSON file handed to the parser.
        fs: File-system object forwarded to jsonParser.jsonParser. Defaults to None.
    '''
    print(f"Current processing {filename}")
    taxi_frame = jsonParser.jsonParser(fs).load_taxi_data(filename)

    print("Writing taxi data")
    destination = '.'.join([dataset_id,'taxi-availability'])
    # Append mode: re-running on the same file adds the rows again.
    taxi_frame.to_gbq(destination,project,chunksize=None,if_exists='append')

In [3]:
# List every object under <nea_bucket>/<measure>/ and show the last path returned.
# NOTE(review): presumably the most recent file, given the timestamped names -
# this assumes glob returns paths in sorted order; confirm.
nea_filenames = fs.glob(nea_bucket + '/' + measure + '/' + "*")
nea_filenames[-1]

'ml-eng-cs611-group-project-nea/rainfall/2022-06-19T11-00-03.json'

In [5]:
# List every object under <taxi_bucket>/taxis/ and show the last path returned.
# NOTE(review): presumably the most recent file, given the timestamped names -
# this assumes glob returns paths in sorted order; confirm.
taxi_filenames = fs.glob(taxi_bucket + '/' + 'taxis' + '/' + "*")
taxi_filenames[-1]

'ml-eng-cs611-group-project-taxis/taxis/2022-06-19T11-00-02.json'

## Section 3 - Write to BigQuery

We have created 6 tables in BigQuery:

- `air-temperature-items`
- `air-temperature-metadata`
- `rainfall-items`
- `rainfall-metadata`
- `relative-humidity-items`
- `relative-humidity-metadata`

First, we write each measure's items and metadata to its corresponding table.

In [None]:
# The original cell was an incomplete assignment (`measure=`), which is a SyntaxError.
# TODO(review): confirm the intended value; 'rainfall' mirrors the default set in the
# bucket-connection setup cell.
measure = 'rainfall'

In [None]:
# Append each measure's items and metadata frames to `<dataset_id>.<measure>-items`
# and `<dataset_id>.<measure>-metadata`.
# NOTE(review): `nea_data` is not assigned in any cell visible in this notebook -
# it must be built (keyed by measure, then 'items'/'metadata') before this runs.
for measure in measures:
    table_prefix = '.'.join([dataset_id,measure])
    print(f"Writing items for {measure}")
    nea_data[measure]['items'].to_gbq(table_prefix+'-items',project,chunksize=None,if_exists='append')
    print(f"Writing metadata for {measure}")
    nea_data[measure]['metadata'].to_gbq(table_prefix+'-metadata',project,chunksize=None,if_exists='append')

Next, we write the taxi data

In [None]:
# Append the taxi frame to `<dataset_id>.taxi-availability`.
# NOTE(review): `taxi_data` is not assigned in any cell visible in this notebook -
# confirm where it is expected to come from.
taxi_data.to_gbq('.'.join([dataset_id,'taxi-availability']),project,chunksize=None,if_exists='append')

# Load most recent file

In [None]:
import pandas as pd
import gcsfs
import sys
import json

sys.path.append('smu-cs611-mleng-project')
from src import jsonParser

def load_nea_to_gbq(project:str,bucket:str,dataset_id:str,measure:str,filename:str):
    '''Load a single NEA JSON file from Google Cloud Storage into BigQuery.

    Args:
        project (str): GCP project, used for both the GCS file system and to_gbq.
        bucket (str): GCS bucket that contains one folder per measure.
        dataset_id (str): BigQuery dataset that holds the destination tables.
        measure (str): NEA measure name. It selects the `<bucket>/<measure>/`
            folder and the `<dataset_id>.<measure>-items` / `-metadata` tables.
        filename (str): Path of the file to load. When empty or None, the last
            path returned by globbing `<bucket>/<measure>/*` is loaded instead.
    Raises:
        IndexError: If no filename is given and the measure folder is empty.
    '''
    fs = gcsfs.GCSFileSystem(project=project)
    if filename:
        # Honour an explicitly requested file; the original ignored this argument.
        current_file=filename
    else:
        filenames = fs.glob('/'.join([bucket,measure,"*"]))
        if not filenames:
            # IndexError is kept so callers see the same exception type as the
            # original bare filenames[-1] lookup, but with a readable message.
            raise IndexError(f"No files found under {bucket}/{measure}")
        current_file=filenames[-1]
    print(f"Current processing {current_file}")

    parser = jsonParser.jsonParser(fs)
    items = parser.get_items(current_file,measure)
    metadata = parser.get_metadata(current_file,measure)

    item_table = dataset_id+'.'+measure+'-items'
    metadata_table = dataset_id+'.'+measure+'-metadata'

    # Both writes append, so re-running on the same file adds the rows again.
    print(f"Writing {measure} items to {item_table}")
    items.to_gbq(item_table,project,chunksize=None,if_exists='append')

    print(f"Writing {measure} metadata to {metadata_table}")
    metadata.to_gbq(metadata_table,project,chunksize=None,if_exists='append')

# Pull the run configuration out of a `params` mapping.
# NOTE(review): `params` is not defined anywhere in the visible notebook, so this
# raises NameError on a fresh kernel - confirm where it is expected to come from.
project=params['project']
bucket=params['bucket']
dataset_id=params['dataset_id']
    

if __name__ == '__main__':
    import argparse

    # Command-line entry point: collect the load parameters from argv.
    parser = argparse.ArgumentParser(description='Reads a single NEA JSON file to GBQ')
    parser.add_argument('--project','-p', default='ml-eng-cs611-group-project', type=str, help='GCP project name i.e. ml-eng-cs611-group-project')
    parser.add_argument('--bucket','-b', default='ml-eng-cs611-group-project-nea', type=str, help='GCS bucket name i.e. ml-eng-cs611-group-project-nea')
    parser.add_argument('--dataset_id','-d', default='taxi_dataset', type=str, help='BigQuery dataset id i.e. taxi_dataset')
    parser.add_argument('--measure','-m', type=str, help='NEA measure i.e. air-temperature,relative-humidity or rainfall')
    parser.add_argument('--filename','-f', type=str, help='If provided, file to load')
    # '-d' already belongs to --dataset_id; registering it a second time makes
    # argparse raise a conflicting-option error, so --date uses '-D' instead.
    parser.add_argument('--date','-D', type=str, help='YYYY-MM-DD format. If provided, load data up to this date')
    args = parser.parse_args()

    project = args.project
    bucket = args.bucket
    dataset_id = args.dataset_id
    measure = args.measure
    filename = args.filename
    # NOTE(review): this rebinds the module-level name `date`, which the notebook's
    # import cell binds to datetime.date - confirm nothing later relies on that.
    date = args.date
    

## Section 4 - Assignment code

First, get grid dataframe from repo

In [None]:
# Read the Singapore grid shapefile from the cloned project repository.
# NOTE(review): the path is relative to the working directory - the notebook must
# be run from the folder that contains the `smu-cs611-mleng-project` clone.
grids = gpd.read_file('smu-cs611-mleng-project/Gridding/SG_grid/SG_grids.shp')
# Vectorized centroid of every grid geometry (replaces a row-by-row apply).
grids['centroid'] = grids['geometry'].centroid

Next, pass the grid and raw data into the Assignment class

In [None]:
# Build the JSON parser explicitly. The original relied on a leftover `parser`
# variable that on a fresh kernel is either undefined or the argparse parser
# created in the "Load most recent file" cell.
# NOTE(review): assumes load_taxi_gdf is a method of jsonParser.jsonParser - confirm.
taxi_parser = jsonParser.jsonParser(fs)
taxi_gdf = taxi_parser.load_taxi_gdf(taxi_filenames[0])

In [None]:
# Bundle the grid frame, the NEA data and the taxi GeoDataFrame into an Assignment object.
# NOTE(review): `nea_data` is not assigned in any cell visible in this notebook -
# confirm it is built before this cell runs.
assigner = assignment.Assignment(grids=grids,nea_data=nea_data,taxi_data=taxi_gdf)

In [None]:
# Run the Assignment object's NEA preprocessing step.
# NOTE(review): the implementation lives in src/assignment and is not visible here.
assigner.nea_preprocess()

In [None]:
# Run the Assignment object's taxi preprocessing step.
# NOTE(review): the implementation lives in src/assignment and is not visible here.
assigner.taxi_preprocess()

In [None]:
# Keep the result of merge_grids() and display it as the cell output.
# NOTE(review): presumably requires both preprocess steps to have run first - confirm.
assignment_df = assigner.merge_grids()
assignment_df

In [None]:
# Display the taxi frame.
# NOTE(review): `taxi_data` is not assigned in any cell visible in this notebook,
# so this raises NameError on a fresh kernel - confirm where it should come from.
taxi_data

In [None]:
from src.jsonParser import jsonParser
from nea_load import load_nea_to_gbq
from taxi_load import load_taxi_to_gbq
import gcsfs
import re

# Parameters for a one-off load of a single local JSON file.
project='ml-eng-cs611-group-project'
dataset_id='taxi_dataset'
measure = 'relative-humidity'
# NOTE(review): absolute path on one developer's machine - this cell will not run
# elsewhere without editing it.
filename = "/home/ianchongweiming/smu-cs611-mleng-project/relative-humidity_2022-06-10T00-30-00.json"

# Every measure except 'taxi-availability' goes through the NEA loader.
# fs=None is passed through to the parser in both cases.
if measure != 'taxi-availability':
    load_nea_to_gbq(project=project,dataset_id=dataset_id,measure=measure,filename=filename,fs=None)
else:
    load_taxi_to_gbq(project=project,dataset_id=dataset_id,filename=filename,fs=None)