# Google BigQuery

In [1]:
from google.cloud import bigquery   

In [2]:
# from utils.helpers import load_cfg 
import yaml


def load_cfg(cfg_file):
    """
    Load configuration from a YAML config file.

    Parameters
    ----------
    cfg_file : str or path-like
        Path of the YAML file to read.

    Returns
    -------
    object or None
        Whatever ``yaml.safe_load`` produces for the document, or ``None``
        when the content is not valid YAML (the parse error is printed
        rather than raised, so callers must handle a ``None`` result).

    Raises
    ------
    OSError
        If the file cannot be opened; opening happens outside the
        ``try`` block, so this is not swallowed.
    """
    cfg = None
    # Pin UTF-8 so decoding does not depend on the platform's locale default
    # (e.g. cp1252 on Windows would garble non-ASCII values).
    with open(cfg_file, "r", encoding="utf-8") as f:
        try:
            cfg = yaml.safe_load(f)
        except yaml.YAMLError as exc:
            # Best-effort by design: report the parse error and return None.
            print(exc)

    return cfg


In [3]:
# Relative path to the BigQuery YAML config (resolved against the current
# working directory of the kernel).
CFG_FILE = "../../config/gg_bigquery/bigquery.yaml"
# load_cfg returns None when the YAML cannot be parsed, so CFG may be None here.
CFG = load_cfg(CFG_FILE)

In [4]:
# Pull the project and dataset identifiers out of the "bigquery" section of
# the loaded configuration. A missing key raises KeyError.
PROJECT_ID = CFG["bigquery"]["project_id"]
DATASET_ID = CFG["bigquery"]["dataset_id"]

In [5]:
# Display the identifiers read from the config as a (dataset, project) tuple.
DATASET_ID,PROJECT_ID

('data_warehouse', 'bigdata-445102')

In [6]:
# BigQuery client bound to the project read from the config file.
client = bigquery.Client(project=PROJECT_ID)

In [8]:
# Column layout for the demo table, declared as (name, type, mode) triples
# and expanded into BigQuery SchemaField objects in the same order.
schema = [
    bigquery.SchemaField(column, column_type, mode=column_mode)
    for column, column_type, column_mode in (
        ("id", "INTEGER", "REQUIRED"),
        ("name", "STRING", "NULLABLE"),
        ("created_at", "TIMESTAMP", "NULLABLE"),
    )
]

In [9]:
# Reference to "test_table" inside the configured dataset. DatasetReference
# replaces the deprecated ``client.dataset()`` helper; ``client.project`` is
# the same project that helper defaulted to.
table_ref = bigquery.DatasetReference(client.project, DATASET_ID).table("test_table")
table = bigquery.Table(table_ref, schema=schema)
# exists_ok=True keeps this cell re-runnable: when the table already exists
# it is fetched and returned instead of raising a Conflict error.
table = client.create_table(table, exists_ok=True)  # API request
# NOTE: this message is also printed when the table already existed.
print("Created table {}".format(table.table_id))

Created table test_table


In [9]:
# Materialize the result of list_datasets() into a list so it can be indexed.
datasets = list(client.list_datasets())

In [11]:
# Show the ID of the first listed dataset (IndexError if the list is empty).
datasets[0].dataset_id

'data_warehouse'

In [None]:
# (Intentionally empty: a stray bare name `i` here raised NameError on Run All.)

In [2]:
# Placeholder identifiers to be replaced with real values before use.
# NOTE(review): PROJECT_ID and DATASET_ID are also assigned from the YAML
# config earlier in this notebook; running this cell overwrites them with
# placeholders. Confirm that is intended.
PROJECT_ID, DATASET_ID, TABLE_ID = (
    "your_project_id",
    "your_dataset_id",
    "your_table_id",
)

In [None]:
# Rebuild the client with whatever PROJECT_ID currently holds.
# NOTE(review): the same statement appears in an earlier cell; this one
# rebinds `client` and looks redundant unless PROJECT_ID changed on purpose.
client = bigquery.Client(project=PROJECT_ID)

In [None]:
class GGBigQueryClient:
    """Thin wrapper around ``bigquery.Client`` bound to one project and dataset."""

    def __init__(self, project_id, dataset_id):
        """
        Store the identifiers and create the underlying BigQuery client.

        Parameters
        ----------
        project_id : str
            Project passed to ``bigquery.Client``.
        dataset_id : str
            Dataset in which tables are created by this wrapper.
        """
        self.project_id = project_id
        self.dataset_id = dataset_id
        self.client = bigquery.Client(project=project_id)

    def create_table(self, table_id, schema, exists_ok=False):
        """
        Create a table in the wrapper's dataset and return it.

        Parameters
        ----------
        table_id : str
            Name of the table to create inside ``self.dataset_id``.
        schema : sequence of bigquery.SchemaField
            Column definitions for the new table.
        exists_ok : bool, optional
            Forwarded to ``Client.create_table``. With the default ``False``
            an already-existing table is an error, as before; with ``True``
            the existing table is returned instead.

        Returns
        -------
        bigquery.Table
            The table object returned by the API call. Previously this
            result was discarded and the method returned ``None``.
        """
        # DatasetReference replaces the deprecated ``Client.dataset()`` helper;
        # ``self.client.project`` is the project that helper defaulted to.
        dataset_ref = bigquery.DatasetReference(self.client.project, self.dataset_id)
        table = bigquery.Table(dataset_ref.table(table_id), schema=schema)
        return self.client.create_table(table, exists_ok=exists_ok)

In [10]:
import pandas as pd

In [16]:
# Load the green_tripdata_2023-01 parquet file into a DataFrame.
df = pd.read_parquet("../../data/2023/green_tripdata_2023-01.parquet")
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-01-01 00:26:10,2023-01-01 00:37:11,N,1.0,166,143,1.0,2.58,14.90,1.0,0.5,4.03,0.0,,1.0,24.18,1.0,1.0,2.75
1,2,2023-01-01 00:51:03,2023-01-01 00:57:49,N,1.0,24,43,1.0,1.81,10.70,1.0,0.5,2.64,0.0,,1.0,15.84,1.0,1.0,0.00
2,2,2023-01-01 00:35:12,2023-01-01 00:41:32,N,1.0,223,179,1.0,0.00,7.20,1.0,0.5,1.94,0.0,,1.0,11.64,1.0,1.0,0.00
3,1,2023-01-01 00:13:14,2023-01-01 00:19:03,N,1.0,41,238,1.0,1.30,6.50,0.5,1.5,1.70,0.0,,1.0,10.20,1.0,1.0,0.00
4,1,2023-01-01 00:33:04,2023-01-01 00:39:02,N,1.0,41,74,1.0,1.10,6.00,0.5,1.5,0.00,0.0,,1.0,8.00,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68206,2,2023-01-31 22:29:00,2023-01-31 22:42:00,,,49,62,,4070.82,15.70,0.0,0.0,0.00,0.0,,1.0,16.70,,,
68207,2,2023-01-31 22:40:00,2023-01-31 22:48:00,,,10,205,,2.14,4.41,0.0,0.0,0.00,0.0,,1.0,5.41,,,
68208,2,2023-01-31 23:46:00,2023-02-01 00:02:00,,,66,37,,3.44,16.53,0.0,0.0,3.51,0.0,,1.0,21.04,,,
68209,2,2023-01-31 23:01:00,2023-01-31 23:19:00,,,225,189,,3.03,14.98,0.0,0.0,3.20,0.0,,1.0,19.18,,,


In [13]:
# Export the frame to CSV; index=False leaves the row index out of the file.
df.to_csv("../../data/green_tripdata_2023-01.csv", index=False)

In [17]:
# Preview 1000 randomly chosen rows. The fixed random_state makes the same
# rows come back on every run, so the displayed output is reproducible.
df.sample(n=1000, random_state=42)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
68066,2,2023-01-30 20:16:00,2023-01-30 20:40:00,,,7,100,,5.25,20.9,0.00,0.0,4.93,0.00,,1.0,29.58,,,
53094,2,2023-01-26 18:28:13,2023-01-26 18:39:19,N,1.0,65,97,1.0,1.60,12.1,2.50,0.5,0.00,0.00,,1.0,16.10,2.0,1.0,0.00
1963,2,2023-01-02 14:55:30,2023-01-02 15:02:34,N,1.0,74,42,1.0,0.57,7.9,0.00,0.5,0.10,0.00,,1.0,9.50,1.0,1.0,0.00
42842,2,2023-01-22 03:37:57,2023-01-22 03:40:44,N,1.0,236,263,1.0,0.51,5.1,1.00,0.5,0.00,0.00,,0.3,9.65,2.0,1.0,2.75
24340,2,2023-01-13 07:06:47,2023-01-13 07:21:38,N,5.0,130,132,1.0,5.39,20.0,0.00,0.0,0.00,0.00,,1.0,21.00,1.0,2.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17401,2,2023-01-10 05:15:31,2023-01-10 05:17:08,N,1.0,95,95,1.0,0.07,3.7,1.00,0.5,0.00,0.00,,1.0,6.20,2.0,1.0,0.00
49729,2,2023-01-25 13:04:48,2023-01-25 13:34:41,N,1.0,75,157,6.0,8.77,39.4,0.00,0.5,0.00,6.55,,1.0,50.20,2.0,1.0,2.75
32869,1,2023-01-17 15:22:03,2023-01-17 16:06:36,N,1.0,265,55,1.0,0.00,43.2,0.00,1.5,0.00,0.00,,1.0,44.70,1.0,1.0,0.00
46787,1,2023-01-24 08:16:31,2023-01-24 08:36:57,N,1.0,74,239,1.0,4.40,19.1,2.75,1.5,2.00,0.00,,1.0,25.35,1.0,1.0,2.75


In [18]:
# Persist a 1000-row random sample as a small parquet file. The fixed
# random_state keeps the written file identical across re-runs instead of
# producing a different sample each time the cell executes.
df.sample(n=1000, random_state=42).to_parquet(
    "../../data/green_tripdata_2023-01_view.parquet", index=False
)