# Scenario 2

## Step 1: Create the datasets using Python

In [None]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# Client shared by create_bigquery_dataset below; its `project` attribute is
# used to build the fully qualified dataset IDs.
client = bigquery.Client()

# Names of the datasets to create, and the location assigned to each newly
# created dataset.
datasets_name = ['raw_bikesharing','dwh_bikesharing','dm_bikesharing']
location = 'US'

def create_bigquery_dataset(dataset_name):
    """Create a BigQuery dataset in the client's project unless it exists.

    Relies on the module-level ``client`` and ``location``.

        Args:
            dataset_name: String, dataset name without the project prefix.
    """
    full_id = f"{client.project}.{dataset_name}"

    try:
        # The lookup raises NotFound when the dataset is missing.
        client.get_dataset(full_id)
    except NotFound:
        pending = bigquery.Dataset(full_id)
        pending.location = location
        created = client.create_dataset(pending, timeout=30)  # API request.
        print(f"Created dataset {client.project}.{created.dataset_id}")
    else:
        print(f"Dataset {full_id} already exists")

for name in datasets_name:
    create_bigquery_dataset(name)

## Step 2a: Initial loading of the trips table into BigQuery

In [None]:
from google.cloud import bigquery

# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Source files for the initial (2018-01-01) load; the bucket name is derived
# from PROJECT_ID.
GCS_URI = "gs://{}-data-bucket/data/trips/20180101/*.json".format(PROJECT_ID)
# This uri for load data from 2018-01-02
#GCS_URI = "gs://{}-data-bucket/data/trips/20180102/*.json".format(PROJECT_ID)
# Fully qualified destination table for the trips data.
TABLE_ID = "{}.raw_bikesharing.trips".format(PROJECT_ID)

client = bigquery.Client()


def load_gcs_to_bigquery_event_data(GCS_URI, TABLE_ID, table_schema):
    """Append newline-delimited JSON files from GCS to a BigQuery table.

    Uses the module-level ``client``. Waits for the load job to finish and
    then prints the row count reported for the destination table.

    Args:
        GCS_URI: Source URI handed to the load job.
        TABLE_ID: ID of the destination table.
        table_schema: Schema passed to the load job configuration.
    """
    append_config = bigquery.LoadJobConfig(
        write_disposition='WRITE_APPEND',
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        schema=table_schema,
    )

    # Start the load and block until it completes.
    client.load_table_from_uri(
        GCS_URI, TABLE_ID, job_config=append_config
    ).result()

    # num_rows comes from the table fetched after the job has finished.
    row_count = client.get_table(TABLE_ID).num_rows
    print("Loaded {} rows to table {}".format(row_count, TABLE_ID))


# Explicit schema handed to the trips load job: one SchemaField(name, type)
# per column. The station IDs are declared as STRING.
bigquery_table_schema = [
    bigquery.SchemaField("trip_id", "STRING"),
    bigquery.SchemaField("duration_sec", "INTEGER"),
    bigquery.SchemaField("start_date", "TIMESTAMP"),
    bigquery.SchemaField("start_station_name", "STRING"),
    bigquery.SchemaField("start_station_id", "STRING"),
    bigquery.SchemaField("end_date", "TIMESTAMP"),
    bigquery.SchemaField("end_station_name", "STRING"),
    bigquery.SchemaField("end_station_id", "STRING"),
    bigquery.SchemaField("member_gender", "STRING")
]

In [None]:
load_gcs_to_bigquery_event_data(GCS_URI, TABLE_ID, bigquery_table_schema)

The code will load trips data from GCS to BigQuery. There are a few things that you need to pay attention to in the code.

The GCS file path contains date information, for example, **20180101**. We will use that date as the folder name in our **GCS** file path like this:

```
GCS_URI = "gs://{}-data-bucket/data/trips/20180101/*.json".format(PROJECT_ID)
```

The data stored in **NEWLINE DELIMITED JSON** is compressed in **gzip** files. The BigQuery load job config accepts the **NEWLINE_DELIMITED_JSON** file format, and not standard JSON. In case you have standard JSON, you need to transform it first to the correct JSON format. In the code, we need to define the format like this:

```
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
```

The write disposition is **WRITE_APPEND**. This won't matter during the initial load, but is an important configuration for handling new data. We will revisit this later in the next steps:

```
write_disposition = 'WRITE_APPEND'
```

Lastly, you will need to change the **PROJECT_ID** variable to your own project ID, since you want to load data from your own GCS bucket. See the following line:

```
PROJECT_ID = "sparsh-data-eng-on-gcp"
```

## Step 2b: Initial loading of the regions table into BigQuery

In [None]:
from google.cloud import bigquery

# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Public source table that is copied, and the table in your project that
# receives the copy.
PUBLIC_TABLE_ID = (
    "bigquery-public-data.san_francisco_bikeshare.bikeshare_regions"
)
TARGET_TABLE_ID = f"{PROJECT_ID}.raw_bikesharing.regions"


def load_data_from_bigquery_public(PUBLIC_TABLE_ID, TARGET_TABLE_ID):
    """Copy all rows of a source table into a target table via a query job.

    The query result is written to ``TARGET_TABLE_ID`` with the
    WRITE_TRUNCATE disposition. An exception raised while waiting for the
    job is printed rather than re-raised.

    Args:
        PUBLIC_TABLE_ID: ID of the table to select from.
        TARGET_TABLE_ID: ID of the destination table for the query result.
    """
    bq_client = bigquery.Client()
    truncate_config = bigquery.QueryJobConfig(
        destination=TARGET_TABLE_ID,
        write_disposition='WRITE_TRUNCATE')

    select_all = f"SELECT * FROM `{PUBLIC_TABLE_ID}`;"
    copy_job = bq_client.query(select_all, job_config=truncate_config)

    try:
        # Block until the query job finishes.
        copy_job.result()
        print("Query success")
    except Exception as exception:
        print(exception)

In [None]:
load_data_from_bigquery_public(PUBLIC_TABLE_ID, TARGET_TABLE_ID)

## Step 3a: Handle the daily batch data loading for the trips table

In [None]:
from google.cloud import bigquery


# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Source files for the 2018-01-02 daily batch; the bucket name is derived
# from PROJECT_ID.
GCS_URI = "gs://{}-data-bucket/data/trips/20180102/*.json".format(PROJECT_ID)
# Same destination table as the initial load.
TABLE_ID = "{}.raw_bikesharing.trips".format(PROJECT_ID)

client = bigquery.Client()


def load_gcs_to_bigquery_event_data(GCS_URI, TABLE_ID, table_schema):
    """Load newline-delimited JSON from GCS, appending to the given table.

    Uses the module-level ``client``. The job is configured with
    WRITE_APPEND, so the loaded rows are added to whatever the table already
    holds. After the job finishes, the table's row count is printed.

    Args:
        GCS_URI: Source URI handed to the load job.
        TABLE_ID: ID of the destination table.
        table_schema: Schema passed to the load job configuration.
    """
    settings = bigquery.LoadJobConfig(
        schema=table_schema,
        write_disposition='WRITE_APPEND',
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    pending_load = client.load_table_from_uri(
        GCS_URI,
        TABLE_ID,
        job_config=settings,
    )
    pending_load.result()  # Wait for the load job to complete.

    destination = client.get_table(TABLE_ID)
    print(
        "Loaded {} rows to table {}".format(destination.num_rows, TABLE_ID)
    )


# Schema for the daily trips load: one SchemaField(name, type) per column.
# It lists the same columns and types as the schema in Step 2a.
bigquery_table_schema = [
    bigquery.SchemaField("trip_id", "STRING"),
    bigquery.SchemaField("duration_sec", "INTEGER"),
    bigquery.SchemaField("start_date", "TIMESTAMP"),
    bigquery.SchemaField("start_station_name", "STRING"),
    bigquery.SchemaField("start_station_id", "STRING"),
    bigquery.SchemaField("end_date", "TIMESTAMP"),
    bigquery.SchemaField("end_station_name", "STRING"),
    bigquery.SchemaField("end_station_id", "STRING"),
    bigquery.SchemaField("member_gender", "STRING")
]

In [None]:
load_gcs_to_bigquery_event_data(GCS_URI, TABLE_ID, bigquery_table_schema)

Let's check whether we have data from both 2018-01-01 and 2018-01-02 by using this SQL query on the BigQuery console:

In [None]:
SELECT distinct(date(start_date))
FROM `[your project id].raw_bikesharing.trips`;

And finally, we want to make sure that no records have been duplicated by using this SQL query:

In [None]:
SELECT count(*) cnt_trip_id, trip_id 
FROM `[your project id].raw_bikesharing.trips`
GROUP BY trip_id 
HAVING cnt_trip_id > 1;

## Step 3b: Handle the daily batch data loading for the stations table

In [None]:
from google.cloud import bigquery

# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Destination table for the stations data.
TABLE_ID = f"{PROJECT_ID}.raw_bikesharing.stations"
# CSV file under the 20180102 folder of the project's data bucket.
GCS_URI = (
    f"gs://{PROJECT_ID}-data-bucket"
    "/mysql_export/stations/20180102/stations.csv"
)


def load_gcs_to_bigquery_snapshot_data(GCS_URI, TABLE_ID, table_schema):
    """Load a CSV file from GCS into a table using WRITE_TRUNCATE.

    Creates its own BigQuery client, waits for the load job to finish and
    prints the row count reported for the destination table.

    Args:
        GCS_URI: Source URI handed to the load job.
        TABLE_ID: ID of the destination table.
        table_schema: Schema passed to the load job configuration.
    """
    bq_client = bigquery.Client()
    snapshot_config = bigquery.LoadJobConfig(
        write_disposition='WRITE_TRUNCATE',
        source_format=bigquery.SourceFormat.CSV,
        schema=table_schema,
    )

    # Start the load and block until it completes.
    bq_client.load_table_from_uri(
        GCS_URI, TABLE_ID, job_config=snapshot_config
    ).result()

    row_count = bq_client.get_table(TABLE_ID).num_rows
    print("Loaded {} rows to table {}".format(row_count, TABLE_ID))


# Schema for the stations CSV load: one SchemaField(name, type) per column.
# region_id is declared as STRING here.
bigquery_table_schema = [
    bigquery.SchemaField("station_id", "STRING"),
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("region_id", "STRING"),
    bigquery.SchemaField("capacity", "INTEGER")
]

In [None]:
load_gcs_to_bigquery_snapshot_data(GCS_URI, TABLE_ID, bigquery_table_schema)

## Step 4a: Create Fact Table

In [None]:
import sys
from google.cloud import bigquery

# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Fact table in the dwh_bikesharing dataset.
TARGET_TABLE_ID = f"{PROJECT_ID}.dwh_bikesharing.fact_trips_daily"


def create_fact_table(PROJECT_ID, TARGET_TABLE_ID, load_date):
    """Aggregate one day of trips per start station into the fact table.

    Counts trips and computes the sum and average of ``duration_sec`` for
    every start station on ``load_date``, keeping only trips whose start
    station has a match in the stations table (inner join).

    The result is written to ``TARGET_TABLE_ID`` with WRITE_APPEND, so
    calling this again with the same ``load_date`` appends the same
    aggregates a second time. An exception raised while waiting for the job
    is printed rather than re-raised.

    Args:
        PROJECT_ID: Project that holds the ``raw_bikesharing`` dataset.
        TARGET_TABLE_ID: ID of the destination table for the query result.
        load_date: Date string (for example '2018-01-01') selecting which
            day of trips to aggregate.
    """
    print("\nLoad date:", load_date)

    client = bigquery.Client()
    job_config = bigquery.QueryJobConfig(
        destination=TARGET_TABLE_ID,
        write_disposition='WRITE_APPEND')

    # Both source tables live in PROJECT_ID; load_date is only the day
    # filter. Every placeholder is named so it matches the keyword arguments
    # given to format().
    sql = """SELECT DATE(start_date) as trip_date,
          start_station_id,
          COUNT(trip_id) as total_trips,
          SUM(duration_sec) as sum_duration_sec,
          AVG(duration_sec) as avg_duration_sec
          FROM `{PROJECT_ID}.raw_bikesharing.trips` trips
          JOIN `{PROJECT_ID}.raw_bikesharing.stations` stations
          ON trips.start_station_id = stations.station_id
          WHERE DATE(start_date) = DATE('{load_date}')
          GROUP BY trip_date, start_station_id
          ;""".format(PROJECT_ID=PROJECT_ID, load_date=load_date)

    query_job = client.query(sql, job_config=job_config)

    try:
        # Block until the query job finishes.
        query_job.result()
        print("Query success")
    except Exception as exception:
        print(exception)


In [None]:
create_fact_table(PROJECT_ID, TARGET_TABLE_ID, load_date='2018-01-01')

# Run it again to load the next day's data:
create_fact_table(PROJECT_ID, TARGET_TABLE_ID, load_date='2018-01-02')

## Step 4b: Create dimension table

In [None]:
from google.cloud import bigquery

# TODO : Change to your project id
PROJECT_ID = "sparsh-data-eng-on-gcp"
# Dimension table in the dwh_bikesharing dataset.
TARGET_TABLE_ID = f"{PROJECT_ID}.dwh_bikesharing.dim_stations"


def create_dim_table(PROJECT_ID, TARGET_TABLE_ID):
    """Build the stations dimension by joining stations with regions.

    Selects station id, station name, region name and capacity, matching
    ``stations.region_id`` against ``regions.region_id`` cast to STRING. The
    result is written to ``TARGET_TABLE_ID`` with WRITE_TRUNCATE. An
    exception raised while waiting for the job is printed rather than
    re-raised.

    Args:
        PROJECT_ID: Project that holds the ``raw_bikesharing`` dataset.
        TARGET_TABLE_ID: ID of the destination table for the query result.
    """
    template = """SELECT station_id,
          stations.name as station_name,
          regions.name as region_name,
          capacity
          FROM `{PROJECT_ID}.raw_bikesharing.stations` stations
          JOIN `{PROJECT_ID}.raw_bikesharing.regions` regions
          ON stations.region_id = CAST(regions.region_id AS STRING)
          ;"""
    dim_query = template.format(PROJECT_ID=PROJECT_ID)

    bq_client = bigquery.Client()
    truncate_config = bigquery.QueryJobConfig(
        destination=TARGET_TABLE_ID,
        write_disposition='WRITE_TRUNCATE')
    dim_job = bq_client.query(dim_query, job_config=truncate_config)

    try:
        # Block until the query job finishes.
        dim_job.result()
        print("Query success")
    except Exception as exception:
        print(exception)

In [None]:
create_dim_table(PROJECT_ID, TARGET_TABLE_ID)