##### This notebook imports the source data for this project into BigQuery.
##### The source data is first exported to GCS in CSV format. The BigQuery source table is "bigquery-public-data.ml_datasets.ulb_fraud_detection". This is a table of credit card transactions that are classified as fraudulent (Class = 1) or normal (Class = 0).

##### This table has 284807 credit card transactions classified as fraudulent or normal in the column Class. In order to protect confidentiality, the original features have been transformed using PCA (principal component analysis) into 28 features named V1, V2, ..., V28 (float).
##### Two descriptive features are provided without transformation by PCA.
##### Time (integer): the number of seconds elapsed between the transaction and the earliest transaction in the table
##### Amount (float): is the value of the transaction.

#### Preparation of Data
##### transaction_id (string): a unique id for each row/transaction
##### splits (string): this divides the transactions into the set of TRAIN(80%), VALIDATE(10%), TEST(10%)

In [4]:
# Setup: resolve the GCP project id from the local gcloud configuration.
# The IPython `!` capture stores the command's output lines as a list, so the
# first element is taken as the project id.
# NOTE(review): assumes the first output line is the project id - confirm that
# gcloud prints nothing (e.g. warnings) ahead of it in this environment.
project = !gcloud config get-value project
PROJECT_ID = project[0]
# Bare last expression so the notebook displays the resolved value.
PROJECT_ID

'ml-pipeline-project-401216'

In [9]:
# --- Run configuration -------------------------------------------------------
# SERIES / EXPERIMENT are used below to build the GCS object path and labels.
SERIES = "01"
EXPERIMENT = "01"
REGION = "us-central1"

# --- BigQuery source (public table the data is exported from) ----------------
BQ_SOURCE = "bigquery-public-data.ml_datasets.ulb_fraud_detection"

# --- BigQuery destination (project / dataset / table this notebook creates) --
BQ_PROJECT = PROJECT_ID
BQ_DATASET = "fraud"
BQ_TABLE = "fraud"

In [10]:
# packages and clients:
from google.cloud import storage
from google.cloud import bigquery

# One client per service, both bound to the project resolved in the setup cell.
bq = bigquery.Client(project=PROJECT_ID)
gcs = storage.Client(project=PROJECT_ID)

In [11]:
# Bucket name is derived from the project id with a fixed "-bucket1" suffix.
BUCKET = f"{PROJECT_ID}-bucket1"
BUCKET

'ml-pipeline-project-401216-bucket1'

In [12]:
# Object path of the exported CSV inside the bucket:
# <SERIES>/<EXPERIMENT>/data/<BQ_TABLE>.csv
file = f"{SERIES}/{EXPERIMENT}/data/{BQ_TABLE}.csv"
file

'01/01/data/fraud.csv'

In [15]:
# Export the BigQuery source table to the GCS bucket, skipping the export when
# the target object is already present so the cell can be re-run safely.

bucketDef = gcs.bucket(BUCKET)
export_uri = f"gs://{bucketDef.name}/{file}"
already_exported = storage.Blob(name=file, bucket=bucketDef).exists(gcs)

if not already_exported:
    source = bigquery.TableReference.from_string(BQ_SOURCE)
    extract = bq.extract_table(source=source, destination_uris=[export_uri])
    print("Creating the export....")
    # Wait for the extract job to finish before reporting success.
    extract.result()
    print(f"Exported the table to {export_uri}")
else:
    print(f"The file is already created at: {export_uri}")
    

The file is already created at: gs://ml-pipeline-project-401216-bucket1/01/01/data/fraud.csv


In [16]:
# Show every object stored under this series/experiment prefix in the bucket.
blob_prefix = f"{SERIES}/{EXPERIMENT}"
list(bucketDef.list_blobs(prefix=blob_prefix))

[<Blob: ml-pipeline-project-401216-bucket1, 01/01/data/fraud.csv, 1697184037197809>]

In [21]:
# Print the id of each dataset returned by the BigQuery client, to see whether
# the destination dataset is already there.
datasets = list(bq.list_datasets())
for dataset in datasets:
    print(dataset.dataset_id)

In [27]:
# Ensure the destination dataset exists; exists_ok=True makes this a no-op
# (instead of an error) when the dataset was created on a previous run.

ds = bigquery.Dataset(f"{BQ_PROJECT}.{BQ_DATASET}")
ds.labels = {"experiment": f"{EXPERIMENT}"}
ds.location = REGION
ds = bq.create_dataset(dataset=ds, exists_ok=True)

In [None]:
# creating table in the dataset
#
# Loads the CSV exported to GCS into the destination BigQuery table, but only
# when that table does not exist yet, so re-running the cell does not reload it.

from google.cloud.exceptions import NotFound

# Fully qualified destination table id: <project>.<dataset>.<table>
table_id = "{}.{}.{}".format(BQ_PROJECT, BQ_DATASET, BQ_TABLE)

try:
    # The lookup raises NotFound when the table is missing, which is the
    # signal to run the load job below.
    table = bq.get_table(table_id)
except NotFound:
    print("Creating table....")
    destination = bigquery.TableReference.from_string(table_id)

    job_config = bigquery.LoadJobConfig(
        write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format = bigquery.SourceFormat.CSV,
        autodetect = True,  # let BigQuery infer the schema from the CSV
        labels = {'experiment' : f'{EXPERIMENT}'}
    )

    job = bq.load_table_from_uri(f"gs://{bucketDef.name}/{file}", destination, job_config = job_config)
    # Block until the load job completes; a failed job raises here.
    job.result()
    print("Table created successfully: {}".format(table_id))
else:
    print("The table already exists: {}".format(table_id))