# BigQuery data query

In [13]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Load the service-account key and request the cloud-platform scope;
# the BigQuery client below runs against the project named in that key.
cloud_platform_scope = "https://www.googleapis.com/auth/cloud-platform"
credentials = service_account.Credentials.from_service_account_file(
    "../credentials/tokopedia-970.json", scopes=[cloud_platform_scope]
)

client = bigquery.Client(project=credentials.project_id, credentials=credentials)

In [14]:
# Select each user's full name plus a gender label derived from the
# numeric `sex` column (1 -> 'm', 2 -> 'f'); rows with any other code
# are filtered out by the WHERE clause.
sql = """
SELECT full_name as name, CASE WHEN sex=1 THEN 'm' 
                               WHEN sex=2 THEN 'f' 
                          END as gender 
FROM `tokopedia-970.voyager_dwh.bi_dim_user` 
WHERE sex=1 or sex=2"""
query_job = client.query(sql)
names_df = query_job.to_dataframe()
names_df.head()

Unnamed: 0,name,gender
0,Ge Mahenterissa,f
1,Shafa Tasha Nabilah,f
2,Ririn Ramanda,f
3,Anta,f
4,Nuryani Ghelis,f


In [15]:
# Persist the query result as CSV; the positional index is left out so
# the file holds only the frame's own columns.
csv_path = "../data/toko-names.csv"
names_df.to_csv(csv_path, index=False)

In [16]:
# Display (row count, column count) of the queried frame.
names_df.shape

(43991434, 2)

# Writing to sharded TFRecords

In [1]:
#!pip install ipywidgets

In [2]:
#!jupyter nbextension enable --py widgetsnbextension

In [3]:
#!sudo jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
def write_tfrecord(prefix, chunk, idx, out_dir="../data/nonzip", options=None):
    """Write one DataFrame chunk to a single TFRecord shard.

    Every row becomes a ``tf.train.Example`` holding two bytes features:
    ``"name"`` (the row's first column) and ``"gender"`` (the row's last
    column), both UTF-8 encoded. The shard is written to
    ``<out_dir>/toko_names_<prefix>_<idx>.tfrecord`` with ``idx``
    zero-padded to three digits.

    Args:
        prefix: Split label embedded in the file name (e.g. "train").
        chunk: pandas DataFrame whose first column is the name and whose
            last column is the label.
        idx: Non-negative integer shard index.
        out_dir: Directory receiving the shard; created when missing.
            Defaults to the location that used to be hard-coded.
        options: Optional ``tf.io.TFRecordOptions`` (for example
            ``compression_type="ZLIB"``). ``None`` writes uncompressed
            records, as before.
    """
    # Make sure the target directory exists before the writer opens the file.
    tf.io.gfile.makedirs(out_dir)
    shard_path = out_dir + "/toko_names_" + prefix + "_{:>03d}".format(idx) + ".tfrecord"

    with tf.io.TFRecordWriter(shard_path, options=options) as writer:
        for row in chunk.values:
            # str() guards against cells that are not Python strings
            # (e.g. a numeric-looking name parsed as a number), which
            # would otherwise fail on .encode().
            name_bytes = str(row[0]).encode("utf-8")
            gender_bytes = str(row[-1]).encode("utf-8")

            name = tf.train.Feature(bytes_list=tf.train.BytesList(value=[name_bytes]))
            gender = tf.train.Feature(bytes_list=tf.train.BytesList(value=[gender_bytes]))

            data_dict = tf.train.Features(feature={"name": name, "gender": gender})
            example = tf.train.Example(features=data_dict)

            writer.write(example.SerializeToString())

def serialize(chunk_df, split_points=(.7, .9), seed=None):
    """Split each chunk into train/val/test parts and write them as shards.

    For every chunk, rows containing any missing value are dropped, the
    remaining rows are shuffled, and the result is cut at the two
    cumulative ``split_points``. Each part is handed to ``write_tfrecord``
    together with the chunk's position, so every chunk yields one
    train, one val and one test shard.

    Args:
        chunk_df: Iterable of pandas DataFrames, e.g. the reader returned
            by ``pd.read_csv(..., chunksize=...)``. The frames themselves
            are not modified.
        split_points: ``(train_stop, val_stop)`` cumulative fractions.
            The default ``(.7, .9)`` gives a 70/20/10 split.
        seed: Optional integer base seed. Chunk ``i`` is shuffled with
            ``seed + i`` so runs are repeatable without reusing one
            permutation for every chunk. ``None`` leaves the shuffle
            unseeded, as before.
    """
    train_stop, val_stop = split_points

    for idx, chunk in enumerate(chunk_df):
        # Non-mutating dropna: the caller's frame keeps its original rows.
        complete = chunk.dropna(how="any")
        chunk_seed = None if seed is None else seed + idx
        shuffled = complete.sample(frac=1, random_state=chunk_seed)

        # Positional slicing yields the same three parts as np.split but
        # avoids calling np.split on a DataFrame, which newer pandas
        # versions deprecate.
        n_rows = len(shuffled)
        train_end = int(train_stop * n_rows)
        val_end = int(val_stop * n_rows)

        write_tfrecord("train", shuffled.iloc[:train_end], idx)
        write_tfrecord("val", shuffled.iloc[train_end:val_end], idx)
        write_tfrecord("test", shuffled.iloc[val_end:], idx)

In [None]:
%%time
# Each chunk has 5M names
names_df = pd.read_csv("../data/toko-names.csv", chunksize=5000000, engine='c', iterator=True)
serialize(names_df)