# BigQuery data query

In [1]:
from google.cloud import bigquery
# Shared BigQuery client used by the query cell below.
# NOTE(review): no project or credentials are passed here, so whatever defaults
# the client library resolves from the environment are used — confirm they are set.
client = bigquery.Client()

In [2]:
# Collapse the public USA names table to one row per (name, gender) pair.
# num_names is COUNT(name) within each group, i.e. the number of source rows
# sharing that pair; it is not used when the CSV is written later on.
sql = """
SELECT
  name,
  gender,
  COUNT(name) AS num_names
FROM
  `bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
  name,
  gender
"""
# Run the query through the shared client and pull the result into a DataFrame.
names_df = client.query(sql).to_dataframe()
# Print the (rows, columns) shape, then leave the first rows as the cell's output.
print(names_df.shape)
names_df.head()

(34952, 3)


Unnamed: 0,name,gender,num_names
0,Helen,F,4846
1,Mary,F,5551
2,Margaret,F,5463
3,Dorothy,F,4474
4,Ruth,F,4946


In [4]:
# Persist only the name and gender columns; the DataFrame index is not written.
names_df.to_csv(
    "../data/us-names.csv",
    columns=['name', 'gender'],
    index=False,
)

# Writing to sharded TFRecords

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [4]:
def write_tfrecord(prefix, chunk, idx, output_dir="../data/tfrecords", options=None):
    """Write one shard of (name, gender) examples to a TFRecord file.

    The shard is written to ``<output_dir>/us_names_<prefix>_<idx>.tfrecord``,
    with ``idx`` zero-padded to three digits. Each row of ``chunk`` becomes one
    ``tf.train.Example`` carrying two UTF-8 encoded bytes features, ``"name"``
    and ``"gender"``.

    Args:
        prefix: Split label embedded in the file name (e.g. ``"train"``).
        chunk: pandas DataFrame whose first column holds the name and whose
            last column holds the gender label, both as ``str`` values.
        idx: Non-negative integer shard index.
        output_dir: Directory the shard is written into. It is created if it
            does not exist yet. Defaults to the notebook's original location.
        options: Optional ``tf.io.TFRecordOptions`` (for example to enable
            compression). ``None`` writes uncompressed records.
    """
    # Local import keeps this cell runnable even if only the TensorFlow/pandas
    # import cell has been executed.
    import os

    # The writer cannot create missing parent directories itself, so make sure
    # the target directory exists before opening the shard.
    os.makedirs(output_dir, exist_ok=True)
    shard_name = "us_names_" + prefix + "_{:>03d}".format(idx) + ".tfrecord"
    shard_path = os.path.join(output_dir, shard_name)

    with tf.io.TFRecordWriter(shard_path, options=options) as writer:
        for row in chunk.values:
            # Positional access: first column is the name, last is the label.
            name_value, gender_value = row[0], row[-1]

            name = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[name_value.encode("utf-8")])
            )
            gender = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[gender_value.encode("utf-8")])
            )

            data_dict = tf.train.Features(feature={"name": name, "gender": gender})
            example = tf.train.Example(features=data_dict)

            writer.write(example.SerializeToString())

def serialize(chunk_df, seed=None):
    """Shuffle, split and write every chunk as train/val/test TFRecord shards.

    Each chunk yielded by ``chunk_df`` is cleaned of rows containing missing
    values, shuffled, and cut 70% / 20% / 10% into train, validation and test
    parts. Every part is handed to ``write_tfrecord`` together with the chunk's
    position, so each chunk produces one shard per split.

    Args:
        chunk_df: Iterable of pandas DataFrames, e.g. the reader returned by
            ``pd.read_csv(..., chunksize=...)``.
        seed: Optional integer seed for the shuffle. When given, the splits are
            reproducible across runs. ``None`` (the default) shuffles with
            NumPy's global random state, as before.
    """
    # One generator for the whole run, so consecutive chunks get different
    # permutations while the run as a whole stays reproducible.
    rng = None if seed is None else np.random.RandomState(seed)

    for idx, chunk in enumerate(chunk_df):
        # dropna returns a new frame, so the caller's chunk is left untouched.
        shuffled = chunk.dropna(how="any").sample(frac=1, random_state=rng)

        n_rows = len(shuffled)
        train_end, val_end = int(.7 * n_rows), int(.9 * n_rows)

        # Plain positional slicing replaces np.split on a DataFrame, which goes
        # through the deprecated DataFrame.swapaxes code path.
        train_chunk = shuffled.iloc[:train_end]
        val_chunk = shuffled.iloc[train_end:val_end]
        test_chunk = shuffled.iloc[val_end:]

        write_tfrecord("train", train_chunk, idx)
        write_tfrecord("val", val_chunk, idx)
        write_tfrecord("test", test_chunk, idx)

In [6]:
%%time
# Stream the CSV back in pieces of at most 20,000 rows and shard each piece.
names_df = pd.read_csv(
    "../data/us-names.csv",
    engine='c',
    chunksize=20000,
    iterator=True,
)
serialize(names_df)

CPU times: user 946 ms, sys: 11.9 ms, total: 958 ms
Wall time: 956 ms
