# Text Classification using TensorFlow and Google Cloud

The [bigquery-public-data:hacker_news](https://cloud.google.com/bigquery/public-data/hacker-news) dataset contains all stories and comments from Hacker News since its launch in 2006. Each story contains a story id, URL, the title of the story, the author that made the post, when it was written, and the number of points the story received.

The objective is, given the title of the story, we want to build an ML model that can predict the source of this story.

### This notebook illustrates:
* Creating ML datasets using Dataflow
* Creating classification models with the TensorFlow Estimator APIs & TF Hub
* Train the best model using Cloud ML Engine
* Deploy the model on Cloud ML Engine and perform predictions


## Note: Use Python 3 Kernel 

In [None]:
%%bash

# Show the project id that gcloud is currently configured to use.
echo "PROJECT_ID: $(gcloud config get-value project)"

In [None]:
# Fill these in before running the notebook:
#   BUCKET  - Cloud Storage bucket name (used below as gs://<BUCKET>)
#   PROJECT - GCP project id (passed to the Beam pipeline options)
#   REGION  - region used when creating the bucket and in the Beam pipeline options
BUCKET = ''
PROJECT = ''
REGION = 'us-central1'

In [None]:
import os

# Mirror the notebook settings into the process environment so that the
# %%bash cells can reference them (e.g. ${BUCKET} and ${REGION}).
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [None]:
%%bash
# Create the bucket in ${REGION} unless `gsutil ls` already lists gs://${BUCKET}/.
if ! gsutil ls | grep -q gs://${BUCKET}/; then
  gsutil mb -l ${REGION} gs://${BUCKET}
fi

In [None]:
import os


class Params:
    """Namespace for notebook-wide settings; attributes are attached below."""


# Execution target: 'local' keeps data and models on the local disk, any other
# value (i.e. 'GCP') places them under gs://<BUCKET>/.
Params.PLATFORM = 'local' # local | GCP

# Bucket name used to build the gs:// paths below. It is read from the BUCKET
# environment variable that the setup cell exports, so that switching
# PLATFORM to 'GCP' no longer fails on an undefined Params.BUCKET attribute.
Params.BUCKET = os.environ.get('BUCKET', '')

# Root directory for the raw data files and everything derived from them.
Params.DATA_DIR = 'data/news'  if Params.PLATFORM == 'local' else 'gs://{}/data/news'.format(Params.BUCKET)
Params.TRANSFORMED_DATA_DIR = os.path.join(Params.DATA_DIR, 'transformed')

# File-name prefixes of the sharded train / eval TSV files.
Params.RAW_TRAIN_DATA_FILE_PREFEX = os.path.join(Params.DATA_DIR, 'train')
Params.RAW_EVAL_DATA_FILE_PREFEX = os.path.join(Params.DATA_DIR, 'eval')

# Root directory for model checkpoints and exports.
Params.MODELS_DIR = 'models/news' if Params.PLATFORM == 'local' else 'gs://{}/models/news'.format(Params.BUCKET)

# Scratch space used by the Beam pipeline (temp and staging locations).
Params.TEMP_DIR = os.path.join(Params.DATA_DIR, 'tmp')

# Run the data-preparation pipeline when True.
Params.TRANSFORM = True

# Run training when True.
Params.TRAIN = True

# Keep the existing model directory and continue from it when True.
Params.RESUME_TRAINING = False

# Use TensorFlow eager execution when True.
Params.EAGER = False

if Params.EAGER:
    # TensorFlow is only imported further down in the notebook, so import it
    # here to make sure `tf` is bound when this cell runs.
    import tensorflow as tf
    tf.enable_eager_execution()

## Create ML Data Files using Dataflow

The data processing pipeline will do the following:
1. Read the data (key, title, source) from BigQuery
2. Process text (if needed) and convert each BQ row to tsv
3. Save data to tsv files

### 1. Source Query

In [None]:
# BigQuery query that produces the (key, title, source) rows.
#
# Inner SELECT:
#   * REGEXP_EXTRACT captures the host part of the url (the text between
#     '://' and the next '/'); splitting it on '.', reversing and taking
#     OFFSET(1) yields the second-to-last label, e.g. 'github' for 'github.com'.
#   * key is the absolute FARM_FINGERPRINT hash of the title.
#   * only hosts matching '.com$' and titles longer than 10 characters are kept.
# Outer SELECT:
#   * every title character outside [a-zA-Z0-9 $.-] is replaced by a space.
#   * only the sources github, nytimes and techcrunch are kept.
#
# NOTE(review): the '.' in '.com$' is an unescaped regex wildcard, so it matches
# any character followed by 'com' at the end of the host - confirm whether a
# literal dot was intended.
bq_query = '''
SELECT
    key,
    REGEXP_REPLACE(title, '[^a-zA-Z0-9 $.-]', ' ') AS title, 
    source
FROM
(
    SELECT
        ARRAY_REVERSE(SPLIT(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), '.'))[OFFSET(1)] AS source,
        title,
        ABS(FARM_FINGERPRINT(title)) AS Key
    FROM
      `bigquery-public-data.hacker_news.stories`
    WHERE
      REGEXP_CONTAINS(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), '.com$')
      AND LENGTH(title) > 10
)
WHERE (source = 'github' OR source = 'nytimes' OR source = 'techcrunch')
'''

### 2. Beam Pipeline

In [None]:
import apache_beam as beam


def to_tsv(bq_row):
    """Render one BigQuery row as a single tab-separated line.

    Args:
        bq_row: mapping that provides the 'key', 'title' and 'source' entries.

    Returns:
        The three values converted with str() and joined by tabs, in the
        order key, title, source.
    """
    ordered_columns = ('key', 'title', 'source')

    # Hook: any additional processing of bq_row['title'] would go here.

    fields = map(str, (bq_row[name] for name in ordered_columns))
    return '\t'.join(fields)



def run_pipeline(runner, opts):
    """Build and run the Beam job that writes the train and eval TSV files.

    For each of the two splits the pipeline reads the rows selected by
    ``bq_query`` from BigQuery, converts every row with ``to_tsv`` and writes
    the lines to five '.tsv' shards under the split's file prefix. Rows are
    assigned to a split by ``MOD(key, 100)``: values <= 75 go to train, the
    rest to eval.

    Args:
        runner: Beam runner name handed to ``beam.Pipeline``. The call only
            blocks until the job finishes when this is 'DirectRunner'.
        opts: pipeline options handed to ``beam.Pipeline``.
    """
    pipeline = beam.Pipeline(runner, options=opts)

    print("Sink train data files: {}".format(Params.RAW_TRAIN_DATA_FILE_PREFEX))
    # Label the eval sink explicitly so the two log lines can be told apart.
    print("Sink eval data files: {}".format(Params.RAW_EVAL_DATA_FILE_PREFEX))
    print("Temporary directory: {}".format(Params.TEMP_DIR))
    print("")

    for step in ['train', 'eval']:

        if step == 'train':
            source_query = 'SELECT * FROM ({}) WHERE MOD(key,100) <= 75'.format(bq_query)
            sink_location = Params.RAW_TRAIN_DATA_FILE_PREFEX
        else:
            source_query = 'SELECT * FROM ({}) WHERE MOD(key,100) > 75'.format(bq_query)
            sink_location = Params.RAW_EVAL_DATA_FILE_PREFEX

        # WriteToText is itself a PTransform, so it is applied directly
        # instead of being wrapped in beam.io.Write.
        (
            pipeline
            | '{} - Read from BigQuery'.format(step) >> beam.io.Read(
                beam.io.BigQuerySource(query=source_query, use_standard_sql=True))
            | '{} - Process to TSV'.format(step) >> beam.Map(to_tsv)
            | '{} - Write to TSV '.format(step) >> beam.io.WriteToText(
                sink_location, file_name_suffix='.tsv', num_shards=5)
        )

    job = pipeline.run()
    if runner == 'DirectRunner':
        job.wait_until_finish()
    

### 3. Run Pipeline

In [None]:
from datetime import datetime
import shutil

# Timestamped job name so that repeated runs do not collide.
job_name = 'preprocess-hackernews-data' + '-' + datetime.utcnow().strftime('%y%m%d-%H%M%S')

options = {
    'region': REGION,
    'staging_location': os.path.join(Params.TEMP_DIR, 'staging'),
    'temp_location': Params.TEMP_DIR,
    'job_name': job_name,
    'project': PROJECT
}

opts = beam.pipeline.PipelineOptions(flags=[], **options)
# Run in-process for local experiments and submit to Cloud Dataflow otherwise
# (previously both branches selected 'DirectRunner').
runner = 'DirectRunner' if Params.PLATFORM == 'local' else 'DataflowRunner'

if Params.TRANSFORM:

    if Params.PLATFORM == 'local':
        # Start from a clean local data directory.
        shutil.rmtree(Params.DATA_DIR, ignore_errors=True)

    # print() calls rather than Python 2 print statements: the notebook
    # requires a Python 3 kernel, where the statement form is a SyntaxError.
    print('Launching {} job {} ... hang on'.format(runner, job_name))

    # NOTE: run_pipeline only waits for completion with 'DirectRunner'; with any
    # other runner the message below is printed once the job has been submitted.
    run_pipeline(runner, opts)
    print("Pipline completed.")
else:
    print("Transformation skipped!")

In [None]:
%%bash

# List the local data directory and preview the first lines of the first
# training shard (the paths assume the local 'data/news' location).
ls data/news
echo ""
head data/news/train-00000-of-00005.tsv

## TF Text Classification Model with TF Hub for Text Encoding

### 1. Define metadata & input function

In [None]:
import tensorflow as tf
from tensorflow import data
# Show the installed TensorFlow version; the cells below rely on tf.contrib
# and tf.estimator being available.
print(tf.__version__)

In [None]:
# Column order of every TSV line (same order that to_tsv writes).
RAW_HEADER = 'key,title,source'.split(',')
# One default per column: each column is parsed as a string, 'NA' when empty.
RAW_DEFAULTS = [['NA'],['NA'],['NA']]
# Column that parse_tsv returns as the label.
TARGET_FEATRUE_NAME = 'source'
# Label vocabulary; the same three sources the BigQuery query selects.
TARGET_LABELS = ['github', 'nytimes', 'techcrunch']
# Name of the text column (not referenced elsewhere in the visible code).
TEXT_FEATURE_NAME = 'title'
# Row identifier column; parse_tsv removes it from the features.
KEY_COLUMN = 'key'

def parse_tsv(tsv_row):
    """Split a tab-separated line into a feature dict and its label.

    Args:
        tsv_row: string tensor holding a TSV line with the RAW_HEADER columns.

    Returns:
        A ``(features, target)`` pair: ``features`` maps the remaining column
        names to their tensors once the key and target columns are removed,
        and ``target`` is the tensor of the target column.
    """
    parsed = tf.decode_csv(tsv_row, record_defaults=RAW_DEFAULTS, field_delim='\t')
    named = {name: tensor for name, tensor in zip(RAW_HEADER, parsed)}

    # The key only identifies the row; it is not a model input.
    del named[KEY_COLUMN]
    label = named.pop(TARGET_FEATRUE_NAME)

    return named, label


def generate_tsv_input_fn(files_pattern, 
                          mode=tf.estimator.ModeKeys.EVAL, 
                          num_epochs=1, 
                          batch_size=200):
    """Create an Estimator input function that reads the TSV shards.

    Args:
        files_pattern: glob pattern of the TSV files to read.
        mode: accepted for interface compatibility; the pipeline built here
            does not depend on it (data is shuffled in every mode).
        num_epochs: number of passes over the data.
        batch_size: number of rows per batch; the shuffle buffer holds
            ``2 * batch_size`` rows.

    Returns:
        A zero-argument function. In graph mode it returns the
        ``(features, target)`` tensors of the next batch; when
        ``Params.EAGER`` is set it returns the ``tf.data`` dataset itself.
    """

    def _input_fn():

        file_names = tf.matching_files(files_pattern)

        if Params.EAGER:
            print(file_names)

        dataset = data.TextLineDataset(file_names)

        dataset = dataset.apply(
                tf.contrib.data.shuffle_and_repeat(count=num_epochs,
                                                   buffer_size=batch_size*2)
        )

        dataset = dataset.apply(
                tf.contrib.data.map_and_batch(parse_tsv, 
                                              batch_size=batch_size, 
                                              num_parallel_batches=2)
        )

        # The result of prefetch() used to be assigned to a misspelled name
        # ("datset"), so prefetching never took effect. The dataset is already
        # batched at this point, so the buffer size counts batches: one batch
        # is enough to overlap input preparation with the training step.
        dataset = dataset.prefetch(1)

        if Params.EAGER:
            return dataset

        iterator = dataset.make_one_shot_iterator()
        features, target = iterator.get_next()
        return features, target
    
    return _input_fn

In [None]:
# Install the tensorflow-hub package, imported below as `hub`.
!pip install tensorflow-hub

### 2. Create feature columns

In [None]:
import tensorflow_hub as hub
# Show the installed TensorFlow Hub version.
print(hub.__version__)

In [None]:
def create_feature_columns(hparams):
    """Build the feature columns consumed by the classifier.

    The only column is a TF Hub text-embedding column over the "title"
    feature, backed by the Universal Sentence Encoder module.

    Args:
        hparams: hyperparameters object. Its optional ``trainable_embedding``
            attribute decides whether the embedding module's weights are
            fine-tuned; when the attribute is absent the module stays frozen,
            which is the TF Hub default.

    Returns:
        A list with the single title embedding column.
    """
    title_embeding_column = hub.text_embedding_column(
        "title",
        "https://tfhub.dev/google/universal-sentence-encoder/1",
        # Previously this hyperparameter was declared but never used.
        trainable=getattr(hparams, 'trainable_embedding', False))

    feature_columns = [title_embeding_column]

    print("feature columns: \n {}".format(feature_columns))
    print("")

    return feature_columns
    

### 3. Create a model using the premade DNNClassifier

In [None]:
def create_estimator_hub(hparams, run_config):
    """Assemble the premade DNN classifier on top of the TF Hub title embedding.

    Args:
        hparams: hyperparameters providing ``learning_rate`` and
            ``hidden_units`` (also forwarded to ``create_feature_columns``).
        run_config: ``tf.estimator.RunConfig`` passed to the estimator.

    Returns:
        A ``tf.estimator.DNNClassifier`` with one class per entry of
        TARGET_LABELS, trained with the Adam optimizer.
    """
    classifier_kwargs = dict(
        feature_columns=create_feature_columns(hparams),
        n_classes=len(TARGET_LABELS),
        label_vocabulary=TARGET_LABELS,
        hidden_units=hparams.hidden_units,
        optimizer=tf.train.AdamOptimizer(learning_rate=hparams.learning_rate),
        config=run_config,
    )
    return tf.estimator.DNNClassifier(**classifier_kwargs)

### 4. Define experiment

##### a) HParams and RunConfig

In [None]:
TRAIN_SIZE = 73124
NUM_EPOCHS = 10
BATCH_SIZE = 1000

# Floor division keeps the step counts integral. On Python 3,
# TRAIN_SIZE/BATCH_SIZE is a float, which would make max_steps fractional.
STEPS_PER_EPOCH = TRAIN_SIZE // BATCH_SIZE
TOTAL_STEPS = STEPS_PER_EPOCH * NUM_EPOCHS
# Used both as the checkpoint interval and as the evaluation throttle (seconds).
EVAL_EVERY_SEC = 60

hparams  = tf.contrib.training.HParams(
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    trainable_embedding=False,
    learning_rate=0.01,
    hidden_units=[256, 128],
    max_steps=TOTAL_STEPS
)

MODEL_NAME = 'dnn_estimator_hub' 
model_dir = os.path.join(Params.MODELS_DIR, MODEL_NAME)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    log_step_count_steps=1000,
    save_checkpoints_secs=EVAL_EVERY_SEC,
    keep_checkpoint_max=1,
    model_dir=model_dir
)


print(hparams)
print("")
print("Model Directory:", run_config.model_dir)
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:", STEPS_PER_EPOCH)
print("Total Steps:", TOTAL_STEPS)

##### b) Serving function

In [None]:
def generate_serving_input_fn():
    """Create the serving input receiver function used when exporting the model.

    Returns:
        A zero-argument function that builds a ``ServingInputReceiver`` whose
        only input is a string placeholder named 'title' with an unspecified
        first dimension. The same dict serves as both the receiver tensors
        and the model features.
    """

    def _serving_fn():
        title_placeholder = tf.placeholder(dtype=tf.string, shape=[None])
        inputs = {'title': title_placeholder}

        return tf.estimator.export.ServingInputReceiver(
            features=inputs, receiver_tensors=inputs)

    return _serving_fn

##### c) TrainSpec & EvalSpec

In [None]:
# Training: read every file that starts with the train prefix, for the
# configured number of epochs, and stop after hparams.max_steps steps.
train_input_fn = generate_tsv_input_fn(
    Params.RAW_TRAIN_DATA_FILE_PREFEX+"*",
    mode=tf.estimator.ModeKeys.TRAIN,
    num_epochs=hparams.num_epochs,
    batch_size=hparams.batch_size
)
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=hparams.max_steps,
    hooks=None
)

# Evaluation: a single pass over every file that starts with the eval prefix.
eval_input_fn = generate_tsv_input_fn(
    Params.RAW_EVAL_DATA_FILE_PREFEX+"*",
    mode=tf.estimator.ModeKeys.EVAL,
    num_epochs=1,
    batch_size=hparams.batch_size
)

# Exports a SavedModel into the "estimate" folder under <model_dir>/export,
# keeping only the most recent export.
latest_exporter = tf.estimator.LatestExporter(
    name="estimate",
    serving_input_receiver_fn=generate_serving_input_fn(),
    exports_to_keep=1,
    as_text=False
)

eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn,
    exporters=[latest_exporter],
    steps=None,
    throttle_secs=EVAL_EVERY_SEC
)

### 5. Run experiment

In [None]:
from datetime import datetime
import shutil

# Train and evaluate the estimator, reporting wall-clock start, end and
# duration. Nothing is run when Params.TRAIN is off.
if not Params.TRAIN:
    print("Training was skipped!")
else:
    if Params.RESUME_TRAINING:
        print("Resuming training...")
    else:
        # A fresh run starts from an empty model directory.
        print("Removing previous training artefacts...")
        shutil.rmtree(model_dir, ignore_errors=True)

    tf.logging.set_verbosity(tf.logging.INFO)

    time_start = datetime.utcnow()
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................")

    estimator = create_estimator_hub(hparams, run_config)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow()
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

### 6. Evaluate the model

In [None]:
# Number of rows in the train and validation files. Each set is evaluated
# as one single batch (batch_size = set size, steps = 1).
TRAIN_SIZE = 73124
VALID_SIZE = 23079

tf.logging.set_verbosity(tf.logging.ERROR)

# Re-create the estimator from the same hparams and run_config used for training.
estimator = create_estimator_hub(hparams, run_config)

train_metrics = estimator.evaluate(
    input_fn=generate_tsv_input_fn(
        files_pattern=Params.RAW_TRAIN_DATA_FILE_PREFEX+"*",
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=TRAIN_SIZE),
    steps=1
)


print("############################################################################################")
print("# Train Measures: {}".format(train_metrics))
print("############################################################################################")

# The validation pass previously reused TRAIN_SIZE as its batch size and left
# VALID_SIZE unused; size the batch by the validation set instead.
eval_metrics = estimator.evaluate(
    input_fn=generate_tsv_input_fn(
        files_pattern=Params.RAW_EVAL_DATA_FILE_PREFEX+"*",
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=VALID_SIZE),
    steps=1
)
print("")
print("############################################################################################")
print("# Valid Measures: {}".format(eval_metrics))
print("############################################################################################")


## 7. Use SavedModel for predictions 

In [None]:
import os

export_dir = model_dir +"/export/estimate/"

# Each export lives in a sub-directory named by a numeric timestamp.
# os.listdir() returns entries in arbitrary order, so pick the newest
# timestamp explicitly and ignore anything that is not a finished export.
export_versions = sorted(
    entry for entry in os.listdir(export_dir) if entry.isdigit()
)
saved_model_dir = os.path.join(export_dir, export_versions[-1])

print(saved_model_dir)
print("")

# Load the exported model's "predict" signature as a plain Python callable.
predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="predict"
)

# Score a few example titles; the input key matches the serving placeholder.
output = predictor_fn(
    {
        'title':[
            'Microsoft and Google are joining forces for a new AI framework',
            'A new version of Python is mind blowing',
            'EU is investigating new data privacy policies'
        ]
        
    }
)
print(output)