# Custom estimator - binary output

In [1]:
%%bash
source env1/bin/activate

In [14]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from tensorboard import summary as summary_lib
from tensorflow.python.lib.io.file_io import FileIO as open_file

tf.logging.set_verbosity(tf.logging.INFO)

### Constants of the programm

In [58]:
MODEL_DIR = 'model_dir_v2'

INPUT_FILE_CSV = 'gs://guardati-test/comments/raw_Opinioni_Zuegg_LinkMultipli_campi_standard.csv'
N_COMMENTS_TO_IMPORT = 1000

WORDS_NOT_ALLOWED = [',','.',':',';']
INDEX_PADDING = 0

SENTENCE_LEN = 30 # N° of words per sentences

TRAIN_QUOTE = 0.6

TRAIN_STEPS = 10

EMBEDDING_SIZE = 10

### Retreive the csv with the comments
- the comment must to be under a column named 'Testo' 

In [16]:
with open_file(INPUT_FILE_CSV,'r') as f:
    df=pd.read_csv(f,skip_blank_lines=True)
    df = df['Testo'][0:N_COMMENTS_TO_IMPORT]

### Dictionary creation
- words_dict = {'word', word_unique_index}

In [17]:
index = INDEX_PADDING + 1
words_dict = {}
words_dict_inverted = {}
comments_splitted = []

for comment in df:
    for word_not_allowed in WORDS_NOT_ALLOWED:
        comment = comment.replace(word_not_allowed, ' ')
    
    words = comment.lower().split()
    
    comments_splitted.append(words)
    
    for word in words:
        if word not in words_dict:
            words_dict[word] = index
            words_dict_inverted[index] = word
            index += 1

print('len(words_dict) = ',len(words_dict))

('len(words_dict) = ', 9450)


### Convert comments -> list[word1_idx, word2_idx...wordN_idx]

In [22]:
comments_int = []

start = True   # maybe exist a more fashion way

for comment in comments_splitted:
    comment_int = []
    for word in comment:
        comment_int.append(words_dict[word])
        
    if start:
        comments_int = [comment_int[0:SENTENCE_LEN]]
        start = False
    else:    
        comments_int = np.append(comments_int,
                            [comment_int[0:SENTENCE_LEN]],axis=0)
    
comments_int.shape

(1000, 30)

In [23]:
def comment_inverter(comment):
    txt = ''
    for word in comment:
        txt += ' '
        txt += words_dict_inverted[word]
    txt += '\n'
    return txt

### Labels creation - binary classification

In [25]:
np.random.seed(seed=10)
np.set_printoptions(1000)

x_data = comments_int
y_data = np.random.randint(2,size=(len(x_data),1))

y_data.shape

(1000, 1)

### Divide in train and test

In [29]:
len_data = len(x_data)

x_train = x_data[:int(round(len_data * TRAIN_QUOTE))]
y_train = y_data[:int(round(len_data * TRAIN_QUOTE))]

x_test = x_data[len(x_train):]
y_test = y_data[len(y_train):]

print('Train: ',x_train.shape,y_train.shape)
print('Test: ',x_test.shape,y_test.shape)

('Train: ', (600, 30), (600, 1))
('Test: ', (400, 30), (400, 1))


## Estimator

### Input tensor making

In [49]:
def parser(x, y):
    features = {'x': x}
    return features, y

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    dataset = dataset.shuffle(buffer_size = len(x_data))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    dataset = dataset.batch(100)
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

### Estimator tester function

In [50]:
all_classifiers = {}
def train_and_evaluate(classifier):
    # Save a reference to the classifier to run predictions later
    all_classifiers[classifier.model_dir] = classifier
    classifier.train(input_fn=train_input_fn, steps=TRAIN_STEPS)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    predictions = np.array([p['logistic'][0] for p in classifier.predict(input_fn=eval_input_fn)])
        
    # Reset the graph to be able to reuse name scopes
    tf.reset_default_graph() 
    # Add a PR summary in addition to the summaries that the classifier writes
    pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool), num_thresholds=21)
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
        writer.add_summary(sess.run(pr), global_step=0)
        writer.close()

### Canned linear estimator - test

In [51]:
column = tf.feature_column.categorical_column_with_identity('x', len(words_dict))
classifier = tf.estimator.LinearClassifier(feature_columns=[column], model_dir=os.path.join(MODEL_DIR, 'bow_sparse'))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbffc57a810>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'model_dir_v2/bow_sparse', '_train_distribute': None, '_save_summary_steps': 100}


In [52]:
train_and_evaluate(classifier)
print('END')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir_v2/bow_sparse/model.ckpt-30
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 30 into model_dir_v2/bow_sparse/model.ckpt.
INFO:tensorflow:loss = 9.934425, step = 31
INFO:tensorflow:Saving checkpoints for 40 into model_dir_v2/bow_sparse/model.ckpt.
INFO:tensorflow:Loss for final step: 9.727288.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-25-15:23:33
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir_v2/bow_sparse/model.ckpt-40
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-25-15:23:34
INFO:tensorflow:Saving dict for glob

## Custom estimator

In [64]:
head = tf.contrib.estimator.binary_classification_head()

In [65]:
def cnn_model_fn(features, labels, mode, params):    
    
    # INPUT
    input_layer = tf.contrib.layers.embed_sequence(
        features['x'], len(words_dict), EMBEDDING_SIZE,
        initializer=params['embedding_initializer'])
    
    # DROPOUT LAYER
    training = mode == tf.estimator.ModeKeys.TRAIN
    dropout_emb = tf.layers.dropout(inputs=input_layer, 
                                    rate=0.2, 
                                    training=training)
    
    # CNN
    conv = tf.layers.conv1d(
        inputs=dropout_emb,
        filters=32,
        kernel_size=3,
        padding="same",
        activation=tf.nn.relu)
    
    # Global Max Pooling
    pool = tf.reduce_max(input_tensor=conv, axis=1)
    
    # Full connected layer + dropout
    hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
    dropout_hidden = tf.layers.dropout(inputs=hidden, 
                                       rate=0.2, 
                                       training=training)
    
    # Output-logits layer
    logits = tf.layers.dense(inputs=dropout_hidden, units=1)
    
    # Metto il label in (una) colonna
    if labels is not None: 
        labels = tf.reshape(labels, [-1, 1])
        
    # Gradient descend optimizator
    optimizer = tf.train.AdamOptimizer()
    
    # Usata nell'output
    def _train_op_fn(loss):
        return optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
    
    # Output
    # head = tf.contrib.estimator.binary_classification_head()
    output = head.create_estimator_spec(
        features=features,
        labels=labels,
        mode=mode,
        logits=logits, 
        train_op_fn=_train_op_fn)
    
    return output

#### Embedding (farlocco)

In [66]:
params = {'embedding_initializer': tf.random_uniform_initializer(-1.0, 1.0)}

In [67]:
cnn_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir=os.path.join(MODEL_DIR, 'cnn'),
                                        params=params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc001682e50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'model_dir_v2/cnn', '_train_distribute': None, '_save_summary_steps': 100}


In [68]:
train_and_evaluate(cnn_classifier)
print('END')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into model_dir_v2/cnn/model.ckpt.
INFO:tensorflow:loss = 0.72143805, step = 1
INFO:tensorflow:Saving checkpoints for 10 into model_dir_v2/cnn/model.ckpt.
INFO:tensorflow:Loss for final step: 0.7081385.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-25-15:33:15
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from model_dir_v2/cnn/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-25-15:33:15
INFO:tensorflow:Saving dict for global step 10: accuracy = 0.495, accuracy_baseline = 0.52250004, auc = 0.51109743, auc_precision_recall