In [None]:
! pip install tensorflow
! pip install transformers 

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.python.lib.io.tf_record import TFRecordWriter

In [None]:
# Fraction of each CSV that is sampled for this example run
SAMPLE_FRAC = 0.2
# Of the sampled training rows: 80% for training and 20% for validation
TRAIN_FRAC = 0.8

# Load train data from train.csv.
# reset_index() adds the old row index as the first column, which later becomes the 'idx' feature
# (assumes the CSV columns are the review text followed by 'sentiment' — TODO confirm).
train = pd.read_csv('data/train.csv').reset_index()
# Change the sentiment label from 'pos' and 'neg' to 1 and 0, the integer class ids used for training.
# Assign the result back: replace(..., inplace=True) on a selected column is chained assignment and
# is not guaranteed to update `train` (it never does under pandas Copy-on-Write).
train['sentiment'] = train['sentiment'].replace({'pos': 1, 'neg': 0})

# train set
train_sample = train.sample(frac=SAMPLE_FRAC, random_state=0)
train_select = train_sample.sample(frac=TRAIN_FRAC, random_state=0)
train_csv = train_select.values

# validate set: the sampled rows that were not selected for training
validate_select = train_sample.drop(index=train_select.index)
validate_csv = validate_select.values


# load test data, the held-out set used for the final evaluation
test = pd.read_csv('data/test.csv').reset_index()
test['sentiment'] = test['sentiment'].replace({'pos': 1, 'neg': 0})
test_csv = test.sample(frac=SAMPLE_FRAC, random_state=0).values

In [None]:
# Inspect the last rows of the prepared training frame
train.tail()

In [None]:
import time
def create_tf_example(features,label):
    """
    Serialize one (idx, sentence, label) record as a tf.train.Example.

    Args:
        features: sequence whose first item is the integer example id and whose
            second item is the sentence text (str)
        label: integer class label, stored in the 'label' int64 feature

    Return:
        The Example proto serialized to a binary string with .SerializeToString().
    """
    idx_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[features[0]]))
    sentence_bytes = features[1].encode('utf-8')
    sentence_feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[sentence_bytes]))
    label_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))

    feature_map = {'idx': idx_feature, 'sentence': sentence_feature, 'label': label_feature}
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example_proto.SerializeToString()

def convert_csv_to_tfrecord(csv, file_name):
    """
    Convert the rows of a numpy array to tf.train.Example records and write them to a TFRecord file.

    Rows that fail validation or serialization are reported on stdout and skipped
    (best effort), so the file may contain fewer records than ``len(csv)``.

    Args:
        csv: numpy array, each row holding (idx, sentence, label)
        file_name: location where the TFRecord file is saved
    """
    start_time = time.time()
    # The context manager closes the writer even if an unexpected error escapes the loop.
    with TFRecordWriter(file_name) as writer:
        for row in csv:
            # check the row rationality; report and skip the row when a value is missing
            try:
                if row is None:
                    raise Exception('Row Missing')
                # pd.isna() catches None as well as the NaN that pandas uses for missing cells;
                # the original `is None` checks never matched NaN.
                if pd.isna(row[0]) or pd.isna(row[1]) or pd.isna(row[2]):
                    raise Exception('Value Missing')
                # `is ''` compared object identity, not content; compare the stripped text by value.
                if row[1].strip() == '':
                    raise Exception('Utterance is empty')

                features, label = row[:-1], row[-1]
                example = create_tf_example(features, label)
                writer.write(example)

            except Exception as inst:
                print(type(inst))
                print(inst.args)
                print(inst)
    print(f"{file_name}: --- {(time.time() - start_time)} seconds ---")

In [None]:
# Write one TFRecord file per split
for split_rows, tfrecord_path in (
    (train_csv, "data/movie_train.tfrecord"),
    (validate_csv, "data/movie_validate.tfrecord"),
    (test_csv, "data/movie_test.tfrecord"),
):
    convert_csv_to_tfrecord(split_rows, tfrecord_path)


In [None]:
import json
# generate exmaple number , save for use in the future 
def generate_json_info(local_file_name,df_train=(),df_val=(),df_test=()):
    """
    Save the number of examples per split as a small JSON file.

    Args:
        local_file_name: path of the JSON file to write
        df_train, df_val, df_test: sized collections (anything supporting len());
            each defaults to an empty tuple, i.e. a length of 0

    The file holds the keys "train_length", "validation_length" and "test_length".
    """
    # Immutable tuple defaults replace the original mutable `[]` defaults (same length of 0).
    info = {
        "train_length": len(df_train),
        "validation_length": len(df_val),
        "test_length": len(df_test),
    }

    with open(local_file_name, 'w') as outfile:
        json.dump(info, outfile)

# Record the split sizes taken from the arrays; rows skipped while writing the TFRecords are still counted here.
generate_json_info('data/info.json',train_csv,validate_csv,test_csv)

In [None]:
# Read the serialized training examples back from the TFRecord file
tr_ds = tf.data.TFRecordDataset("data/movie_train.tfrecord")

In [None]:
# Describe the features stored in each serialized example: three scalar fields.
feature_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
def parse_example(example_proto):
    """Parse one serialized tf.Example into a dict of tensors keyed as in feature_spec."""
    return tf.io.parse_single_example(example_proto, feature_spec)
# Parse the training records and create an iterator so a parsed example can be inspected
tr_parse_ds = tr_ds.map(parse_example)
dataset_iterator = iter(tr_parse_ds)

In [None]:
# Fetch the next parsed example from the iterator to look at its structure
dataset_iterator.get_next()

In [None]:
import tensorflow as tf
from transformers import *
from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features
from transformers.configuration_bert import BertConfig

In [None]:
# One dataset of serialized examples per split
tr_ds = tf.data.TFRecordDataset("data/movie_train.tfrecord")
val_ds = tf.data.TFRecordDataset("data/movie_validate.tfrecord")
test_ds = tf.data.TFRecordDataset("data/movie_test.tfrecord")


In [None]:
# Describe the features stored in each serialized example: three scalar fields.
# (feature_spec and parse_example are defined identically in an earlier cell; this cell redefines them.)
feature_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
def parse_example(example_proto):
    """Parse one serialized tf.Example into a dict of tensors keyed as in feature_spec."""
    return tf.io.parse_single_example(example_proto, feature_spec)

# Convert the serialized records of every split into dicts of separate tensors
tr_parse_ds = tr_ds.map(parse_example)
val_parse_ds = val_ds.map(parse_example)
test_parse_ds =  test_ds.map(parse_example)

In [None]:
def clean_string(features):
    """
    Clean the 'sentence' tensor of a parsed example.

    Removes every literal "...", turns an escaped quote (backslash followed by ')
    into a plain ', and removes newline characters.

    Args:
        features: dict of tensors holding a string tensor under 'sentence'

    Return:
        A new dict with the same entries and the cleaned 'sentence'; the input dict is left unmodified.
    """
    sentence = features['sentence']
    # Raw strings hand the regex escapes through unchanged and avoid Python's invalid-escape warning for "\.".
    sentence = tf.strings.regex_replace(sentence, r"\.\.\.", "", replace_global=True)
    # The original pattern "\\'" reached the regex engine as \' , which matches a bare quote and replaced
    # it with itself (a no-op). r"\\'" matches a literal backslash followed by a quote.
    sentence = tf.strings.regex_replace(sentence, r"\\'", "'", replace_global=True)
    # The regex \n matches an actual newline character (same behavior as the original pattern).
    # NOTE(review): if the data contains the two-character text backslash + n instead, use r"\\n" — confirm.
    sentence = tf.strings.regex_replace(sentence, r"\n", "", replace_global=True)
    return {**features, 'sentence': sentence}

In [None]:
# Apply the same cleaning function to every split
tr_clean_ds = tr_parse_ds.map(clean_string)
val_clean_ds = val_parse_ds.map(clean_string)
test_clean_ds = test_parse_ds.map(clean_string)

In [None]:
# Number of examples per training batch
BATCH_SIZE = 8

# Evaluation uses batches twice as large as training
EVAL_BATCH_SIZE = BATCH_SIZE * 2

# XLA is TensorFlow's optimizing compiler for machine learning.
# It may speed up execution without source code changes; disabled here.
USE_XLA = False

# Mixed precision can help to speed up training; see the results on
# https://github.com/huggingface/transformers/tree/master/examples
# Disabled here.
USE_AMP = False

In [None]:
# Hand the XLA and mixed-precision switches to TensorFlow's graph optimizer settings
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

In [None]:
# The step counts are derived from the example counts saved in data/info.json
import json
with open('data/info.json') as info_file:
    data_info = json.load(info_file)

train_examples, valid_examples, test_examples = (
    data_info[key] for key in ('train_length', 'validation_length', 'test_length')
)

train_examples, valid_examples, test_examples

In [None]:
# Load the config, tokenizer and model from the pretrained "bert-base-cased" model/vocabulary.
# num_labels is the number of labels to classify (2+: classification, 1: regression).
num_labels = 2 
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

In [None]:
# For reference: the BERT configuration parameters (JSON) this notebook relies on

# {
#   "architectures": [
#     "BertForMaskedLM"
#   ],
#   "attention_probs_dropout_prob": 0.1,
#   "hidden_act": "gelu",
#   "hidden_dropout_prob": 0.1,
#   "hidden_size": 768,
#   "initializer_range": 0.02,
#   "intermediate_size": 3072,
#   "max_position_embeddings": 512,
#   "num_attention_heads": 12,
#   "num_hidden_layers": 12,
#   "type_vocab_size": 2,
#   "vocab_size": 28996
# }

In [None]:
import time
start_time = time.time()
# Tokenize the cleaned training examples into BERT input features (max_length of 512)
train_dataset = glue_convert_examples_to_features(
    examples=tr_clean_ds,
    tokenizer=tokenizer,
    max_length=512,
    task='sst-2',
    label_list=['0', '1'],
)
elapsed_seconds = time.time() - start_time
print(f"---{elapsed_seconds} seconds---")

In [None]:
import time
start_time = time.time()
# Tokenize the cleaned validation examples into BERT input features (max_length of 512)
valid_dataset = glue_convert_examples_to_features(
    examples=val_clean_ds,
    tokenizer=tokenizer,
    max_length=512,
    task='sst-2',
    label_list=['0', '1'],
)
elapsed_seconds = time.time() - start_time
print(f"---{elapsed_seconds} seconds---")

In [None]:
# Training: shuffle with a buffer covering the whole set, batch, and repeat indefinitely.
# Re-running this cell batches an already batched dataset; re-run the feature conversion cells first.
train_dataset = (
    train_dataset
    .shuffle(train_examples)
    .batch(BATCH_SIZE)
    .repeat()
)

# Validation: batch only
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

In [None]:
# Adam optimizer with a learning rate of 3e-5
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

if USE_AMP:
    # loss scaling is currently required when using mixed precision
    # NOTE(review): the `experimental` mixed-precision API depends on the TensorFlow version — confirm it exists in the installed release
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

# from_logits=True: the loss is computed from raw logits; labels are integer class ids
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])

In [None]:
# Number of whole batches per pass; floor division leaves out a final partial batch
train_steps = train_examples//BATCH_SIZE
valid_steps = valid_examples//EVAL_BATCH_SIZE


In [None]:
# GPU USAGE: print the number of GPUs TensorFlow sees, then list all physical devices
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
tf.config.experimental.list_physical_devices()

In [None]:
# Print the model's layers and parameter counts
model.summary()

In [None]:
# Fine-tune for 6 epochs; steps_per_epoch bounds each epoch because the training dataset repeats indefinitely
history = model.fit(train_dataset, epochs=6, steps_per_epoch=train_steps,
                    validation_data=valid_dataset, validation_steps=valid_steps)

In [None]:
import time
start_time = time.time()
# Tokenize the cleaned test examples into BERT input features (max_length of 512)
test_dataset = glue_convert_examples_to_features(
    examples=test_clean_ds,
    tokenizer=tokenizer,
    max_length=512,
    task='sst-2',
    label_list=['0', '1'],
)
elapsed_seconds = time.time() - start_time
print(f"---{elapsed_seconds} seconds---")

In [None]:
# Batch the test features; re-running this cell alone would batch the dataset a second time
test_dataset = test_dataset.batch(EVAL_BATCH_SIZE)

In [None]:
# Compute the compiled loss and accuracy metric on the test set
model.evaluate(test_dataset)

In [None]:
# Predict on the test set and apply softmax to turn the model outputs into class probabilities
y_pred = tf.nn.softmax(model.predict(test_dataset))

In [None]:
# Index of the most probable class for every example
y_pred_argmax = tf.math.argmax(y_pred, axis=1)

In [None]:
# Collect the true labels batch by batch and concatenate once; the original called tf.concat on
# every iteration (quadratic copying) and started from an unnecessary tf.Variable.
label_batches = [label for _features, label in test_dataset]
# tf.concat rejects an empty list, so keep an empty int64 tensor for an empty dataset.
y_true = tf.concat(label_batches, 0) if label_batches else tf.constant([], dtype=tf.int64)
  

In [None]:
%matplotlib inline  
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report

def visualize_confusion_matrix(y_pred_argmax, y_true):
    """
    Print a classification report and plot the confusion matrix as a heatmap.

    :param y_pred_argmax: This is an array with predicted values that are 0 or 1
    :param y_true: This is an array with true values that are 0 or 1
    :return: None
    """

    # tf.math.confusion_matrix takes (labels, predictions): rows are true labels, columns are predictions.
    cm = tf.math.confusion_matrix(y_true, y_pred_argmax).numpy()
    con_mat_df = pd.DataFrame(cm)

    # sklearn expects (y_true, y_pred); the original passed them swapped, which
    # exchanges precision and recall in the printed report.
    print(classification_report(y_true, y_pred_argmax))

    sns.heatmap(con_mat_df, annot=True, fmt='g', cmap=plt.cm.Blues)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Fit the layout after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

# Report the BERT model's test-set results. The baseline report that stood here used test_labels and
# baseline_predicted before they are defined (NameError on a fresh run); it is printed in the baseline section.
visualize_confusion_matrix(y_pred_argmax, y_true)

In [None]:
# Export the fine-tuned model in SavedModel format to ./202002
tf.saved_model.save(model, './202002')

In [None]:
# Load the exported SavedModel back for inference
savedmodel = tf.saved_model.load('./202002')

In [None]:
# Tokenizer for inference, loaded from the same 'bert-base-cased' vocabulary used before training
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# A single hand-written example in the same idx / label / sentence layout as the parsed records
example = {'idx': tf.constant(1, dtype=tf.int64), 'label': tf.constant(0, dtype=tf.int64) ,
           'sentence': tf.constant('This is the best store that I have ever visited', dtype=tf.string)}

In [None]:
# Wrap the example in a dataset, convert it to BERT input features, and form batches of size 1
ds = tf.data.Dataset.from_tensors(example)
feature_ds = glue_convert_examples_to_features(ds, tokenizer, max_length=128, task='sst-2')
feature_dataset = feature_ds.batch(1)

In [None]:
def predict_dataset(feature_dataset, savedmodel):
    """
    Run the saved model over a batched feature dataset and collect one prediction per example.

    :param feature_dataset: Batched dataset whose elements carry, at position 0, the inputs needed for
        BERT ('attention_mask', 'input_ids', 'token_type_ids')
    :param savedmodel: Loaded SavedModel that exposes a "serving_default" signature with an 'output_1' output
    :return: List of dicts, one per example, of the form {"SENTIMENT_PREDICTION": "<class id>"}
    """

    json_examples = []
    for feature_batch in feature_dataset:
        # Position 0 holds the dict of model inputs for the whole batch
        # (presumably position 1 is the label, as produced by glue_convert_examples_to_features).
        batch_features = feature_batch[0]

        # The SavedModel generates logits (unnormalized scores) for the sentence being
        # negative (0) or positive (1).
        logits = savedmodel.signatures["serving_default"](attention_mask=batch_features['attention_mask'],
                            input_ids=batch_features['input_ids'],
                            token_type_ids=batch_features['token_type_ids'])['output_1']
        print(f"logits {logits}")

        # It is more helpful to have actual probabilities. The TensorFlow softmax
        # function converts the logits into probabilities that sum to 100% per example.
        probs = tf.nn.softmax(logits)

        # Report the class with the higher probability; argmax over axis 1 yields one class id per example.
        prediction = tf.math.argmax(probs, axis=1)

        print(f"probs {probs}")
        print(f"prediction {prediction}")

        # Emit one entry per example in the batch; the original only reported the first row of
        # each batch, silently dropping the rest whenever the batch size exceeded 1.
        json_examples.extend({"SENTIMENT_PREDICTION": str(class_id)} for class_id in prediction.numpy())

    return json_examples

In [None]:
# A hand-written negative review in the same idx / label / sentence layout
negative_example = {'idx': tf.constant(1, dtype=tf.int64), 'label': tf.constant(0, dtype=tf.int64) ,
                    'sentence': tf.constant('This store is absolutely horrible and I hate it!!',
                                            dtype=tf.string)}

In [None]:
# Display the dictionary of tensors
negative_example

In [None]:
def predict(example, tokenizer, savedmodel):
    """
    Classify a single example with the saved model.

    :param example: A single dictionary of tensors which contains an idx, a label, and a sentence
    :param tokenizer: Tokenizer handed to glue_convert_examples_to_features
    :param savedmodel: Loaded model forwarded to predict_dataset
    :return: The output of predict_dataset. 1 is positive, and 0 is negative.
    """
    # glue_convert_examples_to_features works on datasets rather than on a bare
    # dictionary, so the example is wrapped in a one-element dataset first.
    single_example_ds = tf.data.Dataset.from_tensors(example)

    # Convert the English sentence into the input features BERT recognizes.
    # The conversion requires a label even when none is known; the easiest way around
    # this is to assign a default label of zero.
    bert_features = glue_convert_examples_to_features(single_example_ds, tokenizer, max_length=512, task='sst-2')

    return predict_dataset(bert_features.batch(64), savedmodel)

In [None]:
# Classify the negative example and keep the result
json_result = predict(negative_example, tokenizer, savedmodel)

In [None]:
# Classify the earlier hand-written example and display the result
predict(example, tokenizer, savedmodel)

# Baseline: n-gram counts + logistic regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [None]:
# Column 1 of each row is used as the text and column 2 as the label
train_texts = [row[1] for row in train_csv]
train_labels = [row[2] for row in train_csv]
test_texts = [row[1] for row in test_csv]
test_labels = [row[2] for row in test_csv]

In [None]:
# Sanity check: texts and labels should have the same length
len(train_texts) ,  len(train_labels)

In [None]:
# Baseline: 1- to 3-gram counts fed into a logistic regression with default settings, fit on the training texts
baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(train_texts, train_labels)

In [None]:
# Predict labels for the test texts with the baseline pipeline
baseline_predicted = baseline_model.predict(test_texts)

In [None]:
# Precision / recall / F1 of the baseline; arguments are (true labels, predictions)
print(classification_report(test_labels, baseline_predicted))

In [None]:
# Report and confusion-matrix plot for the baseline predictions
visualize_confusion_matrix(baseline_predicted,test_labels)