## 0. Imports

In [28]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import logging
from google.cloud.storage import Client
from sklearn.model_selection import train_test_split
import datetime
import gcsfs
import pickle
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json

## 1. Config work

#### EXECUTE THE FOLLOWING COMMAND ONLY ONCE

In [3]:
MODEL_TIME_VERSION  = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

In [29]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT = PROJECT_ID[0]
BUCKET_NAME = f"{PROJECT}-machine-learning"
BUCKET= f"gs://{PROJECT}-machine-learning"
RAW_DATA_FOLDER_NAME = "raw-data"
RAW_DATA_FOLDER_PATH = f"gs://{PROJECT}-machine-learning/raw-data"
ROOT='level-0-models'
MODEL_DIR=os.path.join(ROOT,'models').replace("\\","/")
PACKAGES_DIR=os.path.join(ROOT,'packages').replace("\\","/")
REGION = 'europe-west1'
MODEL_NAME = 'tweet_sentiment_classifier'

if not os.path.exists('./model-'+ MODEL_TIME_VERSION +'/'):
    os.makedirs('./model-'+ MODEL_TIME_VERSION +'/')
temp_model = './model-'+ MODEL_TIME_VERSION +'/'

In [30]:
!gcloud config set project {PROJECT}

Updated property [core/project].


In [31]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

## 2. Get data

### 2.1. Input data

data can be downloaded from: https://www.kaggle.com/kazanova/sentiment140

In [32]:
sentiment_mapping={
    0:"negative",
    2:"neutral",
    4:"positive"
}

df_twitter = pd.read_csv("gs://"+BUCKET_NAME+"/raw-data/training_VA.csv",encoding="latin1", header=None)\
             .rename(columns={
                 0:"sentiment",
                 1:"id",
                 2:"time",
                 3:"query",
                 4:"username",
                 5:"text"
             })[["sentiment","text"]]

df_twitter["sentiment_label"] = df_twitter["sentiment"].map(sentiment_mapping)
print(df_twitter.shape)
df_twitter.head()

Unnamed: 0,sentiment,text,sentiment_label
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,0,is upset that he can't update his Facebook by ...,negative
2,0,@Kenichan I dived many times for the ball. Man...,negative
3,0,my whole body feels itchy and like its on fire,negative
4,0,"@nationwideclass no, it's not behaving at all....",negative


### 2.2. Data processing fn

In [33]:
%%writefile preprocess.py

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
import re

class TextPreprocessor(object):
    def _clean_line(self, text):
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"@[A-Za-z0-9]+", "", text)
        text = re.sub(r"#[A-Za-z0-9]+", "", text)
        text = text.replace("RT","")
        text = text.lower()
        text = text.strip()
        return text
    
    def __init__(self, vocab_size, max_sequence_length):
        self._vocab_size = vocab_size
        self._max_sequence_length = max_sequence_length
        self._tokenizer = None

    def fit(self, text_list):        
        # Create vocabulary from input corpus.
        text_list_cleaned = [self._clean_line(txt) for txt in text_list]
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform(self, text_list):        
        # Transform text to sequence of integers
        text_list = [self._clean_line(txt) for txt in text_list]
        text_sequence = self._tokenizer.texts_to_sequences(text_list)

        # Fix sequence length to max value. Sequences shorter than the length are
        # padded in the beginning and sequences longer are truncated
        # at the beginning.
        padded_text_sequence = sequence.pad_sequences(
          text_sequence, maxlen=self._max_sequence_length)
        return padded_text_sequence

Overwriting preprocess.py


Some small test:

In [34]:
from preprocess import TextPreprocessor
processor = TextPreprocessor(5, 5)
processor.fit(['hello machine learning','test'])
processor.transform(['hello machine learning',"lol"])

array([[0, 0, 1, 2, 3],
       [0, 0, 0, 0, 0]], dtype=int32)

### 2.3. Prep data

In [35]:
CLASSES = {'negative':0, 'positive': 1}  # label-to-int mapping
VOCAB_SIZE = 25000  # Limit on the number vocabulary size used for tokenization
MAX_SEQUENCE_LENGTH = 50  # Sentences will be truncated/padded to this length

In [36]:
sents = df_twitter.text
labels = np.array(df_twitter.sentiment_label.map(CLASSES))

# Train and test split
X, X_test, y, y_test = train_test_split(sents, labels, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.1)

# Create vocabulary from training corpus.
processor = TextPreprocessor(VOCAB_SIZE, MAX_SEQUENCE_LENGTH)
processor.fit(X_train)

# Preprocess the data
train_texts_vectorized = processor.transform(X_train)
eval_texts_vectorized = processor.transform(X_test)
validation_texts_vectorized = processor.transform(X_validation)

with open('./model-'+ MODEL_TIME_VERSION +'/processor_state.pkl', 'wb') as f:
    pickle.dump(processor, f)

## 3. Model

In [46]:
# model parameters
LEARNING_RATE=.001
EMBEDDING_DIM=25
FILTERS=64
DROPOUT_RATE=0.5
POOL_SIZE=3
NUM_EPOCH=2
BATCH_SIZE=128
KERNEL_SIZES=[2,5,8]

### 3.1. Basic model

In [47]:
def create_model(vocab_size, embedding_dim, filters, kernel_sizes, dropout_rate, pool_size, embedding_matrix):
    
    # Input layer
    model_input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Embedding layer
    z = tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=embedding_dim,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix]
    )(model_input)

    z = tf.keras.layers.Dropout(dropout_rate)(z)

    # Convolutional block
    conv_blocks = []
    for kernel_size in kernel_sizes:
        conv = tf.keras.layers.Convolution1D(
            filters=filters,
            kernel_size=kernel_size,
            padding="valid",
            activation="relu",
            bias_initializer='random_uniform',
            strides=1)(z)
        conv = tf.keras.layers.MaxPooling1D(pool_size=2)(conv)
        conv = tf.keras.layers.Flatten()(conv)
        conv_blocks.append(conv)
        
    z = tf.keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    z = tf.keras.layers.Dropout(dropout_rate)(z)
    z = tf.keras.layers.Dense(100, activation="relu")(z)
    model_output = tf.keras.layers.Dense(1, activation="sigmoid")(z)

    model = tf.keras.models.Model(model_input, model_output)
    
    return model

### 3.2. Pretrained Glove embeddings

embedding can be downloaded here: https://nlp.stanford.edu/projects/glove/

In [40]:
client = Client()
bucket = client.get_bucket(BUCKET_NAME)
temp_folder = "raw-data/"
if not os.path.exists(temp_folder):
    os.makedirs(temp_folder)
blob = bucket.get_blob("raw-data/glove.twitter.27B.25d.txt")
downloaded_file = blob.download_to_filename('raw-data/glove.twitter.27B.25d.txt')

In [41]:
def get_coaefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coaefs(*o.strip().split()) for o in
                                                open("raw-data/glove.twitter.27B.25d.txt","r",encoding="utf8"))

In [42]:
word_index = processor._tokenizer.word_index
nb_words = min(VOCAB_SIZE, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= VOCAB_SIZE: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

### 3.3. Create - compile - train

In [43]:
model = create_model(VOCAB_SIZE, EMBEDDING_DIM, FILTERS, KERNEL_SIZES, DROPOUT_RATE,POOL_SIZE, embedding_matrix)

2022-06-05 16:19:38.244710: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz
2022-06-05 16:19:38.245272: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55b2809415c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-05 16:19:38.245303: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-06-05 16:19:38.248090: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [44]:
# Compile model with learning parameters.
optimizer = tf.keras.optimizers.Nadam(lr=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [48]:
#keras train
history = model.fit(
    train_texts_vectorized, 
    y_train, 
    epochs=NUM_EPOCH, 
    batch_size=BATCH_SIZE,
    validation_data=(validation_texts_vectorized, y_validation),
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_acc',
            min_delta=0.005,
            patience=3,
            factor=0.5),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0.005, 
            patience=5, 
            verbose=0, 
            mode='auto'
        ),
        tf.keras.callbacks.History()
    ]
)

Epoch 1/2
9000/9000 - 739s - loss: 0.4408 - acc: 0.7948 - val_loss: 0.4170 - val_acc: 0.8083
Epoch 2/2
9000/9000 - 742s - loss: 0.4288 - acc: 0.8016 - val_loss: 0.4114 - val_acc: 0.8124


In [49]:
# test model : acc loss
[loss, acc] = model.evaluate(eval_texts_vectorized, y_test )



In [50]:
scores = model.predict(eval_texts_vectorized)
predictions = np.array([int(np.round(i)) for i in scores ])
confusion_matrix=tf.math.confusion_matrix(predictions, y_test)

In [51]:
# confusion matrix
print (str("matrix-co : "+str(confusion_matrix)))

matrix-co : tf.Tensor(
[[132431  31322]
 [ 27829 128418]], shape=(2, 2), dtype=int32)


In [52]:
# std prediction 
np.std(scores)

0.3225778

In [53]:
with open("history.pkl",'wb') as file:
    pickle.dump(history.history,file)

In [54]:
tf.keras.models.save_model(model,temp_model)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


2022-06-05 16:59:53.456952: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./model-2022-06-05-13-38-45/assets


In [62]:
#copy file to gcp storage

In [55]:
!gsutil cp -r {temp_model} {BUCKET}/{MODEL_DIR}/

Copying file://./model-2022-06-05-13-38-45/processor_state.pkl [Content-Type=application/octet-stream]...
Copying file://./model-2022-06-05-13-38-45/saved_model.pb [Content-Type=application/octet-stream]...
Copying file://./model-2022-06-05-13-38-45/variables/variables.index [Content-Type=application/octet-stream]...
Copying file://./model-2022-06-05-13-38-45/variables/variables.data-00000-of-00001 [Content-Type=application/octet-stream]...
\ [4 files][ 48.6 MiB/ 48.6 MiB]                                                
Operation completed over 4 objects/48.6 MiB.                                     


## 4. Deployment

### 4.1. Prepare custom model prediction

In [56]:
%%writefile model_prediction.py
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.python.lib.io import file_io

class CustomModelPrediction(object):

    def __init__(self, model, processor):
        # Class gets instantiated with a trained model file and a persisted processor
        self._model = model
        self._processor = processor

    def _postprocess(self, predictions):
    # Create an output signature
        labels = ['negative', 'positive']
        return [
            {
            "label":labels[int(np.round(prediction))],
            "score":float(np.round(prediction,4))
            } for prediction in predictions]

    def predict(self, instances, **kwargs):
    # Clean the data, make predictions and postprocess
        preprocessed_data = self._processor.transform(instances)
        predictions =  self._model.predict(preprocessed_data)
        labels = self._postprocess(predictions)
        return labels

    @classmethod
    def from_path(cls, model_dir):
    # Load the keras model and the persisted processor
        
        print ('test model')
        model = tf.keras.models.load_model(model_dir,custom_objects={'tf': tf})
    
    # I know, pickle is bad and I should feel bad
    
        with file_io.FileIO(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
            processor = pickle.load(f)

        return cls(model, processor)

Overwriting model_prediction.py


Test

In [57]:
requests = (["God I hate the north","god I love this"])

In [58]:
BUCKET+'/'+MODEL_DIR+temp_model[1:]

'gs://sound-splicer-351114-machine-learning/level-0-models/models/model-2022-06-05-13-38-45/'

In [59]:
from model_prediction import CustomModelPrediction

classifier = CustomModelPrediction.from_path(BUCKET+'/'+MODEL_DIR+temp_model[1:])
results = classifier.predict(requests)
results

test model


[{'label': 'negative', 'score': 0.03759999945759773},
 {'label': 'positive', 'score': 0.9294000267982483}]

### 4.2. Package it

In [61]:
MODEL_TIME_VERSION
print(MODEL_TIME_VERSION)
MODEL_TIME_VERSION_WITH_UNDERSCORES = str(MODEL_TIME_VERSION).replace('-','_')
print(MODEL_TIME_VERSION_WITH_UNDERSCORES)
MODEL_TIME_VERSION_WITHOUT_SPACE = str(MODEL_TIME_VERSION).replace('-','')
print(MODEL_TIME_VERSION_WITHOUT_SPACE)

'2022-06-05-13-38-45'

#### update VERSION in the cell below with the MODEL_TIME_VERSION above

In [62]:
%%writefile setup.py

from setuptools import setup

MODEL_NAME = "tweet_sentiment_classifier"
REQUIRED_PACKAGES = ['gcsfs']
VERSION = '2022-06-xx-xx-xx-xx'

setup(
    name=MODEL_NAME,
    packages=[],
    include_package_data=False,
    version=VERSION,
    scripts=["preprocess.py", "model_prediction.py"]
)

Overwriting setup.py


Wrap it up and copy to GCP

In [63]:
!python setup.py sdist --formats=gztar
!gsutil cp ./dist/{MODEL_NAME}-{MODEL_TIME_VERSION_WITHOUT_SPACE}.tar.gz {BUCKET}/{PACKAGES_DIR}/{MODEL_NAME}-{MODEL_TIME_VERSION_WITHOUT_SPACE}.tar.gz

  "details." % version
running sdist
running egg_info
writing tweet_sentiment_classifier.egg-info/PKG-INFO
writing dependency_links to tweet_sentiment_classifier.egg-info/dependency_links.txt
writing top-level names to tweet_sentiment_classifier.egg-info/top_level.txt
reading manifest file 'tweet_sentiment_classifier.egg-info/SOURCES.txt'
writing manifest file 'tweet_sentiment_classifier.egg-info/SOURCES.txt'

running check


creating tweet_sentiment_classifier-2022-06-05-13-38-45
creating tweet_sentiment_classifier-2022-06-05-13-38-45/tweet_sentiment_classifier.egg-info
copying files to tweet_sentiment_classifier-2022-06-05-13-38-45...
copying model_prediction.py -> tweet_sentiment_classifier-2022-06-05-13-38-45
copying preprocess.py -> tweet_sentiment_classifier-2022-06-05-13-38-45
copying setup.py -> tweet_sentiment_classifier-2022-06-05-13-38-45
copying tweet_sentiment_classifier.egg-info/PKG-INFO -> tweet_sentiment_classifier-2022-06-05-13-38-45/tweet_sentiment_classifier.egg-info

## 5. Create model and version

In [74]:
VERSION_NAME='V_' + MODEL_TIME_VERSION_WITHOUT_SPACE # MODEL_TIME_VERSION.replace("-","_")
RUNTIME_VERSION='2.5' # tensorflow version
MODEL_REGION='europe-west1'
id_model = 'model-'+MODEL_TIME_VERSION

#### If no model has been created before, run this command.

In [67]:
!gcloud ai-platform models create {MODEL_NAME} --regions {MODEL_REGION}

Using endpoint [https://ml.googleapis.com/]
[1;31mERROR:[0m (gcloud.ai-platform.models.create) Resource in projects [sound-splicer-351114] is the subject of a conflict: Field: model.name Error: A model with the same name already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A model with the same name already exists.
    field: model.name


In [68]:
!gcloud ai-platform models list --region global

Using endpoint [https://ml.googleapis.com/]
NAME                        DEFAULT_VERSION_NAME
tweet_sentiment_classifier  V2


In [77]:
!gcloud beta ai-platform versions create {VERSION_NAME} \
--model {MODEL_NAME} \
--origin {BUCKET}/{MODEL_DIR}/{id_model} \
--python-version 3.7 \
--runtime-version {RUNTIME_VERSION} \
--package-uris {BUCKET}/{PACKAGES_DIR}/{MODEL_NAME}-{MODEL_TIME_VERSION_WITHOUT_SPACE}.tar.gz \
--prediction-class=model_prediction.CustomModelPrediction \
--region global 

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


## 6. Testing

In [86]:
requests = [
    "god this episode sucks",
    "meh, I kinda like it",
    "what were the writer thinking, omg it doesn't make any sense!",
    "omg! what a twist, who would've though :o!",
    "woohoow, sansa for the win!"
]

# JSON format the requests
request_data = {'instances': requests}

# Authenticate and call CMLE prediction API 
credentials = GoogleCredentials.get_application_default()

In [88]:
%%time

api = discovery.build('ml', 'v1')
model_url = 'projects/{}/models/{}'.format(PROJECT, MODEL_NAME)
response = api.projects().predict(body=request_data, name=model_url).execute()
response["predictions"]

CPU times: user 20.5 ms, sys: 1.51 ms, total: 22 ms
Wall time: 66.1 ms


[{'label': 'negative', 'score': 0.050200000405311584},
 {'label': 'positive', 'score': 0.7918000221252441},
 {'label': 'negative', 'score': 0.373199999332428},
 {'label': 'negative', 'score': 0.193900004029274},
 {'label': 'positive', 'score': 0.8440999984741211}]