# Sommaire

* [I. Pré-traitement des données](#I)
* [II. Entraînement du modèle](#II)
* [III. Connexion à l'espace Azure](#III)
* [IV. Enregistrement du modèle](#IV)
* [V. Déploiement du modèle](#V)
* [VI. Utilisation du web service](#VI)

In [1]:
# Import libraries
import os
import requests
import json

# Math libraries to process the data 
import numpy as np 
import pandas as pd

# Libraries for preprocessing 
import nltk
from nltk.tokenize import word_tokenize
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Classification libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Embedding,
    Dropout,
    LSTM,
    GlobalMaxPool1D,
    Bidirectional,
    BatchNormalization,
)
from tensorflow.keras import backend as K
import azureml.core
from azureml.core import Experiment, Workspace, Environment
from azureml.core.compute import ComputeTarget
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice
from azureml.core.environment import CondaDependencies
from azureml.core import Webservice
import pickle

# Turn off pandas' chained-assignment warning for the whole notebook
pd.set_option("mode.chained_assignment", None)

# Report which TensorFlow build this kernel is running
tf_version = tensorflow.__version__
built_with_cuda = tensorflow.test.is_built_with_cuda()
print("Tensorflow version:", tf_version)
print("Using GPU build:", built_with_cuda)

Tensorflow version: 2.1.0
Using GPU build: True


## I. Pré-traitement des données<a class="anchor" id="I"></a>

In [2]:
# Fetch the NLTK 'punkt' resources needed by word_tokenize
nltk.download('punkt')

# Seed for the train/test split so that "Restart & Run All" reproduces the same partition
SEED = 42

# Load the dataset, drop the leftover duplicate columns and turn the raw target
# into an integer label (raw value / 4, truncated towards zero).
# Presumably the raw labels are 0 and 4, giving 0/1 classes -- confirm against the CSV.
dataframe_sample = (
    pd.read_csv("data/dataframe_sample.csv", index_col=[0])
    .drop(columns=["Unnamed: 1", "target.1"])
    .assign(target=lambda frame: (frame["target"] / 4).astype("int64"))
)

# Raw texts as an array
text_values = dataframe_sample.text.values

# One Keras tokenizer for everything: vocabulary, sequences, and the object
# pickled later for the scoring service
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(text_values)

# Vocabulary (word -> integer index) learned by the tokenizer
word_index = word_tokenizer.word_index


def embed(corpus):
    """Convert an iterable of texts into lists of integer word indices.

    Relies on the module-level ``word_tokenizer`` fitted above.
    """
    return word_tokenizer.texts_to_sequences(corpus)


# Longest text measured in NLTK tokens; its length is the fixed sequence length.
# NOTE(review): NLTK and the Keras Tokenizer do not split text the same way, so a
# Keras sequence can be longer than this value and is then truncated by
# pad_sequences. Left unchanged because the model shape (and presumably score.py)
# depend on this length -- confirm before switching to the Keras sequence length.
token_counts = [len(word_tokenize(sentence)) for sentence in text_values]
length_long_sentence = max(token_counts)
longest_train = text_values[token_counts.index(length_long_sentence)]

# Transform into a fixed-length structure (zero-padded at the end)
padded_sentences = pad_sequences(
    embed(text_values),
    maxlen=length_long_sentence,
    padding='post',
)

# Set target values
sentiments = dataframe_sample.target.values

# Split the data into train and test sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sentences,
    sentiments,
    test_size=0.3,
    random_state=SEED,
)

# Path of the pre-trained GloVe vectors
glove_path = 'data/glove.6B.50d.txt'

# Number of dimensions of the GloVe word embeddings (must match the file above)
GLOVE_DIM = 50

# Load the GloVe vectors into a dictionary {word: vector}.
# The context manager closes the file even if a line fails to parse.
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as glove:
    for line in glove:
        values = line.rstrip('\n').split(' ')
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

# Embedding matrix: row i holds the GloVe vector of the word whose index is i.
# One extra row because the word indices start at 1 (row 0 stays all zeros).
embeddings_matrix_glove = np.zeros((len(word_index) + 1, GLOVE_DIM))
hits = 0
misses = 0

for word, i in tqdm(word_index.items()):
    # Check if the word occurs in the GloVe embedding
    embeddings_vector = embeddings_index.get(word)
    if embeddings_vector is not None:
        embeddings_matrix_glove[i] = embeddings_vector
        hits += 1
    else:
        # Unknown to GloVe: the row keeps its zeros
        misses += 1

# Guard against an empty vocabulary to avoid a ZeroDivisionError
total_words = hits + misses
missing_pct = misses / total_words * 100 if total_words else 0.0

print('\n')
print('Word index length: ', len(word_index) + 1)
print('Converted words: {}, missing words: {}'.format(hits, misses))
print('% of missing words: {:.1f}%'.format(missing_pct))

# Define model constants
epochs = 50
batch_size = 200

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 5298/5298 [00:00<00:00, 553184.53it/s]




Word index length:  5299
Converted words: 3920, missing words: 1378
% of missing words: 26.0%


## II. Entraînement du modèle<a class="anchor" id="II"></a>

In [3]:
# Seed TensorFlow so weight initialisation and dropout are repeatable between runs
# (GPU kernels may still introduce some non-determinism)
tensorflow.random.set_seed(42)

# Define the model: GloVe-initialised embedding -> BiLSTM -> max pooling -> dense head
model = Sequential(name='glove_model')
model.add(Embedding(input_dim=embeddings_matrix_glove.shape[0],
                    output_dim=embeddings_matrix_glove.shape[1],
                    weights=[embeddings_matrix_glove],
                    input_length=length_long_sentence))
model.add(Bidirectional(LSTM(length_long_sentence, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(length_long_sentence, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(length_long_sentence, activation="relu"))
model.add(Dropout(0.5))
# Single sigmoid unit: binary sentiment output
model.add(Dense(1, activation='sigmoid'))

# Take a look at the model summary
model.summary()

# Define model metrics
RECALL = tensorflow.keras.metrics.Recall(name='recall')
PRECISION = tensorflow.keras.metrics.Precision(name='precision')
METRICS = [RECALL, PRECISION]

# Compile the model
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=METRICS)

# Fit the model (30% of the training set is held out for validation)
hist = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.3,
)

# Evaluate the model; `score` is [loss, recall, precision], following METRICS order
score = model.evaluate(X_test, y_test, batch_size=batch_size)
test_recall, test_precision = score[1], score[2]

# F1 is undefined when precision and recall are both 0 (e.g. the model predicts a
# single class); report 0.0 instead of raising ZeroDivisionError
metric_sum = test_recall + test_precision
test_f1 = 2 * ((test_recall * test_precision) / metric_sum) if metric_sum > 0 else 0.0

print("Test recall:", test_recall)
print("Test precision:", test_precision)
print("Test f1-score:", test_f1)

Model: "glove_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 46, 50)            264950    
_________________________________________________________________
bidirectional (Bidirectional (None, 46, 92)            35696     
_________________________________________________________________
global_max_pooling1d (Global (None, 92)                0         
_________________________________________________________________
batch_normalization (BatchNo (None, 92)                368       
_________________________________________________________________
dropout (Dropout)            (None, 92)                0         
_________________________________________________________________
dense (Dense)                (None, 46)                4278      
_________________________________________________________________
dropout_1 (Dropout)          (None, 46)                

## III. Connexion à l'espace Azure<a class="anchor" id="III"></a>

In [4]:
# Show which Azure ML SDK version this kernel uses
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

# Attach to the workspace described by the local Azure ML configuration
ws = Workspace.from_config()

# Summarise the workspace, one property per line
workspace_summary = (
    "Workspace name:" + ws.name,
    "Azure region:" + ws.location,
    "Resource group:" + ws.resource_group,
)
print("\n".join(workspace_summary))

SDK version: 1.37.0
Workspace name:Projet-7
Azure region:francecentral
Resource group:oc-ia-p7


## IV. Enregistrement du modèle<a class="anchor" id="IV"></a>

In [5]:
# Folder that is uploaded as the registered model (network + tokenizer)
MODEL_DIR = 'sentiment_model'

# Create the folder first: saving fails on a fresh checkout if it does not exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Save the trained Keras network in HDF5 format
model.save(os.path.join(MODEL_DIR, 'sentiment_model.h5'))

# Save the fitted tokenizer next to it so the scoring side can rebuild the same sequences
with open(os.path.join(MODEL_DIR, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(word_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Register the whole folder in the Azure ML workspace
classification_model = Model.register(workspace=ws,
                       model_name='sentiment_model',
                       model_path=MODEL_DIR,
                       description='A sentiment classification model')

Registering model sentiment_model


## V. Déploiement du modèle<a class="anchor" id="V"></a>

In [6]:
# Look up the compute target by name in the workspace
compute_target_name = "compute-model"
compute_target = ComputeTarget(workspace=ws, name=compute_target_name)
print("Found existing:", compute_target.name)

# Environment in which the scoring script will run
env = Environment(name="env")

# NOTE(review): the packages below are unpinned, while the model was trained with the
# TensorFlow version printed in the first cell -- consider pinning to that version.
# NOTE(review): both 'tensorflow' and 'tensorflow-gpu' are requested, and 'pyspark' is
# not used anywhere in this notebook -- confirm against score.py before trimming.
conda = CondaDependencies()
conda.add_conda_package('scikit-learn')
conda.add_conda_package('numpy')
conda.add_conda_package('keras')
conda.add_conda_package('tensorflow')
conda.add_conda_package('tensorflow-gpu')
conda.add_conda_package('pyspark')

env.python.conda_dependencies = conda

# Scoring entry point + environment
inference_config = InferenceConfig(
    entry_script="score.py",
    environment=env
)

# Resources of the Azure Container Instance hosting the service
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=3)

# Use a dedicated name for the registered model: rebinding `model` here would
# overwrite the trained Keras model and break a re-run of the save cell
registered_model = ws.models["sentiment_model"]

service = Model.deploy(
    workspace=ws,
    name="service-aci",
    models=[registered_model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    deployment_target=compute_target,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)
print(service.state)

Found existing: compute-model
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-02-15 21:48:46+00:00 Creating Container Registry if not exists.
2022-02-15 21:48:46+00:00 Registering the environment.
2022-02-15 21:48:48+00:00 Use the existing image.
2022-02-15 21:48:48+00:00 Generating deployment configuration.
2022-02-15 21:48:53+00:00 Submitting deployment to compute.
2022-02-15 21:48:55+00:00 Checking the status of deployment service-aci..
2022-02-15 21:53:59+00:00 Checking the status of inference endpoint service-aci.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


## VI. Utilisation du web service<a class="anchor" id="VI"></a>

In [None]:
# Re-attach to the already deployed web service by its name
service_name = 'service-aci'
service = Webservice(workspace=ws, name=service_name)

In [7]:
# Test after deployment
def get_sentiment_from_tweet(tweet, timeout=30):
    """Send a tweet to the deployed web service and print the predicted sentiment.

    Uses the module-level ``service`` object to find the scoring URI.

    Args:
        tweet: Text of the tweet to classify.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The decoded JSON response of the service (the function used to return nothing).

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    # HTTP headers: the body is sent as JSON
    headers = {'Content-Type': 'application/json'}

    # Request body expected by the service
    payload = json.dumps({'text': tweet})

    # Call with a POST request; the timeout keeps the cell from hanging forever
    # when the service is unreachable
    response = requests.post(
        service.scoring_uri,
        data=payload,
        headers=headers,
        timeout=timeout,
    )

    # Fail with the HTTP status instead of an opaque JSON/KeyError further down
    response.raise_for_status()
    result = response.json()

    # Print result
    print('The tweet is %s' % result["label"])
    print('Elapsed time: %s' % result["elapsed_time"])

    return result

The tweet is NEGATIVE with a score of 0.48939430713653564
Elapsed time: 0.8393192291259766


In [None]:
# Try the deployed endpoint on an example tweet
sample_tweet = 'I love the users of this platform'
get_sentiment_from_tweet(sample_tweet)