# Loading Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sn
from azureml.core import Workspace, Dataset

In [2]:
# import dataset

df = pd.read_csv('Dataset/amazonLabelled - amazonLabelled.csv')

In [3]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",Positive
1,2,Great for the jawbone.,Positive
2,3,Tied to charger for conversations lasting more...,Negative
3,4,The mic is great.,Positive
4,5,I have to jiggle the plug to get it to line up...,Negative


In [4]:
df.shape

(999, 3)

In [5]:
df.isnull().values.any()

False

In [6]:
df["Sentiment"].value_counts()

Positive    500
Negative    499
Name: Sentiment, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
lb=LabelEncoder()

In [9]:
lb.fit(df["Sentiment"])

LabelEncoder()

In [10]:
df["Sentiment"]=lb.transform(df["Sentiment"])

In [11]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",1
1,2,Great for the jawbone.,1
2,3,Tied to charger for conversations lasting more...,0
3,4,The mic is great.,1
4,5,I have to jiggle the plug to get it to line up...,0


# Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test=train_test_split(df.drop("Sentiment",axis=1),df["Sentiment"],test_size=0.2)

In [14]:
X_train.shape

(799, 2)

In [15]:
train_df=pd.concat([X_train,y_train],axis=1).to_csv("Dataset/train_set.csv",index=False)

In [16]:
test_df=pd.concat([X_test,y_test],axis=1).to_csv("Dataset/test_set.csv",index=False)

# Register dataset to the workspace

In [20]:
subscription_id = "29514374-60e8-4ea7-b14f-6778779cf8e4"
resource_group = 'RGDAY5'
workspace_name = 'Michelinday5'

In [21]:
workspace = Workspace(subscription_id, resource_group, workspace_name)

In [22]:
# get the datastore to upload prepared data
datastore = workspace.get_default_datastore()

In [23]:
# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='Dataset', target_path='nlpdata')

Uploading an estimated of 3 files
Uploading Dataset/amazonLabelled - amazonLabelled.csv
Uploaded Dataset/amazonLabelled - amazonLabelled.csv, 1 files out of an estimated total of 3
Uploading Dataset/test_set.csv
Uploaded Dataset/test_set.csv, 2 files out of an estimated total of 3
Uploading Dataset/train_set.csv
Uploaded Dataset/train_set.csv, 3 files out of an estimated total of 3
Uploaded 3 files


$AZUREML_DATAREFERENCE_ccdbe7fc35ee49a78480d14ba72e295f

In [24]:
train_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/train_set.csv'))

In [25]:
test_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/test_set.csv'))

In [26]:
train_ds = train_dataset.register(workspace=workspace,
                                 name='nlp_train_set',
                                 description='Training data for nlp usecase')

In [27]:
test_ds = test_dataset.register(workspace=workspace,
                                 name='nlp_test_set',
                                 description='Test data for nlp usecase')

# Data ingestion step - Training dataset

In [28]:
dataset = Dataset.get_by_name(workspace, name='nlp_train_set')
print(dataset.name, dataset.version)

nlp_train_set 1


In [29]:
df = dataset.to_pandas_dataframe()

In [30]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,700,"Also, the phone doesn't seem to accept anythin...",0
1,214,fast service.,1
2,521,Thanks again to Amazon for having the things I...,1
3,233,Great sound and service.,1
4,80,I wear it everyday and it holds up very well.,1


In [31]:
df.shape

(799, 3)

# Preprocessing Data

In [32]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 3.8 MB/s eta 0:00:01
Installing collected packages: nltk
Successfully installed nltk-3.6.2


In [33]:
import nltk

In [34]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [35]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
lemmatizer=WordNetLemmatizer()

In [37]:
def preprocess_data(data):
    corpus=[]
    for i in data:
        mess=re.sub("[^a-zA-Z0-9]"," ",i)
        mess=mess.lower().split()
        mess=[lemmatizer.lemmatize(word) for word in mess if word not in stopwords.words("english")]
        mess=" ".join(mess)
        corpus.append(mess)
    return corpus    

In [38]:
corpus=preprocess_data(df["Feedback"])

In [39]:
len(corpus)

799

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
cv=CountVectorizer(ngram_range=(1,2))

In [42]:
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [43]:
count_train=cv.transform(corpus)

In [44]:
count_train.shape

(799, 4456)

# Creating experiment and run to log metrics and hypermeters

In [45]:
from azureml.core.experiment import Experiment
myexperiment = Experiment(workspace, "rf_sent_analysis")
# initialize a run in Azureml
run = myexperiment.start_logging()

In [46]:
run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

# Model Training

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
rf=RandomForestClassifier()

In [49]:
rf.fit(count_train,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
rf.score(count_train,df["Sentiment"])

0.9974968710888611

In [51]:
from sklearn.model_selection import cross_val_score

In [52]:
scores=cross_val_score(rf,count_train,df["Sentiment"],cv=3)

In [53]:
scores.mean()

0.722128354594351

In [54]:
scores.std()

0.026814886955222647

# Hyperparameter Tuning

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
param_grid={'n_estimators': [100, 400,700,1000,2000,2500], 'min_samples_split': [2,4,8,16]}

In [57]:
grid=GridSearchCV(rf,param_grid,n_jobs=-1)

In [58]:
grid.fit(count_train,df["Sentiment"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [59]:
n_est=grid.get_params(deep=True)['estimator__n_estimators']

In [60]:
min_sam_splt=grid.get_params(deep=True)['estimator__min_samples_split']

In [61]:
rf=RandomForestClassifier(n_estimators=n_est,min_samples_split=min_sam_splt)

In [62]:
rf.fit(count_train,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [63]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("n_estimators", grid.get_params(deep=True)['estimator__n_estimators'])
run.log("min_samples_split", grid.get_params(deep=True)['estimator__min_samples_split'])

In [64]:
run.complete()

# Model Packaging Step

In [65]:
import joblib

In [66]:
joblib.dump(cv,"outputs/count_vectorizer.pkl")

['outputs/count_vectorizer.pkl']

In [67]:
joblib.dump(rf,"outputs/rf_sent_model.pkl")

['outputs/rf_sent_model.pkl']

In [68]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# to install required packages
env = Environment('tutorial-env')
cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults',"numpy",  "joblib", "azureml-core", "azureml-monitoring", "azureml-defaults", "scikit-learn==0.20.3", "inference-schema", "inference-schema[numpy-support]","nltk-corpus","nltk-stem"], conda_packages = ['scikit-learn==0.22.1'])

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = workspace)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210714.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "tutorial-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
 

# Model Registering Step

In [69]:
from azureml.core.model import Model

In [70]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/count_vectorizer.pkl', # this points to a local file 
                       model_name = "NLP_Count_Vectorizer", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework='pandas==0.23.4',
                       description = "Count Vectorizer",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model NLP_Count_Vectorizer
Name: NLP_Count_Vectorizer
Version: 1


In [71]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/rf_sent_model.pkl', # this points to a local file 
                       model_name = "NLP_RF_Model", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework='pandas==0.23.4',
                       description = "Random Forest Model",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model NLP_RF_Model
Name: NLP_RF_Model
Version: 1


# Deploy model as a webservice on Azure Container Instance

In [132]:
%%writefile score.py
import json
import numpy as np
import os
import pickle
import joblib
import time
from azureml.core.model import Model
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
lemmatizer=WordNetLemmatizer()

def preprocess_data(data):
    corpus=[]
    for i in data:
        mess=re.sub("[^a-zA-Z0-9]"," ",i)
        mess=mess.lower().split()
        mess=[lemmatizer.lemmatize(word) for word in mess if word not in stopwords.words("english")]
        mess=" ".join(mess)
        corpus.append(mess)
    return corpus    


def init():
    global count_vect,rf_model
    
    count_vect_path=Model.get_model_path('NLP_Count_Vectorizer')
    count_vect= joblib.load(count_vect_path)
    
    rf_model_path=Model.get_model_path('NLP_RF_Model')
    rf_model=joblib.load(rf_model_path)
    
def run(raw_data):
    try:
        data = json.loads(raw_data)['data']
        corpus=preprocess_data(data[0])
        count_test=count_vect.transform(corpus)
        prediction=rf_model.predict(count_test)
        # you can return any data type as long as it is JSON-serializable
        return json.dumps({"result": prediction.tolist()})
    except Exception as e:
        result = str(e)
        # return error message back to the client
        return json.dumps({"error": result})
        
            


Overwriting score.py


 # Define Environment

In [133]:
from azureml.core.environment import Environment
from azureml.core.environment import CondaDependencies


# Create the environment
myenv = Environment(name="MyEnvironment")

# Create the dependencies object
print("Creating dependencies....")
myenv_dep = CondaDependencies.create(conda_packages=['scikit-learn', 'pip'],
                                     pip_packages=['azureml-defaults','joblib','numpy','azureml-core', "azureml-monitoring", "inference-schema", "inference-schema[numpy-support]","nltk"])

myenv.python.conda_dependencies = myenv_dep

# Register the environment
print("Registering the environment...")
myenv.register(workspace)

Creating dependencies....
Registering the environment...


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210714.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MyEnvironment",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"


In [134]:
from azureml.core.model import InferenceConfig

In [135]:
inference_config = InferenceConfig(entry_script="score.py", environment=myenv)

In [136]:
from azureml.core.webservice import AciWebservice

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1, collect_model_data=True,auth_enabled=True)

In [137]:
model1 = Model(workspace, 'NLP_Count_Vectorizer')
model2 = Model(workspace, 'NLP_RF_Model')

service_name = 'amazon-feedback-analysis'

In [138]:
service = Model.deploy(workspace, service_name, models=[model1, model2], inference_config=inference_config, deployment_config=deployment_config, overwrite=True)
service.wait_for_deployment(show_output = True)
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-08-21 10:27:50+00:00 Creating Container Registry if not exists.
2021-08-21 10:27:50+00:00 Registering the environment.
2021-08-21 10:27:51+00:00 Use the existing image.
2021-08-21 10:27:51+00:00 Generating deployment configuration.
2021-08-21 10:27:52+00:00 Submitting deployment to compute.
2021-08-21 10:27:55+00:00 Checking the status of deployment amazon-feedback-analysis..
2021-08-21 10:30:39+00:00 Checking the status of inference endpoint amazon-feedback-analysis.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [139]:
print(service.get_logs())

2021-08-21T10:30:30,551945300+00:00 - iot-server/run 
2021-08-21T10:30:30,557946200+00:00 - rsyslog/run 
2021-08-21T10:30:30,567498500+00:00 - nginx/run 
2021-08-21T10:30:30,568593300+00:00 - gunicorn/run 
Dynamic Python package installation is disabled.
Starting HTTP server
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2021-08-21T10:30:30,893731100+00:00 - iot-server/finish 1 0
2021-08-21T10:30:30,907116600+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (61)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 86
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2021-08-21 10:30:35,754 | root | INFO | Starting up app insights client
logging socket was found. logging is available.
logging socket was found. logging is available.
2021-08-21 10:30:35,760 | root | INFO | Starting up request id generator
2021-08-21 10:30:35,760 | root | INFO | Star

In [140]:
service.update(enable_app_insights=True)

# Test web service

In [141]:
print(service.scoring_uri)

http://d023601e-1f80-4ea3-95b2-d54ab79af9d9.centralus.azurecontainer.io/score


In [142]:
print(service.swagger_uri)

http://d023601e-1f80-4ea3-95b2-d54ab79af9d9.centralus.azurecontainer.io/swagger.json


In [143]:
service.state

'Healthy'

In [144]:
import json


input_payload = json.dumps({
    'data': [["I love this phone , It is very handy and has a lot of features ."]],
     # If you have a classification model, you can get probabilities by changing this to 'predict_proba'.
})

output = service.run(input_payload)

print(output)

{"result": [1]}


In [146]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    # bypass the server certificate verification on client side
    if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

# Request data goes here
data = {
    'data': [["I love this phone , It is very handy and has a lot of features ."]],
}

body = str.encode(json.dumps(data))

url = 'http://d023601e-1f80-4ea3-95b2-d54ab79af9d9.centralus.azurecontainer.io/score'
api_key = 'UpKgwdTVPGZC3w8w4ltqRjvMWgvDho4N' # Replace this with the API key for the web service
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)

    result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(json.loads(error.read().decode("utf8", 'ignore')))

b'"{\\"result\\": [1]}"'
