In [94]:
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib

In [95]:

import azureml.core
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core import Experiment
from azureml.core.webservice import Webservice
from azureml.core.image import ContainerImage
from azureml.core.webservice import AciWebservice
from azureml.core.conda_dependencies import CondaDependencies

In [96]:
print(azureml.core.VERSION)

1.18.0


In [276]:

ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep='\t')

imdb_workspace	westeurope	IMDBDeploy-rg


In [277]:
experiment_name = 'lstm-imdb3'

exp = Experiment(workspace=ws, name=experiment_name)

In [278]:
from azureml.core import Datastore
from azureml.core.dataset import Dataset
from azureml.data import dataset_type_definitions

In [279]:
datastores = ws.datastores
for name, datastore in datastores.items():
    print(name, datastore.datastore_type)

azureml_globaldatasets AzureBlob
workspaceblobstore AzureBlob
workspacefilestore AzureFile


In [280]:
datastore = Datastore.get(ws, datastore_name='workspaceblobstore')

In [281]:
from azureml.core import Workspace, Dataset

subscription_id = '6c8ba98b-ed1b-4d48-aeff-a06760245180'
resource_group = 'IMDBDeploy-rg'
workspace_name = 'imdb_workspace'

workspace = Workspace(subscription_id, resource_group, workspace_name)

df = Dataset.get_by_name(workspace, name='imdbtest')
df = df.to_pandas_dataframe()

In [285]:
run = exp.start_logging(snapshot_directory=None)                   
run.log("Experiment start time", str(datetime.datetime.now()))

In [286]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [287]:
df = df.sample(frac=0.1, random_state=0) #uncomment to use full set of data

# Drop missing values
df.dropna(inplace=True)

df

Unnamed: 0,review,sentiment
11841,John Cassavetes is on the run from the law. He...,positive
19602,It's not just that the movie is lame. It's mor...,negative
45519,"Well, if it weren't for Ethel Waters and a 7-y...",negative
25747,I find Alan Jacobs review very accurate concer...,positive
42642,This movie is simply awesome. It is so hilario...,positive
...,...,...
9869,This is an excellent movie that tackles the is...,positive
42192,i was kinda interested in this movie as a tras...,negative
12143,I think I am some kind of Road Runner fan. I d...,positive
34963,Two years passed and mostly everyone looks dif...,positive


In [288]:
print("Summary statistics of numerical features : \n", df.describe())

print("=======================================================================")

print("\nTotal number of reviews: ",len(df))

print("=======================================================================")

print("\nTotal number of Sentiments: ", len(list(set(df['sentiment']))))

df['sentiment'] = np.where(df['sentiment'] == "positive", 1, 0)
df

Summary statistics of numerical features : 
                                                    review sentiment
count                                                5000      5000
unique                                               4995         2
top     Ill-conceived sequel(..the absurd idea of havi...  negative
freq                                                    2      2553

Total number of reviews:  5000

Total number of Sentiments:  2


Unnamed: 0,review,sentiment
11841,John Cassavetes is on the run from the law. He...,1
19602,It's not just that the movie is lame. It's mor...,0
45519,"Well, if it weren't for Ethel Waters and a 7-y...",0
25747,I find Alan Jacobs review very accurate concer...,1
42642,This movie is simply awesome. It is so hilario...,1
...,...,...
9869,This is an excellent movie that tackles the is...,1
42192,i was kinda interested in this movie as a tras...,0
12143,I think I am some kind of Road Runner fan. I d...,1
34963,Two years passed and mostly everyone looks dif...,1


In [289]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], \
                                                    test_size=0.1, random_state=0)

print('Load %d training examples and %d validation examples. \n' %(X_train.shape[0],X_test.shape[0]))
print('Show a review in the training set : \n', X_train.iloc[10])
X_train,y_train

Load 4500 training examples and 500 validation examples. 

Show a review in the training set : 
 THE CRIMSON RIVERS is one of the most over-directed, over-the-top, over-everything mess I've ever seen come out of France. There's nothing worse than a French production trying to out-do films made in Hollywood and CR is a perfect example of such a wannabe horror/action/buddy flick. I almost stopped it halfway through because I knew it wouldn't amount to anything but French guys trying to show-off.<br /><br />The film starts off promisingly, like some sort of expansive horror film, but it quickly shifts genres, from horror to action to x-files type to buddy flick, that in the end, CR is all of it and also none of it. It's so full of clichés that at one point I thought the whole thing was a comedy. The painful dialogue and those silent pauses, with fades outs and fades ins just at the right expositionary moments, made me groan. I thought only films made in Hollywood used this hackneyed techn

(23499    Paul Naschy as a ghostly security guard in thi...
 32754    For every series that makes it to television, ...
 41199    I had the displeasure of watching this movie w...
 4152     Hear are some of the interesting things our co...
 36723    Every time I think about this film I feel phys...
                                ...                        
 33530    I can not believe such slanted, jingoistic mat...
 16729    This movie was rented by a friend. Her choice ...
 33642    A British teen movies which centres around a g...
 10195    I only gave this film a 4 because I saw it in ...
 7520     What a disappointment!<br /><br />This film se...
 Name: review, Length: 4500, dtype: object,
 23499    1
 32754    0
 41199    0
 4152     0
 36723    0
         ..
 33530    0
 16729    0
 33642    0
 10195    0
 7520     0
 Name: sentiment, Length: 4500, dtype: int64)

In [349]:
X_train.to_csv(r'File Name.csv', index = True,index_label="index")

  """Entry point for launching an IPython kernel.


In [290]:
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False, \
             ):
    '''
    Convert a raw review to a cleaned review
    '''
    text = BeautifulSoup(raw_text, 'html.parser').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split() 
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    if stemming==True:

        stemmer = SnowballStemmer('english') 
        words = [stemmer.stem(w) for w in words]
        
    if split_text==True:
        return (words)
    
    return( " ".join(words))

In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [657]:
print("great")

great


In [631]:
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
from bs4 import BeautifulSoup 
import logging
from wordcloud import WordCloud
from gensim.models import word2vec
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Show a cleaned review in the training set : \n',  X_train_cleaned[10])
    
for d in X_test:
    X_test_cleaned.append(cleanText(d))

Show a cleaned review in the training set : 
 the crimson rivers is one of the most over directed over the top over everything mess i ve ever seen come out of france there s nothing worse than a french production trying to out do films made in hollywood and cr is a perfect example of such a wannabe horror action buddy flick i almost stopped it halfway through because i knew it wouldn t amount to anything but french guys trying to show off the film starts off promisingly like some sort of expansive horror film but it quickly shifts genres from horror to action to x files type to buddy flick that in the end cr is all of it and also none of it it s so full of clich s that at one point i thought the whole thing was a comedy the painful dialogue and those silent pauses with fades outs and fades ins just at the right expositionary moments made me groan i thought only films made in hollywood used this hackneyed technique the chase scene with vincent cassel running after the killer is so over 

In [662]:
%%memit

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_cleaned)
print("Number of features : %d \n" %len(countVect.get_feature_names())) #6378 
print("Show some feature names : \n", countVect.get_feature_names()[::1000])


# Train mnb
mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

predictions = mnb.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)

Number of features : 36751 

Show some feature names : 
 ['aa', 'ameche', 'auggie', 'betrayals', 'bright', 'cathryn', 'clownhouse', 'copying', 'dazzle', 'disarray', 'dvd', 'estimation', 'fighter', 'fusion', 'greenfinch', 'henson', 'imaginings', 'ir', 'kint', 'linklater', 'maropis', 'misik', 'nectar', 'organise', 'performing', 'pre', 'rages', 'reputedly', 'saddled', 'sexiness', 'smith', 'steal', 'swoozie', 'tinfoil', 'unattuned', 'vernacular', 'willed']

Accuracy on validation set: 0.8140

AUC score : 0.8142

Classification report : 
               precision    recall  f1-score   support

           0       0.79      0.86      0.82       249
           1       0.85      0.77      0.81       251

    accuracy                           0.81       500
   macro avg       0.82      0.81      0.81       500
weighted avg       0.82      0.81      0.81       500


Confusion Matrix : 
 [[214  35]
 [ 58 193]]
peak memory: 3365.85 MiB, increment: 0.00 MiB


In [663]:
%%memit

from sklearn.linear_model import LogisticRegression
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
print("Number of features : %d \n" %len(tfidf.get_feature_names())) #1722
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

feature_names = np.array(tfidf.get_feature_names())
sorted_coef_index = lr.coef_[0].argsort()
print('\nTop 10 features with smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Top 10 features with largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

predictions = lr.predict(tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

Number of features : 10505 

Show some feature names : 
 ['00', 'belonged', 'completion', 'dubious', 'garbage', 'interviewing', 'million', 'plays', 'rough', 'strike', 'vein']

Top 10 features with smallest coefficients :
['bad' 'worst' 'awful' 'no' 'waste' 'poor' 'terrible' 'boring' 'even'
 'minutes']

Top 10 features with largest coefficients : 
['great' 'and' 'excellent' 'best' 'it' 'wonderful' 'very' 'also' 'well'
 'love']

Accuracy on validation set: 0.8500

AUC score : 0.8500

Classification report : 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85       249
           1       0.85      0.85      0.85       251

    accuracy                           0.85       500
   macro avg       0.85      0.85      0.85       500
weighted avg       0.85      0.85      0.85       500


Confusion Matrix : 
 [[211  38]
 [ 37 214]]
peak memory: 3365.37 MiB, increment: 0.02 MiB


In [658]:
%%memit 

from sklearn.linear_model import LogisticRegression
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train)
print("Number of features : %d \n" %len(tfidf.get_feature_names())) #1722
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

feature_names = np.array(tfidf.get_feature_names())
sorted_coef_index = lr.coef_[0].argsort()
print('\nTop 10 features with smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Top 10 features with largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

predictions = lr.predict(tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


UsageError: Line magic function `%%memit` not found.


In [640]:
pip install memory_profiler

Collecting memory_profiler
  Downloading memory_profiler-0.58.0.tar.gz (36 kB)
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25ldone
[?25h  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-py3-none-any.whl size=30183 sha256=9e6b05fde95fa02c96073166c92d1bf1f282dc45fd4ec49aabd812f2542f6275
  Stored in directory: /home/azureuser/.cache/pip/wheels/8c/47/b0/6aa7f5774be599d4f5256b58061f8264dd0ec24bb9de56f568
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0
Note: you may need to restart the kernel to use updated packages.


In [655]:
%%memit

from sklearn.model_selection import  GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
estimators = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())]
model = Pipeline(estimators)


params = {"lr__C":[0.1, 1, 10], 
          "tfidf__min_df": [1, 3], 
          "tfidf__max_features": [1000, None], 
          "tfidf__ngram_range": [(1,1), (1,2)], 
          "tfidf__stop_words": [None, "english"]} 

grid = GridSearchCV(estimator=model, param_grid=params, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_cleaned, y_train)
print("The best paramenter set is : \n", grid.best_params_)


# Evaluate on the validaton set
predictions = grid.predict(X_test_cleaned)
modelEvaluation(predictions)

The best paramenter set is : 
 {'lr__C': 10, 'tfidf__max_features': None, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}

Accuracy on validation set: 0.8720

AUC score : 0.8720

Classification report : 
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       249
           1       0.87      0.88      0.87       251

    accuracy                           0.87       500
   macro avg       0.87      0.87      0.87       500
weighted avg       0.87      0.87      0.87       500


Confusion Matrix : 
 [[216  33]
 [ 31 220]]
peak memory: 3527.63 MiB, increment: 0.00 MiB


In [616]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def parseSent(review, tokenizer, remove_stopwords=False):

    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(cleanText(raw_sentence, remove_stopwords, split_text=True))
    return sentences


# Parse each review in the training set into sentences
sentences = []
for review in X_train_cleaned:
    sentences += parseSent(review, tokenizer,remove_stopwords=False)
    
print('%d parsed sentence in the training set\n'  %len(sentences))
print('Show a parsed sentence in the training set : \n',  sentences[10])

4500 parsed sentence in the training set

Show a parsed sentence in the training set : 
 ['the', 'crimson', 'rivers', 'is', 'one', 'of', 'the', 'most', 'over', 'directed', 'over', 'the', 'top', 'over', 'everything', 'mess', 'i', 've', 'ever', 'seen', 'come', 'out', 'of', 'france', 'there', 's', 'nothing', 'worse', 'than', 'a', 'french', 'production', 'trying', 'to', 'out', 'do', 'films', 'made', 'in', 'hollywood', 'and', 'cr', 'is', 'a', 'perfect', 'example', 'of', 'such', 'a', 'wannabe', 'horror', 'action', 'buddy', 'flick', 'i', 'almost', 'stopped', 'it', 'halfway', 'through', 'because', 'i', 'knew', 'it', 'wouldn', 't', 'amount', 'to', 'anything', 'but', 'french', 'guys', 'trying', 'to', 'show', 'off', 'the', 'film', 'starts', 'off', 'promisingly', 'like', 'some', 'sort', 'of', 'expansive', 'horror', 'film', 'but', 'it', 'quickly', 'shifts', 'genres', 'from', 'horror', 'to', 'action', 'to', 'x', 'files', 'type', 'to', 'buddy', 'flick', 'that', 'in', 'the', 'end', 'cr', 'is', 'all', 

In [622]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [623]:
%%time
from wordcloud import WordCloud
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
num_features = 300  #embedding dimension                     
min_word_count = 10                
num_workers = 4       
context = 10                                                                                          
downsampling = 1e-3 

print("Training Word2Vec model ...\n")
w2v = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count,\
                 window = context, sample = downsampling)
w2v.init_sims(replace=True)
w2v.save("w2v_300features_10minwordcounts_10context") #save trained word2vec model

print("Number of words in the vocabulary list : %d \n" %len(w2v.wv.index2word)) #4016 
print("Show first 10 words in the vocalbulary list  vocabulary list: \n", w2v.wv.index2word[0:10])

X_train_cleaned = []
for review in X_train:
    X_train_cleaned.append(cleanText(review, remove_stopwords=True, split_text=True))
trainVector = getAvgFeatureVecs(X_train_cleaned, w2v, num_features)
print("Training set : %d feature vectors with %d dimensions" %trainVector.shape)


# Get feature vectors for validation set
X_test_cleaned = []
for review in X_test:
    X_test_cleaned.append(cleanText(review, remove_stopwords=True, split_text=True))
testVector = getAvgFeatureVecs(X_test_cleaned, w2v, num_features)
print("Validation set : %d feature vectors with %d dimensions" %testVector.shape)

Training Word2Vec model ...

Number of words in the vocabulary list : 6945 

Show first 10 words in the vocalbulary list  vocabulary list: 
 ['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']


  del sys.path[0]


Training set : 4500 feature vectors with 300 dimensions
Validation set : 500 feature vectors with 300 dimensions
CPU times: user 29.3 s, sys: 474 ms, total: 29.8 s
Wall time: 17.6 s


In [618]:
def makeFeatureVec(review, model, num_features):
    '''
    Transform a review to a feature vector by averaging feature vectors of words 
    appeared in that review and in the volcabulary list created
    '''
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index2word) #index2word is the volcabulary list of the Word2Vec model
    isZeroVec = True
    for word in review:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])
            isZeroVec = False
    if isZeroVec == False:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [619]:
def getAvgFeatureVecs(reviews, model, num_features):
    '''
    Transform all reviews to feature vectors using makeFeatureVec()
    '''
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model,num_features)
        counter = counter + 1
    return reviewFeatureVecs

In [664]:
%%memit


from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(trainVector, y_train)
predictions = rf.predict(testVector)
modelEvaluation(predictions)


Accuracy on validation set: 0.7600

AUC score : 0.7600

Classification report : 
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       249
           1       0.77      0.75      0.76       251

    accuracy                           0.76       500
   macro avg       0.76      0.76      0.76       500
weighted avg       0.76      0.76      0.76       500


Confusion Matrix : 
 [[192  57]
 [ 63 188]]
peak memory: 3365.38 MiB, increment: 0.00 MiB


In [665]:
from tensorflow.python import keras
print(keras.__version__)

2.2.4-tf


In [487]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from collections import defaultdict
from keras.layers.convolutional import Convolution1D
from keras import backend as K
from keras.layers.embeddings import Embedding

In [502]:
top_words = 40000 
maxlen = 200 
batch_size = 62
nb_classes = 2
nb_epoch = 6


# Vectorize X_train and X_test to 2D tensor
tokenizer = Tokenizer(num_words=top_words) #only consider top 20000 words in the corpse
tokenizer.fit_on_texts(X_train)
# tokenizer.word_index #access word-to-index dictionary of trained tokenizer

sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

X_train_seq = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test_seq = sequence.pad_sequences(sequences_test, maxlen=maxlen)


# one-hot encoding of y_train and y_test
y_train_seq = np_utils.to_categorical(y_train, nb_classes)
y_test_seq = np_utils.to_categorical(y_test, nb_classes)

print('X_train shape:', X_train_seq.shape)
print("========================================")
print('X_test shape:', X_test_seq.shape)
print("========================================")
print('y_train shape:', y_train_seq.shape)
print("========================================")
print('y_test shape:', y_test_seq.shape)
print("========================================")

X_train shape: (4500, 200)
X_test shape: (500, 200)
y_train shape: (4500, 2)
y_test shape: (500, 2)


In [503]:
y_train_seq[0]

array([0., 1.], dtype=float32)

In [504]:
x  =  np.concatenate((X_train_seq,X_test_seq),axis=0)
print(x.shape)
y  =  np.concatenate((y_train_seq,y_test_seq),axis=0)
print(y.shape)
res  =  np.concatenate((x,y),axis=1)
print(res.shape)

(5000, 200)
(5000, 2)
(5000, 202)


In [336]:
import pandas as pd 
pd.DataFrame(res).to_csv("res.csv")

In [505]:
model1 = Sequential()
model1.add(Embedding(top_words, 128, dropout=0.2))
model1.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model1.add(Dense(nb_classes))
model1.add(Activation('softmax'))
model1.summary()

  
  This is separate from the ipykernel package so we can avoid doing imports until


Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 128)         5120000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_7 (Activation)    (None, 2)                 0         
Total params: 5,251,842
Trainable params: 5,251,842
Non-trainable params: 0
_________________________________________________________________


In [626]:
%%time

model1.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model1.fit(X_train_seq, y_train_seq, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)

# Model evluation
score = model1.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))

  """
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test loss : 1.2437
Test accuracy : 0.7120
CPU times: user 7min 50s, sys: 1min 31s, total: 9min 22s
Wall time: 3min 7s


In [659]:
%%memit

model1.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model1.fit(X_train_seq, y_train_seq, batch_size=batch_size, nb_epoch=nb_epoch, verbose=1)

# Model evluation
score = model1.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))

  
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Test loss : 1.7507
Test accuracy : 0.7360
peak memory: 3718.12 MiB, increment: 224.91 MiB


In [507]:
len(X_train_seq),len(y_train_seq)

(4500, 4500)

In [508]:
import pickle
pickle.dump(model1,open('model1.pkl','wb'))

In [509]:
print("Size of weight matrix in the embedding layer : ", \
      model1.layers[0].get_weights()[0].shape)

# get weight matrix of the hidden layer
print("Size of weight matrix in the hidden layer : ", \
      model1.layers[1].get_weights()[0].shape)

# get weight matrix of the output layer
print("Size of weight matrix in the output layer : ", \
      model1.layers[2].get_weights()[0].shape)

Size of weight matrix in the embedding layer :  (40000, 128)
Size of weight matrix in the hidden layer :  (128, 512)
Size of weight matrix in the output layer :  (128, 2)


In [299]:
run.log("Experiment end time", str(datetime.datetime.now()))
run.complete()

In [300]:
print(run.get_portal_url())

https://ml.azure.com/experiments/lstm-imdb3/runs/d0530400-8029-45a9-b44f-2dba97e2dcf3?wsid=/subscriptions/6c8ba98b-ed1b-4d48-aeff-a06760245180/resourcegroups/IMDBDeploy-rg/workspaces/imdb_workspace


In [301]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [302]:
run.get_metrics()

{'Experiment start time': '2020-12-08 15:15:37.504926',
 'Experiment end time': '2020-12-08 15:18:05.677733'}

In [184]:
print(os.getenv('AZUREML_MODEL_DIR'))

None


In [407]:
import os
from sklearn.externals import joblib

def init():
    global loaded_model
    model_path = os.path.join('.', 'sklearn_regression_model.pkl')
    loaded_model = joblib.load(model_path)

init()
    
twt = ['bad movie']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#print(twt)
sentiment = model1.predict(twt)

if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

print(type(sentiment))

negative
<class 'numpy.ndarray'>


In [180]:
from azureml.core.model import Model
import logging
logging.basicConfig(level=logging.DEBUG)
print(Model.get_model_path(model_name='model-lstm', version=2))

ERROR - Model not found in cache or in root at ./model-lstm. For more info,set logging level to DEBUG.



WebserviceException: WebserviceException:
	Message: Model not found in cache or in root at ./model-lstm. For more info,set logging level to DEBUG.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Model not found in cache or in root at ./model-lstm. For more info,set logging level to DEBUG."
    }
}

In [122]:
score = model.evaluate(X_test_seq, y_test_seq, batch_size=batch_size)



In [123]:
print('Test loss : {:.4f}'.format(score[0]))
print('Test accuracy : {:.4f}'.format(score[1]))

Test loss : 0.4354
Test accuracy : 0.8680


In [176]:
model = Model.register(model_path = os.path.join('.', 'model1.pkl'),
                       model_name = "model-lstm",
                       tags = {"key": "1"},
                       description = "lstm",
                       workspace = ws)

Registering model model-lstm


In [552]:
%%writefile score.py

import keras
import numpy as np
from keras import losses
from keras import metrics
from keras import optimizers
from azureml.core.model import Model
from keras.models import load_model
import os
import joblib
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
#from sklearn.externals import joblib

#Load the model
def init():
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model1.pkl')
    model = joblib.load(model_path)

#Score
def run(data):
    import json
    tokenizer = Tokenizer(num_words=40000) #only consider top 20000 words in the corpse
    twt = tokenizer.texts_to_sequences(json.loads(data))
    #padding the tweet to have exactly the same shape as `embedding_2` input
    twt = sequence.pad_sequences(twt, maxlen=28, dtype='int32', value=0)
    sentiment = model.predict(twt)[0]
    if(np.argmax(sentiment) == 0):
        label = "NEGATIVE"
    elif (np.argmax(sentiment) == 1):
        label = "POSITIVE"
    prediction = json.dumps({"label": label, "score": sentiment.tolist(), "twt":twt.tolist(),"data":json.loads(data) })
    #prediction = json.dumps({"TWT": twt.tolist(),"TCT": tCt.tolist() })
    return prediction
    #return pred
    #return json.dumps(str(pred))

# Predict sentiment using the model
# def predict(data):
#     # Tokenize text
#     # Vectorize X_train and X_test to 2D tensor
#     import json
#     tokenizer = Tokenizer(num_words=40000) #only consider top 20000 words in the corpse
#     twt = tokenizer.texts_to_sequences(json.loads(data))
#     #padding the tweet to have exactly the same shape as `embedding_2` input
#     twt = sequence.pad_sequences(twt, maxlen=28, dtype='int32', value=0)
#     #print(twt)
#     sentiment = model.predict(twt)[0]
#     if(np.argmax(sentiment) == 0):
#         label = "NEGATIVE"
#     elif (np.argmax(sentiment) == 1):
#         label = "POSITIVE"

#     return {"label": label, "score": sentiment}  

# twt = ['Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs']
# print(run(twt))

Overwriting score.py


In [587]:
%%writefile score.py

import keras
import numpy as np
from keras import losses
from keras import metrics
from keras import optimizers
from azureml.core.model import Model
from keras.models import load_model
import os
import joblib
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
#from sklearn.externals import joblib

#Load the model
def init():
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model1.pkl')
    model = joblib.load(model_path)

#Score
def run(data):
    import json
    inputdata = json.loads(data)
    twt = inputdata['data']
    sentiment = model.predict(twt)[0]
    if(np.argmax(sentiment) == 0):
        label = "NEGATIVE"
    elif (np.argmax(sentiment) == 1):
        label = "POSITIVE"
    prediction = json.dumps({"label": label, "score": sentiment.tolist()})
    #prediction = json.dumps({"TWT": twt.tolist(),"TCT": tCt.tolist() })
    return prediction
    #return pred
    #return json.dumps(str(pred))


Overwriting score.py


In [205]:
from azureml.core.conda_dependencies import CondaDependencies 

myenv = CondaDependencies()
required_packages = ["numpy", "keras==2.3.1", "tensorflow==2.0.0", "joblib", "scikit-learn", "ruamel.yaml==0.16"]

for package in required_packages:
    myenv.add_conda_package(package)
with open("myenv.yml","w") as f:
    f.write(myenv.serialize_to_string())
    
print("myenv.yml created succesfully, check the current directory to find it")

myenv.yml created succesfully, check the current directory to find it


In [206]:
with open("myenv.yml","r") as f:
    print(f.read())

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults

- numpy
- keras==2.3.1
- tensorflow==2.0.0
- joblib
- scikit-learn
- ruamel.yaml==0.16
channels:
- anaconda
- conda-forge



In [588]:
from azureml.core.webservice import AciWebservice

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 1
                                               )

In [660]:
%%memit

from azureml.core.image import ContainerImage

image_config = ContainerImage.image_configuration(execution_script = "score.py",
                                                  runtime = "python",
                                                  conda_file = "myenv.yml",
                                                  description = "Image for IMBD Reviews Classifier Keras Model",
                                                  tags = {"data": "imdb", "reviews": "classifier"})

  


peak memory: 3365.26 MiB, increment: 0.00 MiB


In [590]:
os.path.join('.', 'model1.pkl')

'./model1.pkl'

In [661]:
%%memit

from azureml.core.webservice import Webservice
from azureml.core import Workspace

#<PLEASE ADD YOUR AZURE SUBSCRIPTION ID HERE>
#<PLEASE ADD THE RESOURCE GROUP YOU CREATED AT THE BEGGINING>
ws= Workspace.get(name='imdb_workspace',subscription_id='6c8ba98b-ed1b-4d48-aeff-a06760245180', resource_group='IMDBDeploy-rg')
service_name = 'test400'
service = Webservice.deploy(deployment_config = aciconfig,
                                image_config = image_config,
                                model_paths = [os.path.join('.', 'model1.pkl')],
                                name = service_name,
                                workspace = ws)


service.wait_for_deployment(show_output = True)
print(service.state)

Registering model model1.pkl
Creating image
Running.....................................................................................................................
Succeeded
Image creation operation finished for image test400:1, operation "Succeeded"
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running..........................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy
peak memory: 3365.89 MiB, increment: 0.62 MiB


In [199]:
service.get_logs()

'2020-12-08T01:52:44,482759138+00:00 - nginx/run \n2020-12-08T01:52:44,481548430+00:00 - gunicorn/run \n2020-12-08T01:52:44,491266791+00:00 - iot-server/run \n2020-12-08T01:52:44,494895713+00:00 - rsyslog/run \nEdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...\n2020-12-08T01:52:44,743188547+00:00 - iot-server/finish 1 0\n2020-12-08T01:52:44,750498693+00:00 - Exit code 1 is normal. Not restarting iot-server.\nStarting gunicorn 19.9.0\nListening at: http://127.0.0.1:31311 (13)\nUsing worker: sync\nworker timeout is set to 300\nBooting worker with pid: 42\nException in worker process\nTraceback (most recent call last):\n  File "/opt/miniconda/lib/python3.6/site-packages/gunicorn/arbiter.py", line 583, in spawn_worker\n    worker.init_process()\n  File "/opt/miniconda/lib/python3.6/site-packages/gunicorn/workers/base.py", line 129, in init_process\n    self.load_wsgi()\n  File "/opt/miniconda/lib/python3.6/site-packages/gunicorn/workers/base.py", line 138, in load_

In [566]:

from azureml.core.model import InferenceConfig

inference_config = InferenceConfig(entry_script='score.py',
                                   runtime = "python",
                                conda_file = "myenv.yml"
                                   )

In [567]:
model1

<keras.engine.sequential.Sequential at 0x7f59035855f8>

In [568]:
from azureml.core.webservice import LocalWebservice, Webservice
model_path = os.path.join('.', 'model1.pkl')
model = joblib.load(model_path)
deployment_config = LocalWebservice.deploy_configuration(port=8890)
service = Model.deploy(ws, "myservice", [os.path.join('.', 'model1.pkl')], inference_config, deployment_config)
service.wait_for_deployment(show_output = True)
print(service.state)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
ERROR - ModelNotFound: Model with id ./model1.pkl not found in provided workspace



Copying local model ./model1.pkl to /tmp/azureml_k2epve_l/model1.pkl/0
Generating Docker build context.


KeyboardInterrupt: 

In [592]:
print(service.scoring_uri)

http://bced7c1a-787b-49b7-9955-9a3c4c2412e5.westeurope.azurecontainer.io/score


In [265]:
primary, secondary = service.get_keys()
print(primary)

LGQxCwCh6BbqjlDyCHWIfsZYFs4leRaa


In [603]:
import requests
import json

# URL for the web service
scoring_uri = 'http://bced7c1a-787b-49b7-9955-9a3c4c2412e5.westeurope.azurecontainer.io/score'


with open(os.path.join('.', 'tokenizer.pkl'),'rb') as handle:
        tokenizer = pickle.load(handle)
        
#twt = ['the movie is the best i have ever seen, share it with my friends definitely!!!']
twt = ['Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs']
twt = tokenizer.texts_to_sequences(twt)
#twt = [163,18,3,3048,12088,13052,4,2850,1222,8390,125,437,3760,4,98,479,39,3033,3,710,190,12,45,1803,710,2,1803,795]
#twt = sequence.pad_sequences(twt, maxlen=28, dtype='int32', value=0)

# Two sets of data to score, so we get two results back
data = {"data": twt}
#data = {"data": twt}

# Convert to JSON string
input_data = json.dumps(data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp.text)

"{\"label\": \"NEGATIVE\", \"score\": [0.8598217964172363, 0.14017817378044128]}"


In [585]:
import requests
import json

# URL for the web service
scoring_uri = 'http://62f96f23-4a6c-463e-95a6-aec90713f572.westeurope.azurecontainer.io/score'

twt = ['Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs']
#twt = ['the movie is the best i have ever seen, share it with my friends definitely!!!']
twt = tokenizer.texts_to_sequences(twt)
#twt = [163,18,3,3048,12088,13052,4,2850,1222,8390,125,437,3760,4,98,479,39,3033,3,710,190,12,45,1803,710,2,1803,795]
twt = sequence.pad_sequences(twt, maxlen=28, dtype='int32', value=0)
# Two sets of data to score, so we get two results back
data = {"data": twt.tolist()}
#data = {"data": twt}
# Convert to JSON string
input_data = json.dumps(data)
#print(input_data)
uploaddata = json.loads(input_data)
print(uploaddata['data'])
# Set the content type
headers = {'Content-Type': 'application/json'}

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
#print(resp.text.split(',')[0].split(':')[1][3:-2])
print(resp.text)

[[163, 18, 3, 3048, 12088, 13052, 4, 2850, 1222, 8390, 1425, 437, 3760, 4, 98, 479, 39, 3033, 3, 710, 190, 12, 45, 1803, 710, 2, 1803, 795]]
"{\"label\": \"POSITIVE\", \"score\": [0.006039343774318695, 0.9939606189727783], \"twt\": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], \"data\": {\"data\": [[163, 18, 3, 3048, 12088, 13052, 4, 2850, 1222, 8390, 1425, 437, 3760, 4, 98, 479, 39, 3033, 3, 710, 190, 12, 45, 1803, 710, 2, 1803, 795]]}}"


In [594]:
import pickle
pickle.dump(tokenizer,open('tokenizer.pkl','wb'))

In [595]:
word = ['Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs']
    #padding the tweet to have exactly the same shape as `embedding_2` input
#word = ['oh fuck this movie is fucking crazy, what the fuck is it telling, piece of shit']

with open(os.path.join('.', 'tokenizer.pkl'),'rb') as handle:
        newtokenizer = pickle.load(handle)


word = newtokenizer.texts_to_sequences(word)
#twt = [163,18,3,3048,12088,13052,4,2850,1222,8390,125,437,3760,4,98,479,39,3033,3,710,190,12,45,1803,710,2,1803,795]
twt = sequence.pad_sequences(word, maxlen=28, dtype='int32', value=0)
#word = np.array(word)
sentiment = model1.predict(uploaddata['data'])[0]
if(np.argmax(sentiment) == 0):
    label = "NEGATIVE"
elif (np.argmax(sentiment) == 1):
    label = "POSITIVE"
print(type(word))
print(sentiment)
print(label)
print(twt)


<class 'list'>
[0.85982186 0.14017816]
NEGATIVE
[[  163    18     3  3048 12088 13052     4  2850  1222  8390  1425   437
   3760     4    98   479    39  3033     3   710   190    12    45  1803
    710     2  1803   795]]


In [None]:
import json
from azureml.core import Datastore
from azureml.core.dataset import Dataset
from azureml.data import dataset_type_definitions

input_json = {'data': [163,18,3,3048,12088,13052,4,2850,1222,8390,125,437,3760,4,98,479,39,3033,3,710,190,12,45,1803,710,2,1803,795]}
# create a string that can be utf-8 encoded and
# put in the body of the request
serialized_input_json = json.dumps(input_json)
dataset_content = []
for i in range(100):
    dataset_content.append(serialized_input_json)
dataset_content = '\n'.join(dataset_content)
file_name = 'sample_data.txt'
f = open(file_name, 'w')
f.write(dataset_content)
f.close()

# upload the txt file created above to the Datastore and create a dataset from it
data_store = Datastore.get_default(ws)
data_store.upload_files(['./' + file_name], target_path='sample_data')
datastore_path = [(data_store, 'sample_data' +'/' + file_name)]
sample_request_data = Dataset.Tabular.from_delimited_files(
    datastore_path, separator='\n',
    infer_column_types=True,
    header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)
sample_request_data = sample_request_data.register(workspace=ws,
                                                   name='sample_data',
                                                   create_new_version=True)

In [None]:
from azureml.core.model import InferenceConfig, Model
from azureml.core.dataset import Dataset


model = Model(ws, id="model1.pkl:32")
inference_config = InferenceConfig(entry_script='score.py',
                                   runtime = "python",
                                conda_file = "myenv.yml"
                                   )
input_dataset = Dataset.get_by_name(workspace=ws, name='sample_data')
profile = Model.profile(ws,
            'unique_name',
            [model],
            inference_config,
            input_dataset=input_dataset)

profile.wait_for_completion(True)

# see the result
details = profile.get_details()