In [1]:
import tensorflow as tf 
import pandas as pd
import numpy as np 

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

import pickle
import os
import numpy as np

In [2]:
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
\ [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [3]:
file = os.path.join('.', 'SO_ml_tags_avocado_188k_v2.csv')

In [6]:
# Load the downloaded CSV through the path built in `file`. header=0 tells
# pandas to replace the file's own header row with `names`. The second label
# keeps its trailing comma because a later cell drops the column by that
# exact name.
column_names = ['tags', 'original_text,', 'text']
data = pd.read_csv(file, names=column_names, header=0)

In [7]:
data.head()

Unnamed: 0,tags,"original_text,",text
0,"matplotlib,pandas","python,matplotlib,pandas",setting xticks and yticks for scatter plot mat...
1,"scikitlearn,keras","python,numpy,scikit-learn,keras,grid-search",gridseachcv - valueerror: found input variable...
2,"matplotlib,scikitlearn","python,numpy,matplotlib,scikit-learn,nmf",non negative matrix factorisation in python on...
3,"pandas,tensorflow","python,pandas,tensorflow,time-series",avocado equivalent to avocado.dataframe.resamp...
4,"matplotlib,pandas","python,matplotlib,plot,pandas",how to plot on avocado python i have a data fr...


In [9]:
data= data.dropna()

In [11]:
data = data.drop(columns=['original_text,'])

# Feature engineering

In [13]:
# Shuffle the rows with a fixed seed so the order is reproducible; the
# train/test split further down slices the rows by position.
data = shuffle(data, random_state=20)
data.head()

Unnamed: 0,tags,text
64533,pandas,selecting specific rows in df based on 2 colum...
124736,matplotlib,avocado: border line drawn on only the first b...
184298,"pandas,matplotlib",avocado dataframe groupby plot i have a datafr...
21765,pandas,how to set avocado dataframe multiindex in con...
7607,keras,what's the difference between lstm() and lstmc...


In [14]:
data.iloc[0].text

'selecting specific rows in df based on 2 columns in python avocado i have data from excel loaded into a avocado dataframe. i now want to select only those rows whose assessment id is the max assessment id per appid and for all the ui seq numbers for that appid.  appid   appname assessment id   ui seq number   question    answer text .    1   appname 2493    11  question    no .    1   appname 13808   11  question    ctry of domicile .    1   appname 13808   11  question    name .    1   appname 35316   11  question    ctry of domicile .        1   appname 35316   11  question    name .    1   appname 35316   11  question    nationality .        1   appname 2493    12  question    corp name .    1   appname 2493    12  question    cr br scr .    1   appname 2493    12  question    inc and assests .    1   appname 2493    12  question    int, ext reg reports .    1   appname 13808   12  question    corp name .    1   appname 35316   12  question    corp name .    1   appname 2493    13 

In [15]:
# Break each comma-separated tag string into a list of individual tag names,
# keeping the row order of `data`.
tags_split = []
for tag_string in data['tags'].values:
    tags_split.append(tag_string.split(','))

In [17]:
tags_split[2]

['pandas', 'matplotlib']

In [18]:
# Learn the set of tags from tags_split and encode each question's tag list as
# a row of 0/1 indicators, one column per entry of tag_encoder.classes_.
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)

In [19]:
num_tags =len(tags_encoded[0])

In [20]:
num_tags

5

In [21]:
print(tag_encoder.classes_)

['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']


In [22]:
tags_encoded[0]

array([0, 0, 1, 0, 0])

In [23]:
tags_encoded[2]

array([0, 1, 1, 0, 0])

In [25]:
# Number of leading (already shuffled) rows that go to the training split.
TRAIN_FRACTION = .8
train_size = int(len(data) * TRAIN_FRACTION)
print("train size: %d" % train_size)

train size: 150559


In [26]:
print("test size: %d" % (len(data) -train_size))

test size: 37640


In [27]:
# Positional split of the encoded labels: the first train_size rows are for
# training, everything after that is for testing.
train_tags, test_tags = tags_encoded[:train_size], tags_encoded[train_size:]

In [28]:
train_tags

array([[0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 1, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]])

# Feature Engineering for our X's (predictors)

In [32]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor:
    """Wraps a Keras ``Tokenizer`` so it can be fitted once and reused.

    The instance stores the vocabulary cap it was given and, after
    ``create_tokenizer`` has run, the fitted tokenizer. The attribute names
    are part of the pickled state of this object, so they are kept stable.
    """

    def __init__(self, vocab_size):
        """Store ``vocab_size``; no tokenizer exists until ``create_tokenizer``."""
        self._vocab_size = vocab_size
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        """Fit a fresh tokenizer on ``text_list`` and keep it on the instance.

        The tokenizer is built with ``num_words`` set to the stored
        vocabulary cap and only assigned once fitting has finished.
        """
        fitted = text.Tokenizer(num_words=self._vocab_size)
        fitted.fit_on_texts(text_list)
        self._tokenizer = fitted

    def transform_text(self, text_list):
        """Return the stored tokenizer's ``texts_to_matrix`` output for ``text_list``.

        ``create_tokenizer`` must have been called first; otherwise the
        stored tokenizer is still ``None`` and the call fails.
        """
        return self._tokenizer.texts_to_matrix(text_list)

Overwriting preprocess.py


In [33]:
from preprocess import TextPreprocessor

In [34]:
# Split the question text at the same train_size boundary used for the labels.
question_text = data['text'].values
train_qs, test_qs = question_text[:train_size], question_text[train_size:]

In [37]:
print(type(train_qs))

<class 'numpy.ndarray'>


In [38]:
# Cap on the tokenizer vocabulary (passed to it as num_words). The encoded rows
# printed further down have this same length.
VOCAB_SIZE=400
processor = TextPreprocessor(VOCAB_SIZE)
type(processor)

preprocess.TextPreprocessor

In [39]:
processor.create_tokenizer(train_qs)

In [40]:
# Encode both splits with the tokenizer that was fitted on train_qs only.
body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [41]:
print(body_train[0])

[0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.

In [42]:
len(body_train[0])

400

In [44]:
# Persist the fitted preprocessor next to the notebook so the prediction class
# can reload it from 'processor_state.pkl'. `pickle` is already imported in the
# first cell.
with open('./processor_state.pkl', 'wb') as state_file:
    pickle.dump(processor, state_file)

# Build and train our model

In [47]:
def create_model(vocab_size, num_tags):
    """Build and compile a small feed-forward multi-label classifier.

    Args:
        vocab_size: Width of each input row (number of features per example).
        num_tags: Number of output units, one per tag.

    Returns:
        A compiled ``tf.keras`` Sequential model with two hidden ReLU layers
        (50 and 25 units) and a sigmoid output layer of ``num_tags`` units.
    """
    model = tf.keras.models.Sequential()

    # The input width must come from the argument, not from the notebook-level
    # VOCAB_SIZE constant, so the function honours whatever the caller passes.
    model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
    model.add(tf.keras.layers.Dense(25, activation='relu'))
    # Sigmoid (not softmax): each tag is scored independently, so several tags
    # can be active for the same example.
    model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
                  
                                    

In [48]:
model =create_model(VOCAB_SIZE, num_tags)

2022-03-18 15:09:21.138579: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-03-18 15:09:21.147647: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-18 15:09:21.147702: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (vm-1a1132b4-6adb-45c9-b1ce-76bd1b8b9bc5): /proc/driver/nvidia/version does not exist
2022-03-18 15:09:21.184566: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  

In [49]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                20050     
                                                                 
 dense_1 (Dense)             (None, 25)                1275      
                                                                 
 dense_2 (Dense)             (None, 5)                 130       
                                                                 
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.fit(body_train, train_tags, epochs=5, batch_size=128, validation_split=0.1)

2022-03-18 15:12:10.876390: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 216804800 exceeds 10% of free system memory.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa74c23db90>

In [51]:
print('Eval loss/accuracy:{}'.format(model.evaluate(body_test, test_tags, batch_size=128)))

 68/295 [=====>........................] - ETA: 0s - loss: 0.0974 - accuracy: 0.8997

2022-03-18 15:13:22.127536: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60224000 exceeds 10% of free system memory.


Eval loss/accuracy:[0.09957630932331085, 0.8975026607513428]


In [52]:
model.save('keras_saved_model.h5')

In [74]:
%%writefile model_prediction2.py
import pickle
import os
import numpy as np

class CustomModelPrediction:
    """Pairs a trained Keras model with the text preprocessor it was trained with."""

    def __init__(self, model, processor):
        """Keep references to the model and the preprocessor."""
        self._model = model
        self._processor = processor

    def predict(self, instances, **kwargs):
        """Preprocess ``instances`` and return the model output as nested lists.

        Args:
            instances: Texts handed to the preprocessor's ``transform_text``.
            **kwargs: Accepted but not used.

        Returns:
            The model's predictions converted with ``.tolist()``.
        """
        features = self._processor.transform_text(instances)
        return self._model.predict(features).tolist()

    @classmethod
    def from_path(cls, model_dir):
        """Load 'keras_saved_model.h5' and 'processor_state.pkl' from ``model_dir``.

        Returns:
            A new instance wrapping the loaded model and preprocessor.
        """
        # Imported inside the method, so importing this module on its own does
        # not import TensorFlow.
        import tensorflow.keras as keras

        model_path = os.path.join(model_dir, 'keras_saved_model.h5')
        state_path = os.path.join(model_dir, 'processor_state.pkl')

        model = keras.models.load_model(model_path)
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only point model_dir at artifacts you produced yourself.
        with open(state_path, 'rb') as state_file:
            processor = pickle.load(state_file)
        return cls(model, processor)
                                        

Writing model_prediction2.py


In [75]:
test_request = [
  "How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error: You must feed a value for placeholder tensor 'input_1' with dtype string and shape [?, 1] def text_preprocess(x): strings = tf.keras.backend.eval(x) vectors = [] for string in strings: vector = string_to_one_hot(string.decode('utf-8')) vectors.append(vector) vectorTensor = tf.constant(np.array(vectors),dtype=tf.float32) return vectorTensor input_text = Input(shape=(1,), dtype=tf.string) embedding = Lambda(text_preprocess)(input_text) dense = Dense(256, activation='relu')(embedding) outputs = Dense(2, activation='softmax')(dense) model = Model(inputs=[input_text], outputs=outputs) model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) model.summary() model.save('test.h5') If I pass a string array into the input layer statically, I can compile the model, but I get the same error if I want to convert the model to tflite. #I replaced this line: input_text = Input(shape=(1,), dtype=tf.string) #by this lines: test = tf.constant(['Hello', 'World']) input_text = Input(shape=(1,), dtype=tf.string, tensor=test) #but calling this ... converter = TFLiteConverter.from_keras_model_file('string_test.h5') tfmodel = converter.convert() #... still leads to this error: InvalidArgumentError: You must feed a value for placeholder tensor 'input_3' with dtype string and shape [2] [[{{node input_3}}]] ",
  "Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]}) print (df) name  age 0    a   10 1    b   20 2    c    5 3    d   23 4    e   58 5    f    4 6    g    6 I use Pandas and matplotlib to read and plot it: import pandas as pd import numpy as np import matplotlib.pyplot as plt import os excel_file = 'test.xlsx' df = pd.read_excel(excel_file, sheet_name=0) df.plot(kind='bar') plt.show() the result shows: enter image description here it use index number as item name, how can I change it to the name, which stored in column name?"]


In [76]:
from model_prediction2 import CustomModelPrediction

In [77]:
classifier = CustomModelPrediction.from_path('.')

In [81]:
results=classifier.predict(test_request)

In [82]:
results[0]

[0.9935271739959717,
 6.608804170582516e-08,
 0.0012646317481994629,
 0.0001308917999267578,
 0.7115932703018188]

In [83]:
# For every request text, list the tags whose predicted score is above the
# cut-off. Scores are paired with tag names by position in
# tag_encoder.classes_.
SCORE_THRESHOLD = 0.7
for text_index, scores in enumerate(results):
    print('Predicted labels for text-{}:'.format(text_index))
    for label, score in zip(tag_encoder.classes_, scores):
        if score > SCORE_THRESHOLD:
            print(label)
    print('\n')

Predicted labels for text-0:
keras
tensorflow


Predicted labels for text-1:




# Package our model and deploy it to AI Platform

In [None]:
...