In [None]:
#https://www.instaclustr.com/blog/machine-learning-over-streaming-kafka-data-part-5-incremental-tensorflow-training-with-kafka-data/

In [1]:
import os
from datetime import datetime 
 
import time 
 
import threading 
 
import json 
 
from kafka import KafkaProducer 
 
from kafka.errors import KafkaError 
 
from sklearn.model_selection import train_test_split 
 
import pandas as pd 
 
import tensorflow as tf 
 
import tensorflow_io as tfio 

drone_iterator = pd.read_csv('data/train.csv',chunksize=10000) 
drone_df = next(drone_iterator) 
drone_df.head() 

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
NUM_COLUMNS = 784
 
BATCH_SIZE=32 
 
EPOCHS=20 
print(NUM_COLUMNS)

784


In [3]:
train_df, test_df = train_test_split(drone_df, test_size=0.01, shuffle=False) 
print("Number of training samples: ",len(train_df)) 
x_train_df = train_df.drop('label', axis=1)
y_train_df=train_df['label']

Number of training samples:  9900


In [4]:
# The labels are set as the kafka message keys so as to store data 
# in multiple-partitions. Thus, enabling efficient data retrieval 
# using the consumer groups. 
 
x_train = list(filter(None, x_train_df.to_csv(index=False).split("\n")[1:])) 
y_train = list(filter(None, y_train_df.to_csv(index=False).split("\n")[1:])) 

len(x_train), len(y_train) 

(9900, 9900)

In [5]:
def error_callback(exc): 
 
    raise Exception('Error while sending data to kafka: {0}'.format(str(exc))) 
 
 
def write_to_kafka(topic_name, items): 
 
  count=0 
 
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092']) 
 
  for message, key in items: 
 
    producer.send(topic_name, key=key.encode('utf-8'), value=message.encode('utf-8')).add_errback(error_callback) 
 
    count+=1 
 
  producer.flush() 
 
  print("Wrote {0} messages into topic: {1}".format(count, topic_name)) 
 
 
write_to_kafka("train-test1", zip(x_train, y_train))

Wrote 9900 messages into topic: train-test1


In [6]:
train_df, test_df = train_test_split(drone_df, test_size=0.00001, shuffle=False) 
NUM_COLUMNS = len(x_train_df.columns) 
# build model  
# Set the parameters 
OPTIMIZER="adam"    
LOSS=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False) 
METRICS=['accuracy']  

In [7]:
train_df, test_df = train_test_split(drone_df, test_size=0.00001, shuffle=False) 
 
NUM_COLUMNS = len(x_train_df.columns) 

# build model  
# Set the parameters 
OPTIMIZER="adam" 
LOSS=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False) 
METRICS=['accuracy'] 
# 
 
# 32 is the default 
 
BATCH_SIZE=32 
 
EPOCHS=20 
 

model = tf.keras.Sequential([ 
 
  tf.keras.layers.Input(shape=(NUM_COLUMNS,)), 
  #tf.keras.layers.Flatten(input_shape=(784,)),
  tf.keras.layers.Dense(128, activation='relu'), 
 
  tf.keras.layers.Dropout(0.2), 
 
  tf.keras.layers.Dense(256, activation='relu'), 
 
  tf.keras.layers.Dropout(0.4), 
 
  tf.keras.layers.Dense(128, activation='relu'), 
 
  tf.keras.layers.Dropout(0.4), 
 
  tf.keras.layers.Dense(10, activation='softmax') 
 
]) 
  
print(model.summary()) 
 
# compile the model 
 
model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               100480    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 10)                1

In [8]:
drone_features = drone_df.drop('label',axis=1)
drone_labels = drone_df['label']

In [9]:
list=[]
ss = 100 
start_t_ns = time.process_time_ns() 

for x in range(0, len(drone_features)-ss, ss): 
    continue 

df = drone_features.iloc[x:x+ss] 
dl = drone_labels.iloc[x:x+ss] 

In [10]:
df

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
9800,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9898,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:

model.fit(df, dl, batch_size=8, epochs=EPOCHS) 
print("any good?") 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
any good?


In [12]:
# evaluate on all data seen so far and keep a record 
 
res = model.evaluate(drone_features[0:x+ss], drone_labels[0:x+ss]) 
print(res) 
list.append(res[1]) 
end_t_ns = time.process_time_ns() 
time_diff = end_t_ns - start_t_ns 
print("training time (ns) = ", time_diff) 

[3.211879253387451, 0.5548484921455383]
training time (ns) =  7390625000


In [13]:
res = model.predict(drone_features[0:99]) 
for r in res: 
    for v in r: 
        print(v)

0.00090539735
0.0032441884
0.0031895512
0.1184624
0.004148268
0.00044991638
0.002607971
1.3339077e-05
0.86696327
1.5734084e-05
0.8815217
1.9860747e-14
0.11760843
5.3297346e-13
4.947201e-08
7.8943225e-07
0.00085997157
8.688085e-06
3.1708164e-10
4.095587e-07
9.6901966e-11
0.99999833
7.938477e-09
1.08220334e-07
2.5299216e-12
4.334167e-09
1.07855186e-07
1.573983e-12
1.4049332e-06
5.6652207e-09
5.495702e-08
2.311055e-15
0.9924986
4.273457e-16
2.5291807e-05
1.7147909e-09
0.0074760914
1.5796098e-09
8.1593776e-14
3.22294e-12
0.9953366
3.566907e-11
0.00039041927
4.469805e-09
2.1260233e-07
1.3197891e-06
0.00385525
0.00010612473
7.834047e-07
0.00030931117
0.27057776
6.1148644e-06
0.19343399
0.00070479495
0.008963304
0.01687324
0.006928435
0.00047543517
0.0002772299
0.5017597
2.3957726e-17
1.9399822e-13
7.469631e-14
8.858824e-18
3.4775698e-12
2.7007032e-13
2.1879094e-17
1.0
2.1914733e-12
3.8052415e-16
7.2012005e-15
8.136722e-10
8.40692e-09
0.99990165
3.4101012e-07
7.4357547e-07
4.340258e-09
1.6687

In [14]:
for r in res: 
    for v in r: 
        if v <= 0.2 or v >= 0.8: 
            print(round(v)) 
        else: 
            print("-1")

0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
-1
0
0
0
0
0
0
0
0
-1
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
-1
-1
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
-1
0
0
-1
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
-1
0
0
0
0
0
0
0
-1
0
0
0
0
-1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
-1
0
0
0
-1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
-1
-1
0
0
0
0
0
0
0
0
-1
0
0
-1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
-1
0
-1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
-1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0


In [15]:
#Read from kafka
import pandas as pd 
 
import tensorflow as tf 
 
import tensorflow_io as tfio 

online_train_ds = tfio.experimental.streaming.KafkaBatchIODataset( 
 
    topics=["test-train1"], 
 
    group_id="drone_1", 
 
    servers="127.0.0.1:9092", 
 
    stream_timeout=20000, # in milliseconds, to block indefinitely, set it to -1. 
 
    configuration=[ 
        "session.timeout.ms=10000", 
        "max.poll.interval.ms=12000", 
        "auto.offset.reset=earliest",  
        "batch.num.messages=100" 
    ], 
 
) 
 

def decode_kafka_online_item(raw_message, raw_key):
    message = tf.io.decode_csv(raw_message, [[0.0] for i in range(NUM_COLUMNS)]) 
    key = tf.strings.to_number(raw_key) 
    return (message, key)

Instructions for updating:
Use `tf.data.Dataset.take_while(...)


In [16]:
online_train_ds

<KafkaBatchIODataset element_spec=DatasetSpec((TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None)), TensorShape([]))>

In [17]:
for mini_ds in online_train_ds: 
    mini_ds = mini_ds.map(decode_kafka_online_item) 

    #mini_ds = mini_ds.batch(1) 

    l = len(mini_ds) 

    print("len = ", l) 

    if l > 0: 
        model.fit(mini_ds, batch_size=BATCH_SIZE, epochs=EPOCHS)

len =  0
len =  0
len =  0


In [18]:
"""for mini_ds in online_train_ds: 
    mini_ds = mini_ds.map(decode_kafka_online_item)    
    print(mini_ds)
    mini_ds = mini_ds.batch(1) 
    l = len(mini_ds) 
    mini_ds = mini_ds.unbatch()
    print(type(mini_ds))
    if l > 0: 
        print('in fit')
        model.fit(mini_ds, batch_size=BATCH_SIZE, epochs=EPOCHS)"""

"for mini_ds in online_train_ds: \n    mini_ds = mini_ds.map(decode_kafka_online_item)    \n    print(mini_ds)\n    mini_ds = mini_ds.batch(1) \n    l = len(mini_ds) \n    mini_ds = mini_ds.unbatch()\n    print(type(mini_ds))\n    if l > 0: \n        print('in fit')\n        model.fit(mini_ds, batch_size=BATCH_SIZE, epochs=EPOCHS)"

In [19]:
ss=100
loop = 0 
 
total = 0 
 
best_acc = 0.0 
 
accs = [] 
 
 
 
for mini_ds in online_train_ds: 
 
  loop = loop + 1 
 
  print("loop = ", loop) 
 
  mini_ds = mini_ds.shuffle(buffer_size=32) 
 
  mini_ds = mini_ds.map(decode_kafka_online_item) 
 
  mini_ds = mini_ds.batch(1) 
 
  l = len(mini_ds) 
 
  print("len = ", l) 
 
  total = total + l 
 
  print("total = ", total) 
 
# take the first 80 for training 
 
  train_mini_ds = mini_ds.take(80) 
 
# take last 20 for evaluation 
 
  if loop==1: 
 
    test_mini_ds = mini_ds.skip(80) 
 
  else: 
 
# prefer newer data 
 
    test_mini_ds = mini_ds.skip(80).concatenate(test_mini_ds) 
 
# only keep 1000 results 
 
  test_mini_ds = test_mini_ds.take(1000) 
 
  print("test len = ", len(test_mini_ds)) 
 
  if l > 0: 
 
   model.fit(train_mini_ds, batch_size=BATCH_SIZE, epochs=EPOCHS)   
 
   res = model.evaluate(test_mini_ds) 
 
   last_acc = res[1] 
 
   accs.append(last_acc) 
 
   print("accuracy = ", last_acc) 
 
   if last_acc > best_acc: 
 
       best_acc = last_acc 
 
   
 
print(accs) 
 
print("best accuracy = ", best_acc)
    
    
    

[]
best accuracy =  0.0


In [20]:
"""import tensorflow as tf
import numpy as np

(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()

TRAIN_BUF=1000
BATCH_SIZE=64

train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE, drop_remainder=True)

# print(train_dataset, type(train_dataset), test_dataset, type(test_dataset))

train_np = np.stack(list(train_dataset))
test_np = np.stack(list(test_dataset))
print(type(train_np), train_np.shape)
print(type(test_np), test_np.shape)"""

'import tensorflow as tf\nimport numpy as np\n\n(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()\n\nTRAIN_BUF=1000\nBATCH_SIZE=64\n\ntrain_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE, drop_remainder=True)\ntest_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE, drop_remainder=True)\n\n# print(train_dataset, type(train_dataset), test_dataset, type(test_dataset))\n\ntrain_np = np.stack(list(train_dataset))\ntest_np = np.stack(list(test_dataset))\nprint(type(train_np), train_np.shape)\nprint(type(test_np), test_np.shape)'