### Client Setup

Accepts streaming data from the server, then trains an ensemble model built from the following ML models:
- Random Forest Classifier
- Logistic Regression
- Linear Discriminant Analysis
- K-Neighbors Classifier
- Classification And Regression Trees
- Support Vector Classifier

<br>
<br>

In [84]:
# Importing required modules

# data streaming
import websockets

# data processing
from datetime import datetime
import pandas as pd

# ensemble modelling
import time
import sys
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [85]:
# Loading prebuilt model structures stored as '[MODEL].h5'
# NOTE: despite the '.h5' extension, these files are read with pickle (not HDF5).
# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.

def _load_model(path):
    """Unpickle and return the object stored at 'path', closing the file afterwards."""
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


rf = _load_model('../models/h5s/RF.h5')
lr = _load_model('../models/h5s/LR.h5')
lda = _load_model('../models/h5s/LDA.h5')
knn = _load_model('../models/h5s/kNN.h5')
cart = _load_model('../models/h5s/CART.h5')
svm = _load_model('../models/h5s/SVC.h5')

In [86]:
# Function to log specified content in specified file

def log(content="", new=False, file='stream.log', timestamp=False, print_line=0):
    """
    Logs specified content to the specified file in the current working directory

    :param content: text to write into the log file
    :param new: if True then the file is overwritten, else the content is appended. Default is False
    :param file: destination file (resolved as './<file>') to save logs into; created if missing
    :param timestamp: if True, writes the content on a new line prefixed with the current timestamp
    :param print_line: if non-zero, writes a separator line built with the specified number of '-'
    :return: None
    """
    # 'w' truncates the file before writing, 'a' keeps the existing content
    mode = 'w' if new else 'a'

    # the context manager closes the file even if one of the writes fails
    with open(f'./{file}', mode) as log_file:
        # to add time stamp
        if timestamp:
            log_file.write(f"\n{datetime.now()} ~ {content}")
        # without timestamp and new lines
        else:
            log_file.write(content)

        # to print a line
        if print_line:
            line = "-" * print_line
            log_file.write(f"\n{line}\n\n")


In [87]:
# Metric tables filled in while chunks are processed.
# The per-model table uses two-level (metric, model) column labels, following
# https://stackoverflow.com/questions/24290495/constructing-3d-pandas-dataframe

_metric_labels = ['accuracy', 'precision', 'recall', 'f1_score', 'time_taken']
_model_labels = ['RF', 'LR', 'LDA', 'kNN', 'CART', 'SVM', 'VC']

# flat, equally long label arrays: each metric repeated once per model,
# and the model list cycled once per metric
metrics_df_columns = np.repeat(_metric_labels, len(_model_labels))
models = np.tile(_model_labels, len(_metric_labels))

# empty table with one (metric, model) column per pair
chunk_wise_6M_ensemble_metrics = pd.DataFrame(
    columns=pd.MultiIndex.from_tuples(list(zip(metrics_df_columns, models)))
)

# empty table for the metrics of the combined (old + new chunk) ensemble
chunk_wise_final_ensemble_metrics = pd.DataFrame(
    columns=[
        'model_version',
        'accuracy',
        'precision',
        'recall',
        'f1_score_value',
        'time_consumed',
    ]
)

In [88]:
# def add_metrics(final):

In [89]:
async def ensemble(IoT, chunk_count, initial=False):
    """
    Trains a hard-voting ensemble on one chunk of data and records per-model metrics

    The chunk is split 80/20 into train and test sets. The voting classifier and each
    of the six base models are fitted on the train set, scored on the test set, and one
    row of metrics is appended to the global 'chunk_wise_6M_ensemble_metrics' table.

    :param IoT: DataFrame holding the chunk; must contain the columns 'label', 'date',
                'time' and 'type'. 'type' is the target, the remaining columns are features
    :param chunk_count: number of the chunk being processed (currently not used here)
    :param initial: flag for the first chunk (currently not used here)
    :return: the fitted VotingClassifier
    :raises ValueError: propagated from scikit-learn when a model cannot be fitted on the
                        chunk (e.g. the chunk contains a single class); the caller handles it
    """
    global chunk_wise_6M_ensemble_metrics, metrics_df_columns
    # data preparation
    x = IoT.drop(['label', 'date', 'time', 'type'], axis=1)
    y = IoT['type']

    # data split into test and train sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # ensemble model
    # Fitted first: VotingClassifier fits clones of its estimators, so an unusable chunk
    # raises here before any of the shared base models has been refitted.
    voting = VotingClassifier(estimators=[('RF', rf), ('LR', lr), ('LDA', lda), ('KNN', knn), ('CART', cart), ('SVM', svm)], voting='hard')
    start = time.time()
    voting.fit(x_train, y_train)
    voting_fit_time = time.time() - start

    # one list per metric, each filled in model order RF, LR, LDA, kNN, CART, SVM, VC
    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'time_taken': []}
    for model in (rf, lr, lda, knn, cart, svm, voting):
        if model is voting:
            # already fitted above; reuse its timing instead of fitting a second time
            fit_time = voting_fit_time
        else:
            # time each base model's own fit so 'time_taken' is per model
            start = time.time()
            model.fit(x_train, y_train)
            fit_time = time.time() - start
        y_pred = model.predict(x_test)

        # calculate required metrics; restricting 'labels' to the predicted classes
        # keeps never-predicted classes out of the weighted averages
        predicted_labels = np.unique(y_pred)
        scores['accuracy'].append(accuracy_score(y_test, y_pred))
        scores['precision'].append(precision_score(y_test, y_pred, average='weighted', labels=predicted_labels))
        scores['recall'].append(recall_score(y_test, y_pred, average='weighted', labels=predicted_labels))
        scores['f1_score'].append(f1_score(y_test, y_pred, average='weighted', labels=predicted_labels))
        scores['time_taken'].append(fit_time)

    # The table's columns are grouped by metric (all models' accuracy, then all models'
    # precision, ...), so the row has to be laid out metric by metric to line up with them.
    metrics_row = np.array([
        value
        for metric in ('accuracy', 'precision', 'recall', 'f1_score', 'time_taken')
        for value in scores[metric]
    ])

    # append to 'chunk_wise_6M_ensemble_metrics'
    chunk_wise_6M_ensemble_metrics = pd.concat([chunk_wise_6M_ensemble_metrics, pd.DataFrame([metrics_row], columns=pd.MultiIndex.from_tuples(zip(metrics_df_columns, models)))])
    return voting

In [90]:
# define data chunk parameters
# 'client' starts with INITIAL_CHUNK_SIZE, switches to FINAL_CHUNK_SIZE once a chunk has
# been trained on successfully, and multiplies the current chunk size by
# CHUNK_SIZE_INCREMENT_FACTOR whenever training on a chunk raises a ValueError.
INITIAL_CHUNK_SIZE = 150   # size of chunk at start of data streaming
FINAL_CHUNK_SIZE = 30    # chunk size during data streaming
CHUNK_SIZE_INCREMENT_FACTOR = 2    # chunk size increment factor when error occurs while training ensemble model

async def client():
    """
    Client function for streaming and processing data simultaneously

    Receives one comma-separated record per websocket message and buffers the records
    into a chunk. When the chunk reaches the current chunk size it is trained on:
    the first accepted chunk yields the initial ensemble, every later chunk yields a
    new ensemble that is combined with the previous model in a VotingClassifier,
    scored, logged to 'ensemble.log' and pickled to './models/ensemble_model.h5'.
    If training raises a ValueError the chunk is kept, the chunk size is multiplied by
    CHUNK_SIZE_INCREMENT_FACTOR and training is retried once enough rows have arrived.
    An acknowledgement ("1") is sent back after every received record.

    The loop has no exit condition of its own; it ends when the websocket raises
    (e.g. when the server closes the connection).

    :return: None
    """
    global chunk_wise_final_ensemble_metrics
    uri = "ws://localhost:8765"    # websocket endpoint
    record_columns = ["date", "time", "state", "sphone_signal", "label", "type"]
    final_metric_columns = ['model_version', 'accuracy', 'precision', 'recall', 'f1_score_value', 'time_consumed']
    temp_row_count = 0    # realtime chunk size
    chunk_size = INITIAL_CHUNK_SIZE
    chunk_count = 0    # realtime chunk count
    initial_chunk = True    # initial chunk flag
    model = None    # latest accepted model; set once the initial chunk has been trained on
    log("", True)    # init 'stream.log' file
    async with websockets.connect(uri, ping_interval=None) as websocket:
        df = pd.DataFrame()    # init dataframe for storing the chunk

        # infinite loop for receiving streamed data
        while True:
            # wait to receive a record of data
            row = await websocket.recv()
            log(f"{temp_row_count} ")    # log row count
            data_list = row.split(",")    # split the row(comma separated format) string into a list
            record = pd.DataFrame([data_list], columns=record_columns)

            # if starting a new chunk, the record becomes the new 'df'
            if temp_row_count == 0:
                df = record
            # else old chunk, append to 'df'
            else:
                df = pd.concat([df, record])
            temp_row_count += 1    # increment chunk size

            # 'temp_row_count' is equal to desired 'chunk_size'
            if temp_row_count == chunk_size:
                # if initial chunk
                if initial_chunk:
                    try:
                        # train ensemble model only once
                        model = await ensemble(df, chunk_count+1, initial=True)
                        # CHUNK ACCEPTED BY MODEL, NO ERRORS
                        initial_chunk = False    # initial chunk accepted, hence set to False
                        chunk_count += 1    # increment 'chunk_count'

                        # log chunk created
                        log(f"Chunk-{chunk_count} created with {temp_row_count} records.", timestamp=True, print_line=50)

                        chunk_size = FINAL_CHUNK_SIZE    # new chunk size reset to desired chunk size
                        temp_row_count = 0    # reset 'temp_row_count' for new chunk

                    # chunk rejected by ensemble model trainer
                    except ValueError:
                        # log for class error
                        log("CLASS ERROR\n", timestamp=True)
                        # keep the buffered rows and retry once the chunk has grown
                        chunk_size *= CHUNK_SIZE_INCREMENT_FACTOR    # multiplicative increase of chunk size

                # not initial chunk
                else:
                    # train ensemble model twice
                    try:
                        # 1. train ensemble model for chunk
                        new_chunk_model = await ensemble(df, chunk_count+1)

                        # 2. ensemble model of previous chunk('model') with model of new chunk('new_chunk_model')
                        start = time.time()
                        new_ensemble_model = VotingClassifier(estimators=[('New Model', new_chunk_model), ('Old Model', model)], voting='hard')

                        # fitting latest chunk into new model
                        # data preparation
                        x = df.drop(['label', 'date', 'time', 'type'], axis=1)
                        y = df['type']

                        # data split into test and train sets
                        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
                        new_ensemble_model.fit(x_train, y_train)
                        end = time.time()

                        # calculate & log required metrics
                        y_pred = new_ensemble_model.predict(x_test)
                        predicted_labels = np.unique(y_pred)
                        accuracy = accuracy_score(y_test, y_pred)
                        precision = precision_score(y_test, y_pred, average='weighted', labels=predicted_labels)
                        recall = recall_score(y_test, y_pred, average='weighted', labels=predicted_labels)
                        f1_score_value = f1_score(y_test, y_pred, average='weighted', labels=predicted_labels)
                        time_consumed = end-start

                        metrics_record = pd.DataFrame([[f'v{chunk_count}', accuracy, precision, recall, f1_score_value, time_consumed]], columns=final_metric_columns)
                        chunk_wise_final_ensemble_metrics = pd.concat([chunk_wise_final_ensemble_metrics, metrics_record])
                        # the whole table is rewritten on every chunk
                        log(chunk_wise_final_ensemble_metrics.to_string(index=False, col_space=15), True, file="ensemble.log", timestamp=False)

                        # SAVING MODEL FOR BAYESIAN MODEL TRAINING
                        filename = './models/ensemble_model.h5'
                        # context manager so the file is flushed and closed right after dumping
                        with open(filename, 'wb') as model_file:
                            pickle.dump(new_ensemble_model, model_file)

                        model = new_ensemble_model    # set 'model' to 'new_ensemble_model'
                        chunk_count += 1    # increment 'chunk_count'

                        # log chunk created
                        log(f"Chunk-{chunk_count} created with {temp_row_count} records.", timestamp=True, print_line=50)
                        chunk_size = FINAL_CHUNK_SIZE    # new chunk size reset to desired chunk size
                        temp_row_count = 0    # reset 'temp_row_count' for new chunk

                    # chunk rejected by ensemble model trainer
                    except ValueError:
                        # log for class error
                        log("CLASS ERROR\n", timestamp=True)
                        # keep the buffered rows and retry once the chunk has grown
                        chunk_size *= CHUNK_SIZE_INCREMENT_FACTOR    # multiplicative increase of chunk size

            # send acknowledgement for receiving a row successfully
            await websocket.send("1")

# Start the streaming client. A bare top-level 'await' only works in an
# async-aware shell such as IPython/Jupyter; a plain script would need
# 'asyncio.run(client())' instead.
await client()

(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b735bb40>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b7360900>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b755d280>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b796b5c0>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b7b2a8c0>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contai

  _warn_prf(average, modifier, msg_start, len(result))
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[
  self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[


(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b700e000>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b6958e80>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b787ff80>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b6aafc00>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0'), <traceback object at 0x7f48b5038540>)
(<class 'ValueError'>, ValueError('This solver needs samples of at least 2 classes in the data, but the data contai

ConnectionClosedOK: received 1001 (going away); then sent 1001 (going away)

<br>
<br>
<center><b>End of File</b></center>