# Header Files:

In [None]:
##Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import string
import time
import warnings
import json
warnings.filterwarnings("ignore")

##TSFresh
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

##ML scikit learn classes for data preprocessing:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split

##ML scikit learn classes for feature selection:
from sklearn.feature_selection import chi2,mutual_info_classif  ### for chi2 and mutual info
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif  ### for ANOVA
from sklearn.feature_selection import RFE  ### for RFE

##ML scikit learn classes for model selection:  
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

##ML scikit learn classes for evaluating model:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,accuracy_score,make_scorer,confusion_matrix,precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve

##ML scikit learn classes for creating Pipeline:
from sklearn.pipeline import Pipeline

## Autokeras Library:
import autokeras as ak
from autokeras import StructuredDataClassifier

## Deep Learning Libraries:
import tensorflow as tf
from tensorflow.keras.models import load_model,Model
from tensorflow.keras.layers import concatenate,Input,Dense,ReLU,BatchNormalization,Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model


# Stacked Ensemble Redone::

Here I have used the dataset which corresponds to Top 200 Features selected on the basis of ANOVA for EMG and IMU.

Preprocessing the data:
- This time the data was split into train and test sets (note: the code below passes test_size=0.80, so 20% of the rows are used for training and 80% for testing). Two ways were tried: random sampling and stratified sampling.
- Standardised the test data using a scaler fitted on the train data
- Saved the unscaled train data and the scaled test data

In [None]:
### Label Encoding, Removing Zero variance Features and Scaling the test data::
def initial(df, test_size=0.80, stratify=True, random_state=None):
    """Split ``df`` into a training part and a standardised test part.

    The ``label`` column is separated from the features and label-encoded
    when it is not numeric, feature columns with zero standard deviation are
    dropped, and the remaining rows are split.  A ``StandardScaler`` is fitted
    on the training features and applied to the test features only, so the
    returned training features stay unscaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and a ``label`` column.
    test_size : float, default 0.80
        Fraction of rows placed in the test part (forwarded to
        ``train_test_split``).  The default keeps the value this code was
        written with, i.e. 20% of the rows are used for training.
    stratify : bool, default True
        Stratify the split on the labels; ``False`` gives plain random
        sampling.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X, X_val, y, y_val)``: unscaled training features, scaled test
        features (a DataFrame carrying the training column names), training
        labels and test labels, each with a fresh 0..n-1 index.
    """

    #### Label Encoding the Target Variable

    X=df.drop(["label"],axis=1)
    y=df["label"]
    # ``dtype==str`` is never true for a pandas column (text is stored with
    # an object/string dtype), so test for "not numeric" instead.
    if not pd.api.types.is_numeric_dtype(y):   ### Will apply label encoding if needed
        le=LabelEncoder()
        # Keep the original index and name so y stays aligned with X.
        y=pd.Series(le.fit_transform(y),index=df.index,name="label")

    #### Removing Features having zero variance.

    Var=X.std()
    col=Var[Var==0].index
    X=X.drop(col,axis=1)

    X, X_val, y, y_val = train_test_split(
        X, y,
        stratify=y if stratify else None,
        test_size=test_size,
        random_state=random_state,
    )

    ### Scaling the Data:
    # Only the test part is transformed; the training part is returned unscaled.
    sc=StandardScaler()
    sc.fit(X)
    X_val=sc.transform(X_val)

    return X.reset_index(drop=True), pd.DataFrame(X_val,columns=X.columns), y.reset_index(drop=True), y_val.reset_index(drop=True)

In [None]:
### For the best model using EMG and NEMG Features (EMG+NEMG Combined)
df_emg=pd.read_csv("EMG_ANOVA_200_features.csv").rename({'0':'label'},axis=1)
df_nemg=pd.read_csv("NEMG_ANOVA_200_features.csv").rename({"0":"label"},axis=1)

### Pair the two feature tables class by class: within a class, row i of the
### EMG table is placed next to row i of the NEMG table (the NEMG table
### supplies the "label" column).
paired=[]
for i in range(26):
    n=df_nemg[df_nemg["label"]==i].reset_index(drop=True)
    e=df_emg[df_emg["label"]==i].reset_index(drop=True).drop(["label"],axis=1)
    # Keep only rows that exist in both tables; otherwise the column-wise
    # concat would pad the shorter side with NaN rows.
    rows=min(len(e),len(n))
    paired.append(pd.concat([e.iloc[:rows],n.iloc[:rows]],axis=1))

# One concat at the end instead of growing a DataFrame inside the loop.
temp=pd.concat(paired,axis=0)

In [None]:
# Split the combined table once; the resulting parts are written to disk below.
X, X_val, y, y_val=initial(temp)   ### Remains Fixed

## X,y ----> Training data
## X_val,y_val ----> Testing data   Which is being stored for training and evaluating the other level 0 models
X.shape,X_val.shape
# ((4736, 400), (18944, 400))
y.shape,y_val.shape
# ((4736,), (18944,))


# Put features and labels side by side so each CSV holds one complete table.
train=pd.concat([X,y],axis=1)
test=pd.concat([X_val,y_val],axis=1)

train.shape,test.shape
# ((4736, 401), (18944, 401))

test.to_csv("Test_data_str.csv",index=False)    ### Scaled Test Data
train.to_csv("Train_data_str.csv",index=False)  ### Unscaled Train Data

Now the saved unscaled training data is read back and autokeras is run with max_trials=28 to find the best model:

In [None]:
### Label Encoding and Removing Zero variance Features
def initial1(df):
    """Separate features and labels of ``df`` and drop constant features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and a ``label`` column.

    Returns
    -------
    tuple
        ``(X, y)``: the feature columns whose standard deviation is not zero,
        and the labels (label-encoded when the column is not numeric).
    """

    #### Label Encoding the Target Variable

    X=df.drop(["label"],axis=1)
    y=df["label"]
    # ``dtype==str`` is never true for a pandas column (text is stored with
    # an object/string dtype), so test for "not numeric" instead.
    if not pd.api.types.is_numeric_dtype(y):   ### Will apply label encoding if needed
        le=LabelEncoder()
        # Keep the original index and name so y stays aligned with X.
        y=pd.Series(le.fit_transform(y),index=df.index,name="label")

    #### Removing Features having zero variance.

    Var=X.std()
    col=Var[Var==0].index
    X=X.drop(col,axis=1)

    return X,y

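Functions to find the best model using the autokeras library.

Parameter descriptions: max_trials - maximum number of trials for the autokeras search

                    data - name of the feature set (EMG/NEMG/Both); used in the result key and in the saved model name

                    X, y - feature table and labels (funct_autokeras splits and scales them itself)

Note: feature_count (number of features to be selected) is not an argument of the functions below.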

In [None]:
def funct_autokeras_accuracy(X_train, X_test, y_train, y_test,max_trials,data):
    """Run an autokeras search, record its test accuracy and save the best model.

    Parameters
    ----------
    X_train, y_train
        Data the ``StructuredDataClassifier`` search is fitted on.
    X_test, y_test
        Data the search result is evaluated on.
    max_trials : int
        Forwarded to ``StructuredDataClassifier(max_trials=...)``.
    data : str
        Name of the feature set; used in the result key and the model path.

    Returns
    -------
    float
        The accuracy rounded to three decimals (also stored in the
        module-level ``accuracy`` dict under ``"<data>_<max_trials>"``).

    Side effects
    ------------
    Saves the exported model to ``"<data>_<max_trials>_strat"``; when that
    fails the model is saved to ``"<data>_<max_trials>_strat.h5"`` instead.
    """

    search = StructuredDataClassifier(max_trials=max_trials)

    # perform the search
    search.fit(x=X_train, y=y_train, verbose=0)

    # evaluate the model
    _, acc = search.evaluate(X_test, y_test, verbose=0)
    score=round(acc, 3)
    accuracy[f"{data}_{max_trials}"]=score

    # get the best performing model
    model = search.export_model()

    # save the best performing model to file
    target=f"{data}_{max_trials}_strat"
    try:
        model.save(target, save_format="tf")
    except Exception as err:
        # Best-effort fallback kept from the original code, but reported:
        # the model then lives under a different path than the one above.
        # print() is used because the file silences the warnings module.
        print(f"Saving '{target}' with save_format='tf' failed ({err!r}); saving '{target}.h5' instead")
        model.save(f"{target}.h5")

    return score


def funct_autokeras(max_trials,data,X,y):    ### data="EMG"/"NEMG"/"Both"
    """Split ``X``/``y``, standardise the features and start the autokeras search.

    The split is stratified on ``y`` with 30% of the rows held out and a fixed
    seed.  The scaler is fitted on the training part only and then applied to
    both parts before they are handed to ``funct_autokeras_accuracy``.
    """
    ### Splitting the data:
    parts = train_test_split(X, y, stratify=y, test_size=0.3, random_state=1)
    train_X, test_X, train_y, test_y = parts

    ### Scaling the Data:
    scaler = StandardScaler().fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    funct_autokeras_accuracy(train_X_scaled, test_X_scaled, train_y, test_y, max_trials, data)


accuracy={}   ### filled in by funct_autokeras_accuracy, keyed "<data>_<max_trials>"
### Read the unscaled training split written by train.to_csv above.  The file
### saved there is "Train_data_str.csv", matching the "_strat" outputs below
### (the previous "Train_data_random.csv" is not produced by this code).
df=pd.read_csv("Train_data_str.csv").rename({'0':'label'},axis=1)
X,y=initial1(df)   ### Remains Fixed
### Iterating through various values:
funct_autokeras(28,"Both",X,y)

with open('accuracy_results_stacked_28_strat.json', 'w') as fp:
    json.dump(accuracy, fp,  indent=4)

Creating the Architecture of Best Model on Tensorflow:

In [None]:
# Load the model that funct_autokeras_accuracy saved for data="Both", max_trials=28.
# NOTE(review): if that save fell back to the ".h5" variant, this path will not exist - confirm.
base_model = load_model("Both_28_strat", custom_objects=ak.CUSTOM_OBJECTS)
base_model.summary()

In [None]:
class Model:
    """Fully connected softmax classifier with two hidden blocks.

    Each hidden block is Dense -> BatchNormalization -> ReLU -> Dropout.

    NOTE: this class shadows ``Model`` imported from
    ``tensorflow.keras.models`` at the top of the file; the name is kept
    because ``funct_cv`` instantiates it as ``Model(m, l1, l2)``.

    Parameters
    ----------
    number
        Identifier used in the saved file name ``model_<number>.h5``.
    l1, l2 : int
        Units of the first and second hidden Dense layer.
    n_features : int, default 400
        Number of input features.
    n_classes : int, default 26
        Number of output classes (units of the softmax layer).
    dropout : float, default 0.2
        Dropout rate used after each hidden block.
    """

    def __init__(self,number,l1,l2,n_features=400,n_classes=26,dropout=0.2):
        self.number=number
        layers=tf.keras.layers
        self.model = tf.keras.Sequential()
        # ``(400)`` is just the int 400; the shape must be a one-element tuple.
        self.model.add(layers.InputLayer(input_shape=(n_features,)))
        # NOTE(review): this Normalization layer is never adapted in this
        # code - confirm it is meant to stay as it is.
        self.model.add(layers.Normalization(axis=-1))
        for units in (l1,l2):
            self.model.add(layers.Dense(units=units,activation=None))   ###
            self.model.add(layers.BatchNormalization())
            self.model.add(layers.ReLU())
            self.model.add(layers.Dropout(dropout))
        self.model.add(layers.Dense(units=n_classes,activation="softmax"))
        self.model.compile(optimizer="adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(),metrics=["accuracy"])

    def funct_fit(self,X_train,X_test,y_train,y_test,count,n_split,epochs=50,batch_size=50):
        """Train the network and return its per-epoch accuracies.

        ``X_test``/``y_test`` are used as validation data during training.
        The model is written to ``model_<number>.h5`` in the current working
        directory only when ``count == n_split``.

        Returns
        -------
        tuple
            ``(accuracy, val_accuracy)`` lists from the training history.
        """
        history=self.model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=epochs,batch_size=batch_size)
        if count==n_split:
            self.model.save(f"{os.getcwd()}/model_{self.number}.h5")
        return history.history["accuracy"],history.history["val_accuracy"]

def funct_avg(d):
    """Return a new dict mapping every key of ``d`` to the mean of its values."""
    return {name: np.mean(values) for name, values in d.items()}

### cross validation:
def funct_cv(m,n_split,X,y,l1,l2):
    """Cross-validate one ``Model`` configuration and store the averaged scores.

    For every ``StratifiedKFold(n_split)`` fold a scaler is fitted on the
    training rows, a fresh ``Model(m, l1, l2)`` is trained, and the best
    training and validation accuracy over its epochs are kept.  The per-fold
    values are averaged with ``funct_avg`` and stored in the module-level
    ``final_result`` dict under key ``m``.  ``X`` and ``y`` are indexed with
    integer index arrays.
    """
    best_train=[]
    best_val=[]
    folds=StratifiedKFold(n_split).split(X,y)

    # fold_no runs from 1 to n_split; funct_fit saves the model on the last one.
    for fold_no,(train_index,test_index) in enumerate(folds,start=1):
        scaler=StandardScaler()
        train_scaled=scaler.fit_transform(X[train_index])
        test_scaled=scaler.transform(X[test_index])

        net=Model(m,l1,l2)   ### creating the model object
        acc,val_acc=net.funct_fit(train_scaled,test_scaled,y[train_index],y[test_index],fold_no,n_split)

        best_train.append(max(acc))
        best_val.append(max(val_acc))

    final_result[m]=funct_avg({"accuracy":best_train,"val_accuracy":best_val})

# Filled in by funct_cv: one entry per model number with the fold-averaged accuracies.
final_result={}
# Arguments: model number, number of folds, features, labels, units of hidden layer 1 and 2.
funct_cv(1,5,X.values,y.values,60,60)
funct_cv(2,5,X.values,y.values,90,90)
funct_cv(3,5,X.values,y.values,150,150)

with open('accuracy_results_StratifiedCV.json', 'w') as fp:
    json.dump(final_result, fp,  indent=4)          

Using Base Models for final Prediction::

In [None]:
allmodels=[base_model]

#### Loading other models:
def funct_load(number):
    """Load ``model_1.h5`` .. ``model_<number>.h5`` and append them to ``allmodels``.

    Each model's summary is printed right after it is loaded.
    """
    for offset in range(number):
        # load model from file
        member = load_model(f'model_{offset+1}.h5')
        member.summary()
        # add to list of members
        allmodels.append(member)

funct_load(3)

### Read the scaled test split from the relative path it was written to
### ("Test_data_str.csv") instead of a machine-specific absolute path.
df=pd.read_csv("Test_data_str.csv").rename({'0':'label'},axis=1)
### NOTE(review): initial1 also drops zero-variance columns; confirm the test
### table keeps the feature count the loaded models expect.
X_test,y_test=initial1(df)   ### Remains Fixed


# create stacked model input dataset as outputs from the individual ensemble models
def stacked_dataset(allmodels, X_test):
    """Build the meta-learner input from the predictions of ``allmodels``.

    Parameters
    ----------
    allmodels : sequence
        Objects exposing ``predict(X, verbose=0)`` that return a
        ``[rows, class]`` array each.
    X_test
        Input passed unchanged to every model's ``predict``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``[rows, class * members]``.  For each row the values
        are ordered class by class, and within a class model by model.

    Raises
    ------
    ValueError
        If ``allmodels`` is empty.
    """
    if len(allmodels) == 0:
        raise ValueError("stacked_dataset needs at least one model")

    # make predictions
    member_preds = [np.asarray(model.predict(X_test, verbose=0)) for model in allmodels]

    # stack predictions into [rows, class, members]; a single stack call also
    # works for one model and avoids recopying the array for every member
    stackX = np.stack(member_preds, axis=-1)

    # flatten predictions to [rows, class x members]
    stackX = stackX.reshape((stackX.shape[0], stackX.shape[1]*stackX.shape[2]))
    return stackX

# NOTE: stacked_model_test builds its own stacked dataset from its arguments,
# so this module-level stackedX is not what the meta-learners are fitted on.
stackedX = stacked_dataset(allmodels, X_test)

# Filled in by stacked_model_test: meta-learner name -> mean cross-validated accuracy.
results={}

# evaluate standalone models on test dataset
for model in allmodels:
    y_hat=model.predict(X_test)
    # turn the per-class scores into the index of the highest-scoring class
    y_hat=np.argmax(y_hat,axis=1)
    acc = accuracy_score(y_test, y_hat)
    print('Model Accuracy: %.6f' % acc)


def funct_report_csv(y,y_hat):
    """Build a per-class precision/recall/F1/support table with a summary row.

    Parameters
    ----------
    y, y_hat
        True and predicted labels, passed to
        ``precision_recall_fscore_support``.

    Returns
    -------
    pandas.DataFrame
        One row per class plus a final row indexed ``"avg/total"`` that holds
        the column mean (rounded to two decimals) for precision, recall and
        f1-score, and the column sum for support.
    """
    clf_rep = precision_recall_fscore_support(y, y_hat)
    out_dict = {
        "precision": clf_rep[0].round(2),
        "recall": clf_rep[1].round(2),
        "f1-score": clf_rep[2].round(2),
        "support": clf_rep[3],
    }
    out_df = pd.DataFrame(out_dict)
    avg_tot = (out_df.apply(lambda x: round(x.mean(), 2) if x.name!="support" else  round(x.sum(), 2)).to_frame().T)
    avg_tot.index = ["avg/total"]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    out_df = pd.concat([out_df, avg_tot])
    return out_df

def classification_report_with_accuracy_score(y, y_hat,model_name):
    """Write a report and two confusion-matrix plots, then return the accuracy.

    Files written (an existing file of the same name is overwritten, so
    repeated calls with the same ``model_name`` keep only the last result):

    - ``Classification_Report_<model_name>.csv`` from ``funct_report_csv``
      (saved without the index, so the row labels are not in the file)
    - ``Confusion_Matrix_<model_name>_simple.png`` with raw counts
    - ``Confusion_Matrix_<model_name>_axis=1.png`` with each row divided by
      its row sum

    Returns
    -------
    float
        ``accuracy_score(y, y_hat)``.
    """
    #report=classification_report(y, y_hat,output_dict=True) # print classification report
    report=funct_report_csv(y, y_hat) # print classification report
    report.to_csv(f"Classification_Report_{model_name}.csv",index=False)

    cm=confusion_matrix(y,y_hat)

    # For normalising the Matrix for better visualisation.  Rows whose sum is
    # zero are left at 0 instead of producing NaN through a division by zero.
    row_totals=cm.sum(axis=1)[:, np.newaxis]
    cm_1=np.divide(cm.astype('float'),row_totals,out=np.zeros(cm.shape),where=row_totals!=0)

    for matrix,suffix in ((cm,"simple"),(cm_1,"axis=1")):
        fig=plt.figure(figsize=(20,10))
        plt.rc("font",size=10)
        sns.heatmap(matrix,annot=True,fmt=".2f",cmap="viridis")
        plt.savefig(f"Confusion_Matrix_{model_name}_{suffix}.png")
        # Close the figure: this function is called many times during
        # cross-validation and open figures would otherwise pile up.
        plt.close(fig)

    return accuracy_score(y, y_hat) # return accuracy score

    
def stacked_model_test(allmodels,X,y):
    """Cross-validate several meta-learners on the stacked predictions.

    The predictions of ``allmodels`` on ``X`` are turned into one feature
    table with ``stacked_dataset``.  Every candidate meta-learner is scored
    with 5-fold ``cross_val_score``; the scorer is
    ``classification_report_with_accuracy_score``, which also writes a
    report and plots per call.  The mean score, rounded to five decimals, is
    stored in the module-level ``results`` dict under the learner's name.
    """
    # These estimators are not imported at the top of the file, so import
    # them here to keep the function usable on its own.
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC

    # dictionary of all models
    sample={
        "XGB":XGBClassifier(),
        "LGBM":LGBMClassifier(),
        "RandomForest":RandomForestClassifier(),
        "LR":LogisticRegression(),
        "CatBoost":CatBoostClassifier(),
        "Naive Bayes":GaussianNB(),
        "SVC":SVC(),
        "KNN_3":KNeighborsClassifier(n_neighbors=3),
        "KNN_5":KNeighborsClassifier(n_neighbors=5),
    }
    # create dataset using ensemble
    stackedX = stacked_dataset(allmodels, X)

    for name,estimator in sample.items():
        # model_name is forwarded to the scoring function on every call
        scorer=make_scorer(classification_report_with_accuracy_score,model_name=name)
        scores=cross_val_score(estimator,stackedX,y,cv=5,scoring=scorer)
        results[name]=round(scores.mean(),5)
           

# Score every meta-learner on the stacked test predictions; fills `results`.
stacked_model_test(allmodels,X_test,y_test)
#### To store the results in the form of json that is prettified:
# NOTE(review): the file name says "random-sampling" while the test data read above is "Test_data_str.csv" - confirm the name.
with open('accuracy_results_final_stacked_random-sampling.json', 'w') as fp:
    json.dump(results, fp,  indent=4)     