In [2]:
##Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import string
import time
import warnings
import json
warnings.filterwarnings("ignore")

##TSFresh
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute

##ML scikit learn classes for data preprocessing:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split

##ML scikit learn classes for feature selection:
from sklearn.feature_selection import chi2,mutual_info_classif  ### for chi2 and mutual info
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif  ### for ANOVA
from sklearn.feature_selection import RFE  ### for RFE

##ML scikit learn classes for model selection:  
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression  ### meta-learner of the stacked ensemble

##ML scikit learn classes for evaluating model:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report,accuracy_score,make_scorer,confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve

##ML scikit learn classes for creating Pipeline:
from sklearn.pipeline import Pipeline

## Autokeras Library:
import autokeras as ak
from autokeras import StructuredDataClassifier

from tensorflow.keras.models import load_model
import tensorflow as tf

# Preprocessing:

### Used Dataset:imu-dataset-original

In [None]:
## Iterate through the list of alphabets
## Create a function that takes in letter for file path, i.e "K" 
## Select each file using file path
## Store the csv file name with emg as emg variable(type:Dataframe), file name with other names as nemg(type:Dataframe)
## Concatenate it with the previous variables and return the final emg,nemg
## Pass it into the same function with different letter

In [None]:
# Root folder of the raw recordings; funct_preprocessing joins it with a letter to list that letter's files.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the session.
dir="../input/imu-dataset-original/IMU dataset"

In [None]:
def funct_preprocessing(l,f,fn):
    """Append the raw recordings of one letter to the running EMG / non-EMG tables.

    The files inside ``os.path.join(dir, l)`` are visited in sorted order and
    routed by substring of the file name: "emg" files are stacked into the EMG
    table, "acc" and "gyro" files into their own tables. Accelerometer and
    gyroscope columns are renamed (both files share the same headers), the
    accelerometer timestamp is dropped, and the two tables are placed side by
    side.

    Parameters
    ----------
    l : str
        Name of the letter sub-folder; also written into the "label" column.
    f, fn : pandas.DataFrame
        EMG and non-EMG tables accumulated so far.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(f, fn)`` with this letter's rows appended.
    """
    folder=os.path.join(dir,l)

    emg=pd.DataFrame(columns=['timestamp', 'emg1', 'emg2', 'emg3', 'emg4', 'emg5', 'emg6', 'emg7',
       'emg8'])
    acc=pd.DataFrame()
    gyro=pd.DataFrame()

    for name in sorted(os.listdir(folder)):
        path=os.path.join(folder,name)

        # "emg" wins over the other markers, then "acc", then "gyro"; anything else is ignored.
        if "emg" in name:
            emg=pd.concat([emg,pd.read_csv(path)],axis=0)
        elif "acc" in name:
            acc=pd.concat([acc,pd.read_csv(path)])
        elif "gyro" in name:
            gyro=pd.concat([gyro,pd.read_csv(path)])

    #### accelerometer and gyro files have the same feature names, hence the renaming
    acc.columns=["timestamp","x_acc","y_acc","z_acc"]
    acc=acc.drop(columns="timestamp")
    gyro.columns=["timestamp","x_gyro","y_gyro","z_gyro"]

    # Only the gyroscope timestamp is kept in the side-by-side table.
    nemg=pd.concat([acc,gyro],axis=1)

    emg["label"]=l
    nemg["label"]=l

    return pd.concat([f,emg],axis=0),pd.concat([fn,nemg],axis=0)

In [None]:
# Start from empty tables and append the recordings of every letter A-Z in turn.
f,fn=pd.DataFrame(),pd.DataFrame()
for i in string.ascii_uppercase:
    f,fn=funct_preprocessing(i,f,fn)

In [None]:
# Persist the combined raw tables (EMG and accelerometer+gyroscope) without the index column.
f.to_csv("IMU_EMG.csv",index=False)
fn.to_csv("IMU_NonEMG_acc_and_gyro_only.csv",index=False)

# TSFresh with 50% Overlap:

### for EMG Dataset: WS=200
### for Non-EMG Dataset: WS=55

### Used Dataset:imu-dataset-original

In [None]:
## Function to include time and id columns in the data:
def id_time(df,step,drop_partial=False):
    """Cut a recording into 50%-overlapping windows and tag every row with `id` and `time`.

    Windows start every ``step // 2`` rows and are ``step`` rows long. Each
    window receives its own consecutive ``id`` (0, 1, 2, ...) and a ``time``
    counter running from 0 inside the window, which is the long format that
    ``funct_tsfresh`` passes to tsfresh (``column_id="id"``, ``column_sort="time"``).

    Ids are assigned per window. Windows at the end of the recording that are
    shorter than ``step`` therefore keep their own id instead of being merged
    with (or split across) neighbouring windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Recording, one row per sample. It is not modified.
    step : int
        Window length in rows; must be at least 2 so that the stride is non-zero.
    drop_partial : bool, default False
        When True, windows shorter than ``step`` (at the end of the recording)
        are left out.

    Returns
    -------
    pandas.DataFrame
        The windowed rows with a fresh 0..n-1 index, every column whose name
        contains "timestamp" removed, and the integer columns "id" and "time"
        appended.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 2.
    """
    stride=step//2
    if stride<1:
        raise ValueError("step must be at least 2 so that the 50% stride (step // 2) is non-zero")

    # Removing unwanted columns on a copy, so the caller's frame keeps its timestamps.
    data=df.drop(columns=[c for c in df.columns if "timestamp" in c])
    n_rows=len(data)

    windows=[]
    for start in range(0,n_rows,stride):
        stop=min(start+step,n_rows)
        if drop_partial and stop-start<step:
            # Starts only grow, so every later window would be partial as well.
            break
        windows.append(data.iloc[start:stop].assign(id=len(windows),time=np.arange(stop-start)))

    if not windows:
        # Empty recording (or nothing but partial windows): keep the schema, return no rows.
        return data.iloc[0:0].assign(id=np.array([],dtype=int),time=np.array([],dtype=int))

    return pd.concat(windows,ignore_index=True)

## Function to extract time series features from the data:
def funct_tsfresh(df):
    """Run tsfresh feature extraction on a windowed table and fill the NaNs.

    ``df`` must carry the "id" and "time" columns added by ``id_time``; they
    are passed to tsfresh as ``column_id`` and ``column_sort``. The extracted
    feature table is handed to tsfresh's ``impute`` and that result is returned.
    """
    return impute(extract_features(df, column_sort="time", column_id="id"))

In [None]:
## Function to complete the preprocessing (combining files) and feature extraction process
def funct_preprocessing(l,f,fn):
    """Combine one letter's raw files, extract tsfresh features and append them.

    This definition replaces the earlier ``funct_preprocessing`` of the same
    name. Files inside ``os.path.join(dir, l)`` are visited in sorted order and
    routed by substring of the file name: "emg", "acc", "gyro", and for names
    containing "orientation" either the Euler table (name also contains
    "Euler") or the orientation table. The sensor tables are renamed and
    placed side by side, both tables are windowed with ``id_time`` (200 rows
    for EMG, 55 rows for the rest) and turned into features by ``funct_tsfresh``.

    Parameters
    ----------
    l : str
        Name of the letter sub-folder; also written into the "label" column.
    f, fn : pandas.DataFrame
        EMG and non-EMG feature tables accumulated so far.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(f, fn)`` with this letter's feature rows appended.
    """
    folder=os.path.join(dir,l)

    emg=pd.DataFrame(columns=['timestamp', 'emg1', 'emg2', 'emg3', 'emg4', 'emg5', 'emg6', 'emg7',
       'emg8'])
    acc=pd.DataFrame()
    gyro=pd.DataFrame()
    euler=pd.DataFrame()
    ori=pd.DataFrame()

    for name in sorted(os.listdir(folder)):
        path=os.path.join(folder,name)

        if "emg" in name:
            emg=pd.concat([emg,pd.read_csv(path)],axis=0)
        elif "acc" in name:
            acc=pd.concat([acc,pd.read_csv(path)])
        elif "gyro" in name:
            gyro=pd.concat([gyro,pd.read_csv(path)])
        elif "orientation" in name:
            # An Euler file is only recognised when its name also contains "orientation".
            if "Euler" in name:
                euler=pd.concat([euler,pd.read_csv(path)])
            else:
                ori=pd.concat([ori,pd.read_csv(path)])

    #### accelerometer,gyro,orientation have same feature names. Hence changing the column names
    acc.columns=["timestamp","x_acc","y_acc","z_acc"]
    acc=acc.drop(columns="timestamp")
    gyro.columns=["timestamp","x_gyro","y_gyro","z_gyro"]
    gyro=gyro.drop(columns="timestamp")
    ori.columns=["timestamp","x_ori","y_ori","z_ori","w"]
    ori=ori.drop(columns="timestamp")

    # The Euler table keeps its original headers; id_time removes any remaining "timestamp" column.
    nemg=pd.concat([acc,gyro,ori,euler],axis=1)

    emg=emg.reset_index(drop=True)
    nemg=nemg.reset_index(drop=True)

    # Cast the eight EMG channels to float before feature extraction.
    emg=emg.astype({f"emg{channel}":float for channel in range(1,9)})

    ###Now comes the TSFresh Part(Unsupervised)
    emg=funct_tsfresh(id_time(emg,200))
    nemg=funct_tsfresh(id_time(nemg,55))

    emg["label"]=l
    nemg["label"]=l

    return pd.concat([f,emg],axis=0),pd.concat([fn,nemg],axis=0)

In [None]:
## To cross-validate the code:
#fn.groupby(["id"]).agg("count")  

In [None]:
## To save time:
## The alphabet is processed three letters per run; `start` is the index of the first letter (0 = "A").
start=18   #### custom value - change it for every run, e.g. 18 processes letters 18-20
end=start+3
f=pd.DataFrame()
fn=pd.DataFrame()
for i in list(string.ascii_uppercase)[start:end]:
    f,fn=funct_preprocessing(i,f,fn)

In [None]:
# Save this run's feature tables; the file names carry the start/end indices of the processed slice.
f.to_csv(f"IMU_EMG_AfterTSFresh_{start}-{end}.csv",index=False)
fn.to_csv(f"IMU_NonEMG_AfterTSFresh_{start}-{end}.csv",index=False)

# Feature Selection:

There are various Feature selection methods that can be used:
1. ANOVA - FILTER Method of FS
2. CHI2 - FILTER Method of FS
3. RFE - WRAPPER Method of FS
4. Mutual Info - FILTER Method of FS

There are other methods of FS like:
1. Feature Importance of individual models - EMBEDDED Method of FS
2. Pearson's Correlation
3. Spearman's Rank Correlation
4. PCA - dimensionality reduction (it builds new components instead of selecting existing features)

## For Combining individual zip files:
## Used Dataset:after-tsfresh-imu

In [None]:
## Combining all the files:
# Root folder holding the per-run tsfresh outputs.
# NOTE: this rebinds the global `dir` that funct_preprocessing also reads, so the
# preprocessing cells would look in this folder if they were re-run afterwards.
dir="../input/after-tsfresh-imu"

In [None]:
# Walk every sub-folder of `dir` once and collect the per-run feature files.
# Names are sorted so the row order of the combined tables is the same on every run
# (os.listdir returns entries in arbitrary order), and stray files in the root are skipped.
emg_parts,nemg_parts=[],[]
for i in sorted(os.listdir(dir)):
    folder=os.path.join(dir,i)
    if not os.path.isdir(folder):
        continue
    for j in sorted(os.listdir(folder)):
        if "_EMG_" in j:
            emg_parts.append(pd.read_csv(os.path.join(folder,j)))
        if "_NonEMG_" in j:
            nemg_parts.append(pd.read_csv(os.path.join(folder,j)))

# One concat per table instead of growing a frame file by file.
emg=pd.concat(emg_parts,axis=0) if emg_parts else pd.DataFrame()
nemg=pd.concat(nemg_parts,axis=0) if nemg_parts else pd.DataFrame()
            

In [None]:
# Write the combined feature tables (index omitted).
emg.to_csv("Combined_EMG.csv",index=False)
nemg.to_csv("Combined_NEMG.csv",index=False)

## Created a dataset called imu-for-feature-selection

In [None]:
# Load the combined EMG feature table from the imu-for-feature-selection dataset.
df=pd.read_csv("../input/imu-for-feature-selection/Combined_EMG.csv")

In [None]:
# Load the combined non-EMG feature table from the same dataset.
imu=pd.read_csv("../input/imu-for-feature-selection/Combined_NEMG.csv")

In [None]:
# (rows, columns) of the non-EMG feature table.
imu.shape

In [None]:
# (rows, columns) of the EMG feature table.
df.shape

In [None]:
# Preview the first rows of the EMG feature table.
df.head()

In [None]:
def initial(df):
    """Split a feature table into predictors and an integer-encoded target.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a "label" column.

    Returns
    -------
    X : pandas.DataFrame
        All columns except "label", minus every column whose standard
        deviation is exactly zero.
    y : pandas.Series
        The labels after ``LabelEncoder.fit_transform``, with a fresh default index.
    """
    features=df.drop(columns=["label"])

    #### Label Encoding the Target Variable
    y=pd.Series(LabelEncoder().fit_transform(df["label"]))

    #### Removing Features having zero variance.
    spread=features.std()
    constant_cols=spread.index[spread==0]
    X=features.drop(columns=constant_cols)

    return X,y

In [None]:
def draw_curve(train_sizes, train_scores, test_scores):
    """Plot a learning curve: mean score with a +/- one standard deviation band.

    Parameters
    ----------
    train_sizes : array-like
        Training-set sizes on the x axis.
    train_scores, test_scores : array-like, one row per training size
        Per-fold scores; mean and standard deviation are taken along axis 1.
    """
    curves=[
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training score"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1), "g", "Cross-validation score"),
    ]

    fig, ax = plt.subplots(figsize=(10,10))
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.invert_yaxis()

    # box-like grid
    ax.grid()

    # the std deviation as a transparent range at each training set size
    for mean, std, colour, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # the average training and cross-validation score lines
    for mean, _, colour, caption in curves:
        ax.plot(train_sizes, mean, 'o-', color=colour, label=caption)

    ax.legend(loc="best")
    # NOTE(review): explicit limits in ascending order - this appears to undo the inversion requested above.
    ax.set_ylim(-.1,1.1)
    plt.show()
       

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print a classification report, draw the row-normalised confusion matrix, return accuracy.

    The function is wrapped with ``make_scorer`` in ``fun_best``, so the report
    and the heatmap are produced once per cross-validation fold.
    """
    print(classification_report(y_true, y_pred))

    # Divide every row by its total so each cell is the share of that true class.
    counts=confusion_matrix(y_true,y_pred).astype('float')
    cm=counts/counts.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(20,10))
    plt.rc("font",size=10)
    sns.heatmap(cm,annot=True,fmt=".2f",cmap="viridis")
    plt.show()

    return accuracy_score(y_true, y_pred)

def fun_best(X,y):
    """Cross-validate five classifiers on (X, y) and plot a learning curve for each.

    Every model is scored with 5-fold stratified cross-validation (shuffled,
    ``random_state=42``); the scorer prints a classification report and a
    confusion matrix per fold. A learning curve is then computed and drawn.
    A failing evaluation is retried once; a second failure is re-raised.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified; sanitised column names are applied
        to a copy.
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Model name -> mean cross-validated accuracy.
    """
    # Stripping "-" below would turn "..._value_-1" into "..._value_1"; presumably that
    # collides with an existing column, so this one is renamed first.
    X = X.rename(columns={"emg6__value_count__value_-1":"emg6__value_count__value_2"})

    #To remove JSON characters from column names because LGBM fails to execute
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    models={"XGB":XGBClassifier(),"LGBM":LGBMClassifier(),"GradientBoost":GradientBoostingClassifier(),"LDA":LinearDiscriminantAnalysis(),"RandomForest":RandomForestClassifier()}
    result={}

    for name,model in models.items():
        for attempt in (1,2):
            try:
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                score_model=cross_val_score(model,X,y,cv=cv,scoring=make_scorer(classification_report_with_accuracy_score))
                train_sizes, train_scores, test_scores = learning_curve(model, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0)
                draw_curve(train_sizes, train_scores, test_scores)
            except Exception as err:
                if attempt==2:
                    raise
                print(f"{name}: evaluation failed ({err!r}); retrying once")
            else:
                # Stored by name only after the whole evaluation succeeded, so a retry
                # can never leave a duplicate score that shifts later models.
                result[name]=score_model.mean()
                break

    return result

In [None]:
def funct_FS_bestmodel(ds,fs_name,n):
    """Select the top-n features of one dataset, save them, and compare classifiers on them.

    Parameters
    ----------
    ds : str
        Dataset tag, "EMG" or "NEMG"; picks ``Combined_{ds}.csv`` and prefixes the output file.
    fs_name : str
        Selection technique: "MI", "CHI2", "ANOVA" or "RFE". Any other value raises ``KeyError``.
    n : int
        Number of features to keep.

    Side effects
    ------------
    Writes ``{ds}_{fs_name}_{n}_features.csv`` (selected features plus a "label"
    column) and prints the mean cross-validated accuracy of every model from ``fun_best``.

    NOTE(review): the selector is fitted on the full dataset before ``fun_best``
    cross-validates, so those scores are likely optimistic.
    """
    df=pd.read_csv(f"../input/imu-for-feature-selection/Combined_{ds}.csv")

    print("Read the Dataset")

    X,y=initial(df)

    print("Passed initial function")

    # Factories, so only the requested selector (and not e.g. an unused CatBoost model) is built.
    fs_dic={"MI":lambda: SelectKBest(mutual_info_classif, k=n),
            "CHI2":lambda: SelectKBest(chi2, k=n),
            "ANOVA":lambda: SelectKBest(f_classif, k=n),
            "RFE":lambda: RFE(estimator=CatBoostClassifier(), n_features_to_select=n)}
    selector=fs_dic[fs_name]()

    ### chi2 needs non-negative inputs, so that technique is paired with MinMaxScaler.
    scaler=MinMaxScaler() if fs_name=="CHI2" else StandardScaler()
    pipe = Pipeline([('scaler', scaler),
                     ('selector', selector)])

    print("Pipeline set up")

    pipe.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = pipe.named_steps['selector'].get_support(indices=True) ### Note the format
    X_fs= X.iloc[:,cols]

    df_fs=pd.concat([X_fs,y],axis=1)
    # The unnamed target Series arrives as the integer column 0 (it only becomes the
    # string "0" after a CSV round trip), so both spellings are mapped to "label".
    df_fs=df_fs.rename(columns={0:"label","0":"label"})
    df_fs.to_csv(f"{ds}_{fs_name}_{n}_features.csv",index=False)

    print("Done Feature Selection")

    result=fun_best(X_fs,y)
    print(f"Top {n} features using {fs_name} technique:")
    print(result)

In [None]:
## To check the Corresponding Letter -> Number Encoding
## Rebuilt from `df` with the same LabelEncoder step that initial() applies, so the cell
## runs on a fresh kernel without relying on leftover `y` / `y_label` variables.
le=LabelEncoder().fit(df["label"])
target=pd.DataFrame({"label":le.classes_,"label_encoded":le.transform(le.classes_)})
target

In [None]:
### For Trial:
### Params are ("EMG" or "NEMG"/ Selection Method Name / Number of features)
# Example run: keep the 200 best EMG features by chi-squared score and compare the classifiers on them.
funct_FS_bestmodel("EMG","CHI2",200) 

# Stacked Ensemble Learning

In [None]:
!pip install autokeras
import autokeras
from autokeras import StructuredDataClassifier

In [None]:
### Dictionary for getting Accuracies:
# Global results store: funct_autokeras_accuracy writes one "<data>_<feature_count>_<max_trials>" entry per run.
accuracy={}

In [None]:
### Copied from above section:
def initial(df):
    """Split a feature table into predictors and target, encoding the target only if needed.

    This definition replaces the earlier ``initial``. Labels that are already
    numeric are returned untouched; any other dtype (for example letters) is
    integer-encoded with ``LabelEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a "label" column.

    Returns
    -------
    X : pandas.DataFrame
        All columns except "label", minus every column whose standard
        deviation is exactly zero.
    y : pandas.Series
        Numeric labels named "label", aligned with the index of ``df``.
    """
    X=df.drop(["label"],axis=1)
    y=df["label"]

    #### Label Encoding the Target Variable
    # A text column has dtype object, which never compares equal to `str`, so the
    # check is made on "is it numeric?" instead.
    if not pd.api.types.is_numeric_dtype(y):
        le=LabelEncoder()
        y=pd.Series(le.fit_transform(y),index=y.index,name="label")

    #### Removing Features having zero variance.
    Var=X[X.columns].std()
    col=Var[Var==0].index
    X=X.drop(col,axis=1)

    return X,y

In [None]:
def funct_FS(ds,fs_name,n,X,y):
    """Keep the n best features of X according to one selection technique.

    Parameters
    ----------
    ds : str
        Dataset tag, used only in the output file name.
    fs_name : str
        "MI", "CHI2", "ANOVA" or "RFE". Any other value raises ``KeyError``.
    n : int
        Number of features to keep.
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target, aligned with X.

    Returns
    -------
    pandas.DataFrame
        The selected columns of X (unscaled) plus the target in a "label"
        column. The same table is written to ``{ds}_{fs_name}_{n}_features.csv``.
    """
    # Factories, so only the requested selector (and not e.g. an unused CatBoost model) is built.
    fs_dic={"MI":lambda: SelectKBest(mutual_info_classif, k=n),
            "CHI2":lambda: SelectKBest(chi2, k=n),
            "ANOVA":lambda: SelectKBest(f_classif, k=n),
            "RFE":lambda: RFE(estimator=CatBoostClassifier(), n_features_to_select=n)}
    selector=fs_dic[fs_name]()

    ### chi2 needs non-negative inputs, so that technique is paired with MinMaxScaler.
    scaler=MinMaxScaler() if fs_name=="CHI2" else StandardScaler()
    pipe = Pipeline([('scaler', scaler),
                     ('selector', selector)])
    print("Pipeline set up")

    pipe.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = pipe.named_steps['selector'].get_support(indices=True) ### Note the format
    X_fs= X.iloc[:,cols]

    df_fs=pd.concat([X_fs,y],axis=1)
    # An unnamed target Series arrives as the integer column 0 (it only becomes the
    # string "0" after a CSV round trip), so both spellings are mapped to "label".
    df_fs=df_fs.rename(columns={0:"label","0":"label"})
    df_fs.to_csv(f"{ds}_{fs_name}_{n}_features.csv",index=False)
    return df_fs
In [None]:
def funct_autokeras_accuracy(X_train, X_test, y_train, y_test,max_trials,data,feature_count):
    """Run one AutoKeras search, record its test accuracy and save the best model.

    The accuracy, rounded to three decimals, is stored in the global
    ``accuracy`` dict under ``"{data}_{feature_count}_{max_trials}"``; the
    exported model is saved under the same name.
    """
    run_name=f"{data}_{feature_count}_{max_trials}"

    # search for the best architecture within the trial budget
    search = StructuredDataClassifier(max_trials=max_trials)
    search.fit(x=X_train, y=y_train, verbose=0)

    # score it on the held-out rows
    _, acc = search.evaluate(X_test, y_test, verbose=0)
    accuracy[run_name]=round(acc, 3)

    # export the best performing model and save it
    # (Autokeras Model fails to save as h5 file, so the TensorFlow format is tried first)
    best = search.export_model()
    try:
        best.save(run_name, save_format="tf")
    except Exception:
        best.save(f"{run_name}.h5")

Function to find the best model using the AutoKeras library.

Parameter description:

- `feature_count` - number of features to be selected
- `max_trials` - number of trials for the AutoKeras search
- `data` - EMG/NEMG

In [None]:
def funct_autokeras(feature_count,max_trials,data,X,y):
    """Optionally select features, then split, scale and run one AutoKeras search.

    Parameters
    ----------
    feature_count : int, or None / "None" / "Null"
        Number of ANOVA-selected features to keep. ``None`` or either of the two
        strings skips feature selection and uses X as given (the "Both" / "IMU"
        runs pass "None" or "Null").
    max_trials : int
        Trial budget of the AutoKeras search.
    data : str
        Dataset tag ("EMG", "NEMG", "Both", ...); used in file names and result keys.
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Target.

    The data are split 70/30 (stratified, ``random_state=1``), the scaler is
    fitted on the training part only, and the result is recorded by
    ``funct_autokeras_accuracy``.
    """
    if feature_count not in (None,"None","Null"):
        df_fs=funct_FS(data,"ANOVA",feature_count,X,y)  ### (ds,fs_name,n,X,y)
        # The target column may come back as 0, "0" or already "label".
        df_fs=df_fs.rename(columns={0:"label","0":"label"})
        X=df_fs.drop(["label"],axis=1)
        y=df_fs["label"]

    ### Splitting the data:
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state=1)

    ### Scaling the Data:
    sc=StandardScaler()
    X_train_scaled=sc.fit_transform(X_train)
    X_test_scaled=sc.transform(X_test)

    funct_autokeras_accuracy(X_train_scaled, X_test_scaled, y_train, y_test,max_trials,data,feature_count)

In [None]:
#Debugging
#df_fs=funct_FS("EMG","ANOVA",100,X,y)

In [None]:
# Source table for the AutoKeras runs; swap the file name to switch between the EMG and NEMG features.
df_ori=pd.read_csv("../input/imu-for-feature-selection/Combined_EMG.csv")  ### NEMG/EMG

In [None]:
# Predictors and target, computed once and reused by every run below.
X,y=initial(df_ori)   ### Remains Fixed

In [None]:
### Iterating through various values:
# Grid over the number of selected features (100, 200, 300) and the AutoKeras trial budget (16-20).
for feature_count in range(100,301,100):
    for max_trials in range(16,21):
        funct_autokeras(feature_count,max_trials,"EMG",X,y)
    print(f"Done for {feature_count}!")

In [None]:
#### To store the results in the form of json that is prettified:
# Write the collected accuracies as indented JSON.
with open('accuracy_results.json', 'w') as fp:
    json.dump(accuracy, fp,  indent=4)

In [8]:
### For the best model using EMG and NEMG Features
df_emg=pd.read_csv("../input/imu-top-200-features/EMG_ANOVA_200_features.csv")
df_nemg=pd.read_csv("../input/imu-top-200-features/NEMG_ANOVA_200_features.csv")
# The target was saved under the header "0"; give it its name back.
df_emg.rename({"0":"label"},axis=1,inplace=True)
df_nemg.rename({"0":"label"},axis=1,inplace=True)

## Combining EMG and IMU files:
## For every class the EMG and IMU windows are paired by position. Both sides are cut to
## the shorter of the two, so no row is left with one half of its features missing (NaN).
pairs=[]
for i in range(26):
    n=df_nemg[df_nemg["label"]==i].reset_index(drop=True)
    e=df_emg[df_emg["label"]==i].reset_index(drop=True).drop(["label"],axis=1)
    rows=min(len(e),len(n))
    k=pd.concat([e.iloc[:rows],n.iloc[:rows]],axis=1)
    pairs.append(k)
# One concat at the end instead of growing `temp` inside the loop.
temp=pd.concat(pairs,axis=0)

In [None]:
## Trial 1:
# Fresh results dict for this experiment (rebinds the global that funct_autokeras_accuracy fills).
accuracy={}
X,y=initial(temp)   ### Remains Fixed
### Iterating through various values:
# "None" as feature_count skips feature selection in funct_autokeras; the combined table is used as is.
for max_trials in list(range(16,25)):  ### Trial count for autokeras
    funct_autokeras("None",max_trials,"Both",X,y)  
    print(f"Done for {max_trials}!")
    
#### To store the results in the form of json that is prettified:
# NOTE: same file name as the EMG grid above, so that file is overwritten when both cells run in one folder.
with open('accuracy_results.json', 'w') as fp:
    json.dump(accuracy, fp,  indent=4)

### End Result:
# {
#     "Both_Null_16": 0.953,
#     "Both_Null_17": 0.943,
#     "Both_Null_18": 0.949,
#     "Both_Null_19": 0.953,
#     "Both_Null_20": 0.955,
#     "Both_Null_21": 0.949,
#     "Both_Null_22": 0.954,
#     "Both_Null_23": 0.951,
#     "Both_Null_24": 0.955
# }

In [None]:
## Trial 2:
# Fresh results dict for this experiment (rebinds the global that funct_autokeras_accuracy fills).
accuracy={}
X,y=initial(temp)   ### Remains Fixed
### Iterating through various values:
# Wider trial budgets (20, 25, ..., 40); the feature_count argument "Null" ends up in the result keys.
for max_trials in list(range(20,41,5)):  ### Trial count for autokeras
    funct_autokeras("Null",max_trials,"Both",X,y)  
    print(f"Done for {max_trials}!")
    
#### To store the results in the form of json that is prettified:
with open('400 Features Combined 5-40.json', 'w') as fp:
    json.dump(accuracy, fp,  indent=4)

### End Result:
# {
#     "Both_Null_20": 0.075,
#     "Both_Null_25": 0.913,
#     "Both_Null_30": 0.932,
#     "Both_Null_35": 0.948,
#     "Both_Null_40": 0.276
# }

In [None]:
## IMU Best Model Try:
# Fresh results dict for this experiment (rebinds the global that funct_autokeras_accuracy fills).
accuracy={}
# Only the selected non-EMG (IMU) features are used here.
X,y=initial(df_nemg)   ### Remains Fixed
### Iterating through various values:
# Trial budgets 10-29; the feature_count argument "Null" ends up in the result keys.
for max_trials in range(10,30):  ### Trial count for autokeras
    funct_autokeras("Null",max_trials,"IMU",X,y)  
    print(f"Done for {max_trials}!")
    
#### To store the results in the form of json that is prettified:
with open('accuracy_results_IMU.json', 'w') as fp:
    json.dump(accuracy, fp,  indent=4)

### End Result:
# {
#     "IMU_Null_10": 0.695,
#     "IMU_Null_11": 0.68,
#     "IMU_Null_12": 0.684,
#     "IMU_Null_13": 0.702,
#     "IMU_Null_14": 0.718,
#     "IMU_Null_15": 0.706,
#     "IMU_Null_16": 0.75,
#     "IMU_Null_17": 0.702,
#     "IMU_Null_18": 0.712,
#     "IMU_Null_19": 0.72,
#     "IMU_Null_20": 0.716,
#     "IMU_Null_21": 0.721,
#     "IMU_Null_22": 0.7,
#     "IMU_Null_23": 0.712,
#     "IMU_Null_24": 0.709,
#     "IMU_Null_25": 0.707,
#     "IMU_Null_26": 0.732,
#     "IMU_Null_27": 0.692,
#     "IMU_Null_28": 0.711,
#     "IMU_Null_29": 0.703
# }

# Loading the Best Base Learner provided by Autokeras:

In [None]:
## Loading the best saved AutoKeras model (presumably the max_trials=24 run of Trial 1 - TODO confirm)
# ak.CUSTOM_OBJECTS lets Keras rebuild the AutoKeras-specific layers stored in the model.
base_model = load_model("Trial 1/Base Model", custom_objects=ak.CUSTOM_OBJECTS)

## To get the Best Model summary(Details about its architecture):
base_model.summary()

In [None]:
# Model: "model"
# _________________________________________________________________
#  Layer (type)                Output Shape              Param #   
# =================================================================
#  input_1 (InputLayer)        [(None, 400)]             0         
                                                                 
#  multi_category_encoding (Mu  (None, 400)              0         
#  ltiCategoryEncoding)                                            
                                                                 
#  normalization (Normalizatio  (None, 400)              801       
#  n)                                                              
                                                                 
#  dense (Dense)               (None, 32)                12832     
                                                                 
#  re_lu (ReLU)                (None, 32)                0         
                                                                 
#  dense_1 (Dense)             (None, 128)               4224      
                                                                 
#  re_lu_1 (ReLU)              (None, 128)               0         
                                                                 
#  dense_2 (Dense)             (None, 26)                3354      
                                                                 
#  classification_head_1 (Soft  (None, 26)               0         
#  max)                                                            
                                                                 
# =================================================================
# Total params: 21,211
# Trainable params: 20,410
# Non-trainable params: 801
# _________________________________________________________________

In [None]:
### Copied from above section:
def initial(df):
    """Prepare a feature table for training: encode, filter, split and scale.

    This definition replaces the earlier ``initial`` and returns four values
    instead of two. Labels that are already numeric are used as they are; any
    other dtype is integer-encoded with ``LabelEncoder``. Columns whose standard
    deviation is exactly zero are removed, the rows are split 70/30
    (stratified, ``random_state=1``) and the scaler is fitted on the training
    part only.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with a "label" column.

    Returns
    -------
    X_train_scaled, X_test_scaled : numpy.ndarray
        Standardised predictors.
    y_train, y_test : pandas.Series
        Matching targets.
    """
    X=df.drop(["label"],axis=1)
    y=df["label"]

    #### Label Encoding the Target Variable
    # A text column has dtype object, which never compares equal to `str`, so the
    # check is made on "is it numeric?" instead.
    if not pd.api.types.is_numeric_dtype(y):
        le=LabelEncoder()
        y=pd.Series(le.fit_transform(y),index=y.index,name="label")

    #### Removing Features having zero variance.
    Var=X[X.columns].std()
    col=Var[Var==0].index
    X=X.drop(col,axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.3, random_state=1)

    ### Scaling the Data:
    sc=StandardScaler()
    X_train_scaled=sc.fit_transform(X_train)
    X_test_scaled=sc.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test   ## Changed the return values

In [None]:
### For the best model using EMG and NEMG Features
df_emg=pd.read_csv("EMG_ANOVA_200_features.csv")
df_nemg=pd.read_csv("NEMG_ANOVA_200_features.csv")
# The target was saved under the header "0"; give it its name back.
df_emg.rename({"0":"label"},axis=1,inplace=True)
df_nemg.rename({"0":"label"},axis=1,inplace=True)

## For every class the EMG and IMU windows are paired by position. Both sides are cut to
## the shorter of the two, so no row is left with one half of its features missing (NaN).
pairs=[]
for i in range(26):
    n=df_nemg[df_nemg["label"]==i].reset_index(drop=True)
    e=df_emg[df_emg["label"]==i].reset_index(drop=True).drop(["label"],axis=1)
    rows=min(len(e),len(n))
    k=pd.concat([e.iloc[:rows],n.iloc[:rows]],axis=1)
    pairs.append(k)
# One concat at the end instead of growing `temp` inside the loop.
temp=pd.concat(pairs,axis=0)

# Train/test split and scaling, done once and shared by every model below.
X_train_scaled, X_test_scaled, y_train, y_test=initial(temp)   ### Remains Fixed

In [None]:
### Building the model above from scratch in TensorFlow:
class Model:
    """Dense base learner with the same layout as the AutoKeras model, but wider hidden layers.

    Architecture: input -> Dense(l1, relu) -> Dense(l2, relu) -> Dense(n_classes, softmax),
    compiled with Adam and sparse categorical cross-entropy.

    Parameters
    ----------
    number : int
        Identifier used in the saved file name (``model_{number}.h5``).
    l1, l2 : int
        Units of the first and second hidden layer.
    n_features : int, default 400
        Width of the input; must match the number of columns of the training data.
    n_classes : int, default 26
        Number of output classes.
    """
    def __init__(self,number,l1,l2,n_features=400,n_classes=26):
        self.l1=l1
        self.l2=l2
        self.number=number
        self.model = tf.keras.Sequential()
        # The shape has to be a tuple: "(400)" without the comma is just the integer 400.
        self.model.add(tf.keras.layers.InputLayer(input_shape=(n_features,)))
        #model.add(tf.keras.layers.Normalization(axis=-1))
        self.model.add(tf.keras.layers.Dense(units=self.l1,activation="relu"))
        self.model.add(tf.keras.layers.Dense(units=self.l2,activation="relu"))
        self.model.add(tf.keras.layers.Dense(units=n_classes,activation="softmax"))
        self.model.compile(optimizer="adam",loss=tf.keras.losses.SparseCategoricalCrossentropy(),metrics=["accuracy"])

    def funct_fit(self):
        """Train on the global ``X_train_scaled`` / ``y_train`` and save the model as HDF5.

        Runs 50 epochs with batch size 50 and writes
        ``<cwd>/Trial 1/model_{number}.h5``, creating the folder when it is missing.
        """
        self.model.fit(X_train_scaled,y_train,epochs=50,batch_size=50)
        os.makedirs(f"{os.getcwd()}/Trial 1",exist_ok=True)
        self.model.save(f"{os.getcwd()}/Trial 1/model_{self.number}.h5")

In [None]:
### Constructing the other three base models:
### Original Model had (l1,l2)=(32,128)
# Hidden-layer sizes of the three extra base learners; each is trained and saved as model_<number>.h5.
for number,(l1,l2) in enumerate([(64,256),(96,384),(128,512)],start=1):
    model=Model(number,l1,l2)
    model.funct_fit()

In [None]:
# Ensemble members, starting with the AutoKeras base model.
allmodels=[base_model]

#### Loading other models:
def funct_load(number):
    """Load ``Trial 1/model_1.h5`` ... ``model_{number}.h5`` and append them to the global ``allmodels``."""
    allmodels.extend(load_model(f'Trial 1/model_{i}.h5') for i in range(1,number+1))

funct_load(3)
allmodels

In [None]:
# [<keras.engine.functional.Functional at 0x7fcf70f15eb8>,
#  <keras.engine.sequential.Sequential at 0x7fcf5a26e550>,
#  <keras.engine.sequential.Sequential at 0x7fcf58844b38>,
#  <keras.engine.sequential.Sequential at 0x7fcf587b8b70>]

In [None]:
# create stacked model input dataset as outputs from the ensemble
def stacked_dataset(members, X_test_scaled):
    """Build the meta-learner input from the predictions of the ensemble members.

    Parameters
    ----------
    members : sequence
        Fitted models exposing ``predict(X, verbose=0)`` that return an array of
        shape (rows, classes).
    X_test_scaled : array-like
        Rows to predict on.

    Returns
    -------
    numpy.ndarray
        Shape (rows, classes * members). Within a row the values are ordered
        class by class, and inside each class member by member.

    Raises
    ------
    ValueError
        If ``members`` is empty.
    """
    # Use the members that were passed in, not whatever a global list happens to contain.
    predictions=[model.predict(X_test_scaled, verbose=0) for model in members]
    if not predictions:
        raise ValueError("stacked_dataset needs at least one ensemble member")

    # stack predictions into [rows, class, members]; also works for a single member
    stackX=np.dstack(predictions)
    # flatten predictions to [rows, class x members]
    return stackX.reshape((stackX.shape[0], stackX.shape[1]*stackX.shape[2]))

In [None]:
# fit a model based on the outputs from the ensemble members
def fit_stacked_model(members, X_test_scaled, y_test,model_name):
    """Fit the meta-learner on the ensemble's predictions and return it.

    ``model_name`` is an unfitted estimator with a ``fit(X, y)`` method; it is
    trained on ``stacked_dataset(members, X_test_scaled)`` against ``y_test``
    and the same object is returned.
    """
    meta_features = stacked_dataset(members, X_test_scaled)
    model_name.fit(meta_features, y_test)
    return model_name
 
# make a prediction with the stacked model
def stacked_prediction(members, model, X_test_scaled):
    """Predict with the fitted meta-learner ``model`` on the ensemble's predictions for the given rows."""
    return model.predict(stacked_dataset(members, X_test_scaled))

In [None]:
## METALEARNER USED IS LOGISTIC REGRESSION
# The base models were trained on the training split, so the held-out rows are divided
# once more: one half fits the meta-learner, the other half scores it. Fitting and
# scoring on the same rows would report an inflated accuracy.
X_meta_fit, X_meta_eval, y_meta_fit, y_meta_eval = train_test_split(
    X_test_scaled, y_test, stratify=y_test, test_size=0.5, random_state=1)
# fit stacked model using the ensemble
model = fit_stacked_model(allmodels, X_meta_fit, y_meta_fit,LogisticRegression())
# evaluate model on rows the meta-learner has not seen
yhat = stacked_prediction(allmodels, model, X_meta_eval)
acc = accuracy_score(y_meta_eval, yhat)
print('Stacked Test Accuracy: %.3f' % acc)

# Previously recorded with the meta-learner fitted and scored on the same rows:
# Stacked Test Accuracy: 0.995

In [None]:
## METALEARNER USED IS XGBOOST CLASSIFIER
# The base models were trained on the training split, so the held-out rows are divided
# once more: one half fits the meta-learner, the other half scores it. Fitting and
# scoring on the same rows would report an inflated accuracy.
X_meta_fit, X_meta_eval, y_meta_fit, y_meta_eval = train_test_split(
    X_test_scaled, y_test, stratify=y_test, test_size=0.5, random_state=1)
# fit stacked model using the ensemble
model = fit_stacked_model(allmodels, X_meta_fit, y_meta_fit,XGBClassifier())
# evaluate model on rows the meta-learner has not seen
yhat = stacked_prediction(allmodels, model, X_meta_eval)
acc = accuracy_score(y_meta_eval, yhat)
print('Stacked Test Accuracy: %.3f' % acc)

# Previously recorded with the meta-learner fitted and scored on the same rows:
# Stacked Test Accuracy: 1.000