# Feature Selection:

There are various Feature selection methods that can be used:
1. ANOVA - FILTER Method of FS
2. CHI2 - FILTER Method of FS
3. RFE - WRAPPER Method of FS
4. Mutual Info - FILTER Method of FS

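There are other methods of FS, such as:
1. Feature Importance of individual models - EMBEDDED Method of FS
2. Pearson's Correlation - FILTER Method of FS
3. Spearman's Rank Correlation - FILTER Method of FS
4. PCA - strictly a dimensionality-reduction (feature-extraction) technique rather than a FILTER Method of FS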

## Combining the individual zip files
## Dataset used: after-tsfresh-imu

In [None]:
## Combining all the files:
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# other cells may rely on it.
dir="../input/after-tsfresh-imu"


def _combine_csvs(root, marker):
    """Concatenate every CSV under ``root/<subdir>/`` whose file name contains ``marker``.

    Parameters
    ----------
    root : str
        Directory whose immediate sub-directories hold the CSV files.
    marker : str
        Substring a file name must contain to be included.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the matching files, or an empty DataFrame
        when no file matches.
    """
    frames = [
        pd.read_csv(os.path.join(root, sub, fname))
        for sub in os.listdir(root)
        for fname in os.listdir(os.path.join(root, sub))
        if marker in fname
    ]
    # Concatenate once: growing a DataFrame inside the loop re-copies every
    # previously loaded row on each iteration.
    return pd.concat(frames, axis=0) if frames else pd.DataFrame()


### Combining all the EMG Files
emg = _combine_csvs(dir, "_EMG_")

### Combining all the Non-EMG Files
nemg = _combine_csvs(dir, "_NonEMG_")

emg.to_csv("Combined_EMG.csv", index=False)
nemg.to_csv("Combined_NEMG.csv", index=False)

## The combined CSV files were uploaded as a dataset called imu-for-feature-selection

In [None]:
# Load the two combined feature tables: `df` receives the EMG file and
# `imu` receives the Non-EMG file.
df, imu = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/imu-for-feature-selection/Combined_EMG.csv",
        "../input/imu-for-feature-selection/Combined_NEMG.csv",
    )
)

def initial(df):
    """Split a feature table into predictors and target, dropping constant features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``label`` column plus the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        The feature columns, minus every column whose standard deviation is 0.
    y : pandas.Series
        The ``label`` column. Non-numeric labels are replaced by integer codes
        ``0..n_classes-1`` assigned in sorted order of the class values (the
        same ordering scikit-learn's ``LabelEncoder`` produces); numeric
        labels are returned unchanged. The Series keeps the index of ``df``
        and the name ``"label"``.

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.
    """
    X = df.drop(columns=["label"])
    y = df["label"]

    #### Label Encoding the Target Variable
    # pandas stores text labels as `object` (or a string extension dtype),
    # which never compares equal to the builtin `str`; testing for
    # "not numeric" is what actually detects labels that need encoding.
    if not pd.api.types.is_numeric_dtype(y):
        codes, _ = pd.factorize(y, sort=True)
        # Keep the original index and name so that `y` still lines up with
        # `X` and is still called "label" when joined back to the features.
        y = pd.Series(codes, index=y.index, name="label")

    #### Removing Features having zero variance.
    spread = X.std()
    constant_cols = spread[spread == 0].index
    X = X.drop(columns=constant_cols)

    return X, y

In [None]:
def draw_curve(train_sizes, train_scores, test_scores):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    train_sizes : array-like
        x-coordinates of the curve (number of training examples).
    train_scores, test_scores : array-like, 2-D
        Scores that are averaged along ``axis=1``; presumably laid out as
        (n_sizes, n_folds) the way ``learning_curve`` returns them - confirm
        against the caller.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10,10))
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # No invert_yaxis() here: the explicit ylim(-.1, 1.1) below sets
    # bottom < top and therefore fixes the axis orientation on its own, so an
    # earlier inversion would have no visible effect.

    # box-like grid
    plt.grid()

    # plot the std deviation as a transparent band at each training set size
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")

    # plot the average training and cross-validation score at each training set size
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    # fix the score axis to [-0.1, 1.1] so curves of different models are
    # directly comparable, then display the plot
    plt.legend(loc="best")
    plt.ylim(-.1,1.1)
    plt.show()
       

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print a classification report, show a normalised confusion matrix, return accuracy.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The value of ``accuracy_score(y_true, y_pred)``.

    Side effects: prints the report, sets the global matplotlib font size to
    10 via ``plt.rc`` and displays a heat-map figure.
    """
    print(classification_report(y_true, y_pred)) # print classification report

    counts = confusion_matrix(y_true, y_pred)
    # Normalise each row (true class) to fractions for better visualisation.
    # A class that occurs only in the predictions has an all-zero row; divide
    # only where the row total is non-zero so such rows show 0.00 instead of
    # NaN (and no divide-by-zero warning is raised).
    row_totals = counts.sum(axis=1, keepdims=True)
    normalised = np.divide(
        counts,
        row_totals,
        out=np.zeros(counts.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(20,10))
    plt.rc("font",size=10)
    sns.heatmap(normalised,annot=True,fmt=".2f",cmap="viridis")
    plt.show()
    return accuracy_score(y_true, y_pred) # return accuracy score

def _score_and_plot(model, X, y):
    """Cross-validate ``model`` on ``(X, y)``, draw its learning curve, return the mean fold score."""
    # An integer random_state makes every split() call reproduce the same
    # folds, so one splitter can serve both evaluations.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=make_scorer(classification_report_with_accuracy_score))
    train_sizes, train_scores, test_scores = learning_curve(model, X, y, n_jobs=-1, cv=cv, train_sizes=np.linspace(.1, 1.0, 5), verbose=0)
    draw_curve(train_sizes, train_scores, test_scores)
    return fold_scores.mean()


def fun_best(X,y):
    """Evaluate a fixed set of classifiers on ``(X, y)`` with 5-fold stratified CV.

    For every model the per-fold classification report and confusion matrix
    are shown (through the scorer), a learning curve is drawn, and the mean of
    the fold accuracies is recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is not modified; renaming happens on a copy.
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Maps model name ("XGB", "LGBM", "GradientBoost", "LDA",
        "RandomForest") to its mean cross-validation accuracy.
    """
    # Rename on a copy instead of in place so the caller's frame (possibly a
    # slice of a larger frame) is left untouched.
    # Presumably this column is renamed first so that stripping "-" below does
    # not turn it into a duplicate of an existing "..._1" column - TODO confirm.
    X = X.rename(columns={"emg6__value_count__value_-1":"emg6__value_count__value_2"})

    #To remove JSON characters from column names because LGBM fails to execute
    X = X.rename(columns=lambda name: re.sub(r'[^A-Za-z0-9_]+', '', name))

    models={"XGB":XGBClassifier(),"LGBM":LGBMClassifier(),"GradientBoost":GradientBoostingClassifier(),"LDA":LinearDiscriminantAnalysis(),"RandomForest":RandomForestClassifier()}

    # Store each score under its model name as soon as it is known; a
    # positional list could fall out of step with the names after a failure.
    result = {}
    for name, model in models.items():
        try:
            result[name] = _score_and_plot(model, X, y)
        except Exception:
            # NOTE(review): the evaluation has always been retried once after
            # any failure; that best-effort retry is kept. A second failure
            # propagates to the caller.
            result[name] = _score_and_plot(model, X, y)

    return result

In [None]:
### Function for Implementing Various Feature Selection Methods
def funct_FS_bestmodel(ds,fs_name,n):
    """Select ``n`` features with one selection method, save them, and benchmark models.

    Parameters
    ----------
    ds : str
        Dataset tag inserted into the input file name
        ``Combined_{ds}.csv`` (e.g. "EMG" or "NEMG").
    fs_name : str
        Selection method: "MI", "CHI2", "ANOVA" or "RFE".
    n : int
        Number of features to keep.

    Returns
    -------
    dict
        The model-name -> mean CV score mapping produced by ``fun_best`` on
        the selected features (it is also printed).

    Raises
    ------
    KeyError
        If ``fs_name`` is not one of the supported method names.

    Side effects: writes ``{ds}_{fs_name}_{n}_features.csv`` (selected
    features plus a ``label`` column) to the working directory and prints
    progress messages.
    """
    # Factories rather than instances: only the requested selector is built,
    # so e.g. a CatBoost estimator is created only when RFE is asked for.
    selector_factories = {
        "MI": lambda: SelectKBest(mutual_info_classif, k=n),
        "CHI2": lambda: SelectKBest(chi2, k=n),
        "ANOVA": lambda: SelectKBest(f_classif, k=n),
        "RFE": lambda: RFE(estimator=CatBoostClassifier(), n_features_to_select=n),
    }
    if fs_name not in selector_factories:
        # Fail before the (potentially slow) CSV load.
        raise KeyError(f"Unknown feature selection method {fs_name!r}; expected one of {sorted(selector_factories)}")

    df=pd.read_csv(f"../input/imu-for-feature-selection/Combined_{ds}.csv")

    print("Read the Dataset")

    X,y=initial(df)

    print("Passed initial function")

    ### chi2 needs non-negative inputs, so the features are min-max scaled
    ### for it; the other methods use standardisation.
    scaler = MinMaxScaler() if fs_name=="CHI2" else StandardScaler()
    pipe = Pipeline([('scaler', scaler),
                     ('selector', selector_factories[fs_name]())])

    print("Pipeline set up")

    pipe.fit(X, y)
    # Get columns to keep and create new dataframe with those only
    cols = pipe.named_steps['selector'].get_support(indices=True) ### integer positions, not names
    # .copy() so later renames/assignments never act on a view of X.
    X_fs = X.iloc[:,cols].copy()

    # Attach the target positionally under an explicit "label" name. This
    # works whether or not `y` carries a name or shares X's index, which a
    # concat-and-rename on the string "0" would not guarantee.
    df_fs = X_fs.assign(label=np.asarray(y))
    df_fs.to_csv(f"{ds}_{fs_name}_{n}_features.csv",index=False)

    print("Done Feature Selection")

    result=fun_best(X_fs,y)
    print(f"Top {n} features using {fs_name} technique:")
    print(result)
    return result

In [None]:
## To check the Corresponding Letter -> Number Encoding
# NOTE(review): neither `y` nor `y_label` is defined at module level in the
# visible cells (`y` exists only as a local inside the functions above).
# Presumably `y` is the original label Series and `y_label` its encoded array
# - confirm they are defined before running this cell.
# Put the two as side-by-side columns, one row per sample.
target=pd.DataFrame(np.hstack([y.values.reshape((-1,1)),y_label.reshape((-1,1))]),columns=["label","label_encoded"])
# Show one row per distinct (label, label_encoded) pair, ordered by label.
target.drop_duplicates().sort_values("label").reset_index(drop=True)

In [None]:
### Trial run.
### Arguments: dataset tag ("EMG" or "NEMG"), selection method name, number of features to keep.
funct_FS_bestmodel(ds="EMG", fs_name="CHI2", n=200)