# Objectives

1. This lecture gives students hands-on practice with repeated hold-out cross-validation and introduces feature selection.
2. Study the Medium articles linked below.

# Define a problem statement and goals

# [Understand the dataset and features](https://archive.ics.uci.edu/ml/datasets/hepatitis)

## Features
   
     1. Class: DIE, LIVE
     2. AGE: 10, 20, 30, 40, 50, 60, 70, 80
     3. SEX: male, female
     4. STEROID: no, yes
     5. ANTIVIRALS: no, yes
     6. FATIGUE: no, yes
     7. MALAISE: no, yes
     8. ANOREXIA: no, yes
     9. LIVER BIG: no, yes
    10. LIVER FIRM: no, yes
    11. SPLEEN PALPABLE: no, yes
    12. SPIDERS: no, yes
    13. ASCITES: no, yes
    14. VARICES: no, yes
    15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00
    16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250
    17. SGOT: 13, 100, 200, 300, 400, 500
    18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0
    19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90
    20. HISTOLOGY: no, yes

In [None]:
import numpy as np
import pandas as pd
import math
from scipy.stats import loguniform
from statistics import mean
from scipy import stats
import matplotlib.pyplot as plt
import warnings 



from sklearn.model_selection import StratifiedShuffleSplit

#class sklearn.model_selection.StratifiedShuffleSplit(n_splits=10, *, test_size=None, train_size=None, random_state=None)

#from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import classification_report,f1_score
from sklearn.impute import SimpleImputer 

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier,RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,StackingClassifier,VotingClassifier

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from scikit_posthocs import posthoc_nemenyi_friedman


# Suppress every Python warning from here on; note that this also hides
# deprecation and data-handling warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## Loading the dataset

In [None]:
# Load the hepatitis CSV: '?' cells are read as missing values (NaN) and ','
# is treated as a thousands separator when parsing numbers.
data = pd.read_csv('~/DATA/hepatitis.csv', thousands=',', na_values='?')

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [None]:
# The dtypes shown above contain no object column that contradicts the feature description.
# Column 0 is skipped here; every remaining column is treated as a feature.
allFeatures = data.columns[1:]
# Columns at these positions are handled as categorical features.
# NOTE(review): positions 15, 16 and 18 are included; if the CSV follows the
# column order of the feature listing above, those are ALK PHOSPHATE, SGOT and
# PROTIME, which look like numeric measurements - confirm this is intended.
catFeatures = data.columns[[*range(2, 14), *range(15, 17), *range(18, 20)]]
# Everything that is not categorical is handled as numeric.
numFeatures = [feature for feature in allFeatures if feature not in catFeatures]

In [None]:
# Show the names of the columns treated as numeric.
numFeatures

In [None]:
# Show the names of the columns treated as categorical.
catFeatures

In [None]:
# Re-type each categorical column as 'object', writing the result back into `data`.
for cat_column in catFeatures:
    as_object = data[cat_column].astype(object)
    data[cat_column] = as_object

In [None]:
# Re-check the dtypes after the categorical columns were cast to object.
data.dtypes

# Descriptive analysis

In [None]:
# Summary statistics of the numeric features.
data[numFeatures].describe()

In [None]:
# One histogram per numeric feature.
data[numFeatures].hist()

In [None]:
# Figure defaults used by the bar charts below.
plt.rcParams.update({
    "figure.figsize": [7.50, 3.50],
    "figure.autolayout": True,
})

# Draw one bar chart of value frequencies per categorical feature.
for cat in catFeatures:
    fig, ax = plt.subplots()
    frequencies = data[cat].value_counts()
    frequencies.plot(kind='bar', ax=ax, xlabel=cat, ylabel='frequency')
    plt.show()

In [None]:
# Count the samples per class to check how imbalanced the dataset is.
data['Class'].value_counts()

In [None]:
# Feature matrix: the columns at positions 1..19.
X = data.loc[:, data.columns[1:20]]
# Target vector: the first column.
y = data.loc[:, data.columns[0]]

# Developing Imputer and Standardized Scaler

In [None]:
# Imputer for categorical columns: replace NaN with the most frequent value.
imp_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# Imputer for numeric columns: replace NaN with the column mean.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
# Standardizer applied to the numeric columns after imputation.
scalor = StandardScaler()

# Developing baseline and ensemble classifiers

In [None]:
# Baseline learners: evaluated on their own and reused as the members of the
# voting and stacking ensembles defined below.
baselineClassifiers = [LogisticRegression(), SVC(), KNeighborsClassifier()]  # DecisionTreeClassifier()
nameBaselineClassifiers = ['LR', 'SVC', 'KNN']  # 'DT'

# (name, estimator) pairs, the format VotingClassifier and StackingClassifier take.
estimators = list(zip(nameBaselineClassifiers, baselineClassifiers))

vote_hard_clf = VotingClassifier(estimators=estimators, voting='hard')
rf_clf = RandomForestClassifier(max_depth=7, random_state=0)

stack_clf = StackingClassifier(estimators=estimators, final_estimator=rf_clf)
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was
# later removed); the positional form works with both spellings.
ada_clf = AdaBoostClassifier(rf_clf, n_estimators=200, random_state=0)
# random_state is fixed like for the other stochastic ensembles so that the
# sample/feature sub-sampling is reproducible between runs.
bagging_cls = BaggingClassifier(rf_clf, n_estimators=200, max_samples=0.5, max_features=0.5, random_state=0)
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.02, max_depth=3, random_state=0)

# Ensemble models and their display names, kept in matching order.
ensemble_classifiers = [vote_hard_clf, rf_clf, stack_clf, ada_clf, bagging_cls, gb_clf]
nameEnsembleClassifiers = ['voting', 'RandomForest', 'stacking', 'AdaBoost', 'Bagging', 'GB']

# Splitting the data: Use any Splitting Criterion

In [None]:
# Draw the stratified 70/30 hold-out splits once so that every classifier is
# evaluated on exactly the same train/test partitions.
number_of_splits = 5
sss = StratifiedShuffleSplit(test_size=0.3, n_splits=number_of_splits, random_state=0)
split_pairs = list(sss.split(X, y))
train_indexes = [train_part for train_part, _ in split_pairs]
test_indexes = [test_part for _, test_part in split_pairs]

In [None]:
# Evaluate every baseline classifier on each hold-out split and collect the
# per-split scores in `results` (classifier name -> list of scores).
results = {}
for clf_name, clf in zip(nameBaselineClassifiers, baselineClassifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation: statistics are learned on the training part only and
        # then applied to both parts.
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Scaling numeric features (fitted on the training part only)
        scalor.fit(X_train[numFeatures])
        X_train[numFeatures] = scalor.transform(X_train[numFeatures])
        X_test[numFeatures] = scalor.transform(X_test[numFeatures])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train)
        X_test = pd.get_dummies(X_test)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()

In [None]:
# Evaluate every ensemble classifier on the same hold-out splits; the scores
# are added to the existing `results` dict, which this cell does not create.
for clf_name, clf in zip(nameEnsembleClassifiers, ensemble_classifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation: statistics are learned on the training part only and
        # then applied to both parts.
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Scaling numeric features (fitted on the training part only)
        scalor.fit(X_train[numFeatures])
        X_train[numFeatures] = scalor.transform(X_train[numFeatures])
        X_test[numFeatures] = scalor.transform(X_test[numFeatures])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train)
        X_test = pd.get_dummies(X_test)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()
    

# Which classifier is the best based on the mean of repeated 1 holdout cross-validation?

In [None]:
# Find the classifier whose mean score over the hold-out splits is the highest.
# On ties the classifier that was inserted into `results` first is kept.
maximum = float('-inf')
maxClassifier = None
for nameClassifier, scores in results.items():
    avg = np.average(scores)
    if avg > maximum:
        maximum = avg
        maxClassifier = nameClassifier

if maxClassifier is None:
    # Nothing was evaluated (or no average was comparable), so there is no winner to report.
    print('No classifier results are available to compare')
else:
    print('The classifier {0} has the maximum average of {1}'.format(maxClassifier, maximum))

In [None]:
# One list of per-split scores per classifier, in the insertion order of `results`.
test_significance = list(results.values())

# Check whether the Friedman test is significant
chi_square, p_value_mean = stats.friedmanchisquare(*test_significance)
print(p_value_mean)

In [None]:
# Transpose the score lists (one row per split, one column per classifier)
# before passing them to the Nemenyi post-hoc test, then print its result.
score_matrix = np.array(test_significance)
trans_groups = np.transpose(score_matrix)
p = posthoc_nemenyi_friedman(trans_groups)
print(p)

# Feature selection methods 


What are the differences between feature selection, dimensionality reduction, and feature extraction?



Resources are [link1](https://towardsdatascience.com/feature-selection-for-machine-learning-3-categories-and-12-methods-6a4403f86543) and [link2](https://towardsdatascience.com/feature-selection-for-machine-learning-3-categories-and-12-methods-6a4403f86543) 

In [Sklearn](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)

## Feature selection 

1. It eliminates irrelevant and noisy features by keeping the ones with minimum redundancy and maximum relevance to the target variable.
2. It reduces the computational time and complexity of training and testing a classifier, so it results in more cost-effective models.
3. It improves learning algorithms’ performance, avoids overfitting, and helps to create better general models.

There are three categories of feature selection methods, depending on how they interact with the classifier, namely: 
1. filter.
2. wrapper.
3. embedded methods.

### Filter methods 

They are scalable (up to very high-dimensional data) and perform fast feature selection before classification so that the bias of a learning algorithm does not interact with the bias of the feature selection algorithm.

#### Chi-square

The chi-square statistic measures the dependence between a feature and the target variable. If the two are independent, the feature gets a low score; if they are dependent, the score is high and the feature is important. In other words, a higher chi-square value means that the feature is more relevant to the class.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
chi_selector = SelectKBest(chi2, k=15)  # Use k='all' if you need to rank all

# Evaluate the baseline classifiers with chi-square feature selection applied
# inside every split. `results` is restarted for this experiment.
results = {}
for clf_name, clf in zip(nameBaselineClassifiers, baselineClassifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation (fitted on the training part only)
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Remember the column labels: the selector returns plain arrays.
        cols = X_train.columns

        # Feature selection (fitted on the training part only)
        chi_selector.fit(X_train, y_train)
        column_names = cols[chi_selector.get_support()]
        X_train = pd.DataFrame(chi_selector.transform(X_train), columns=column_names)
        X_test = pd.DataFrame(chi_selector.transform(X_test), columns=column_names)
        newNumFeaturs = [num for num in numFeatures if num in column_names]
        newCatFeaturs = [cat for cat in catFeatures if cat in column_names]

        # Scaling numeric features. Skipped when the selector dropped every
        # numeric column, because the scaler cannot be fitted on zero columns.
        if newNumFeaturs:
            scalor.fit(X_train[newNumFeaturs])
            X_train[newNumFeaturs] = scalor.transform(X_train[newNumFeaturs])
            X_test[newNumFeaturs] = scalor.transform(X_test[newNumFeaturs])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train, columns=newCatFeaturs)
        X_test = pd.get_dummies(X_test, columns=newCatFeaturs)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()

####  Mutual Information

A feature is considered relevant if it has a high information gain. It cannot handle redundant features, because features are selected in a univariate way.

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# By default mutual_info_classif treats every column of a dense matrix as
# continuous and draws unseeded random noise. Flag the categorical columns as
# discrete and fix the seed so the scores use the matching estimator and the
# selected features are reproducible.
discrete_mask = np.array([col in catFeatures for col in X.columns], dtype=bool)
mi_score = partial(mutual_info_classif, discrete_features=discrete_mask, random_state=0)
me_selector = SelectKBest(mi_score, k=15)  # Use k='all' if you need to rank all

# Evaluate the baseline classifiers with mutual-information feature selection
# applied inside every split. `results` is restarted for this experiment.
results = {}
for clf_name, clf in zip(nameBaselineClassifiers, baselineClassifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation (fitted on the training part only)
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Remember the column labels: the selector returns plain arrays.
        cols = X_train.columns

        # Feature selection (fitted on the training part only)
        me_selector.fit(X_train, y_train)
        column_names = cols[me_selector.get_support()]
        X_train = pd.DataFrame(me_selector.transform(X_train), columns=column_names)
        X_test = pd.DataFrame(me_selector.transform(X_test), columns=column_names)
        newNumFeaturs = [num for num in numFeatures if num in column_names]
        newCatFeaturs = [cat for cat in catFeatures if cat in column_names]

        # Scaling numeric features. Skipped when the selector dropped every
        # numeric column, because the scaler cannot be fitted on zero columns.
        if newNumFeaturs:
            scalor.fit(X_train[newNumFeaturs])
            X_train[newNumFeaturs] = scalor.transform(X_train[newNumFeaturs])
            X_test[newNumFeaturs] = scalor.transform(X_test[newNumFeaturs])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train, columns=newCatFeaturs)
        X_test = pd.get_dummies(X_test, columns=newCatFeaturs)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()

####  Wrapper methods

Recursive feature elimination (RFE) is a widely used wrapper method: it trains a model iteratively and, at each iteration, removes the least important feature, using the model's weights as the criterion.
It is a multivariate method in the sense that it evaluates the relevance of several features considered jointly.

## Feature selection using statistical tests from scipy.stats

1. [Mann–Whitney U test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test)
2. [Chi-Squared test](https://en.wikipedia.org/wiki/Chi-squared_test)

In [None]:
# NOTE: this import rebinds the name `chi2` to scipy's chi-square distribution,
# shadowing the sklearn `chi2` score function imported in the chi-square cell;
# re-run that cell's import before building another SelectKBest(chi2, ...).
from scipy.stats import mannwhitneyu,chi2_contingency,chi2

# Evaluate the baseline classifiers, keeping in every split only the features
# that are significantly associated with the class (p <= 0.05) on the training
# part. `results` is restarted for this experiment.
results = {}
for clf_name, clf in zip(nameBaselineClassifiers, baselineClassifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation (fitted on the training part only)
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Categorical features: chi-squared test of independence between the
        # feature and the class on the feature-by-class contingency table.
        newCatFeatures = []
        for fe in catFeatures:
            table = pd.crosstab(X_train[fe].to_numpy().flatten(), y_train.to_numpy().flatten())
            _, p, _, _ = chi2_contingency(table)
            if p <= 0.05:
                newCatFeatures.append(fe)

        # Numeric features: Mann-Whitney U test between the feature values of
        # the two classes (the feature must be compared across the class
        # groups, not against the labels themselves).
        # Assumes a binary target, as in the feature description (DIE / LIVE).
        class_labels = np.unique(y_train)
        newNumFeatures = []
        for fe in numFeatures:
            first_group = X_train.loc[y_train == class_labels[0], fe]
            second_group = X_train.loc[y_train == class_labels[1], fe]
            _, p = mannwhitneyu(first_group, second_group, alternative='two-sided')
            if p <= 0.05:
                newNumFeatures.append(fe)

        # The global catFeatures / numFeatures are deliberately left untouched:
        # every split (and every later cell) must start from the full lists.
        if not (newNumFeatures or newCatFeatures):
            # No feature passed the tests in this split; keep all of them so
            # that the classifier can still be trained.
            print('No significant feature found; using all features for this split')
            newNumFeatures, newCatFeatures = list(numFeatures), list(catFeatures)

        X_train = X_train[newNumFeatures + newCatFeatures].copy()
        X_test = X_test[newNumFeatures + newCatFeatures].copy()

        # Scaling numeric features. Skipped when no numeric feature was kept,
        # because the scaler cannot be fitted on zero columns.
        if newNumFeatures:
            scalor.fit(X_train[newNumFeatures])
            X_train[newNumFeatures] = scalor.transform(X_train[newNumFeatures])
            X_test[newNumFeatures] = scalor.transform(X_test[newNumFeatures])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train)
        X_test = pd.get_dummies(X_test)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()

In [None]:
# Evaluate the ensemble classifiers, keeping in every split only the features
# that are significantly associated with the class (p <= 0.05) on the training
# part. Scores are added to the existing `results` dict, which this cell does
# not create; mannwhitneyu and chi2_contingency must already be imported.
for clf_name, clf in zip(nameEnsembleClassifiers, ensemble_classifiers):
    print(clf_name)
    results[clf_name] = []
    for tr, te in zip(train_indexes, test_indexes):
        # Explicit copies: the frames are modified below, and the column
        # assignments must never write into a selection of X.
        X_train, X_test = X.iloc[tr].copy(), X.iloc[te].copy()
        y_train, y_test = y.iloc[tr], y.iloc[te]

        # Imputation (fitted on the training part only)
        imp_mode.fit(X_train[catFeatures])
        imp_mean.fit(X_train[numFeatures])
        X_train[catFeatures] = imp_mode.transform(X_train[catFeatures])
        X_test[catFeatures] = imp_mode.transform(X_test[catFeatures])
        X_train[numFeatures] = imp_mean.transform(X_train[numFeatures])
        X_test[numFeatures] = imp_mean.transform(X_test[numFeatures])

        # Categorical features: chi-squared test of independence between the
        # feature and the class on the feature-by-class contingency table.
        newCatFeatures = []
        for fe in catFeatures:
            table = pd.crosstab(X_train[fe].to_numpy().flatten(), y_train.to_numpy().flatten())
            _, p, _, _ = chi2_contingency(table)
            if p <= 0.05:
                newCatFeatures.append(fe)

        # Numeric features: Mann-Whitney U test between the feature values of
        # the two classes (the feature must be compared across the class
        # groups, not against the labels themselves).
        # Assumes a binary target, as in the feature description (DIE / LIVE).
        class_labels = np.unique(y_train)
        newNumFeatures = []
        for fe in numFeatures:
            first_group = X_train.loc[y_train == class_labels[0], fe]
            second_group = X_train.loc[y_train == class_labels[1], fe]
            _, p = mannwhitneyu(first_group, second_group, alternative='two-sided')
            if p <= 0.05:
                newNumFeatures.append(fe)

        # The global catFeatures / numFeatures are deliberately left untouched:
        # every split (and every later cell) must start from the full lists.
        if not (newNumFeatures or newCatFeatures):
            # No feature passed the tests in this split; keep all of them so
            # that the classifier can still be trained.
            print('No significant feature found; using all features for this split')
            newNumFeatures, newCatFeatures = list(numFeatures), list(catFeatures)

        X_train = X_train[newNumFeatures + newCatFeatures].copy()
        X_test = X_test[newNumFeatures + newCatFeatures].copy()

        # Scaling numeric features. Skipped when no numeric feature was kept,
        # because the scaler cannot be fitted on zero columns.
        if newNumFeatures:
            scalor.fit(X_train[newNumFeatures])
            X_train[newNumFeatures] = scalor.transform(X_train[newNumFeatures])
            X_test[newNumFeatures] = scalor.transform(X_test[newNumFeatures])

        # Encoding the categorical features; the test frame is aligned to the
        # training columns so that both have identical dummy columns.
        X_train = pd.get_dummies(X_train)
        X_test = pd.get_dummies(X_test)
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        model = clf.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        f1_value = f1_score(y_test, y_test_pred, average='micro')
        print(f1_value)
        results[clf_name].append(f1_value)
    print('The average of the classifiers\'results', np.average(results[clf_name]))
    print()
    