## Filter Methods - Basics - Plus - Filter statistical tests

### Putting it all together

In [18]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, f_classif, SelectKBest

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score, r2_score, accuracy_score, f1_score

In [35]:
# load the Santander customer satisfaction dataset from Kaggle

data = pd.read_csv('C:/Users/RAJENDRA REDDY/Downloads/Genre1.csv')
data.shape

(200, 36)

In [36]:
data.head()

Unnamed: 0,chroma_stft_min,chroma_stft_max,chroma_cqt_min,chroma_cqt_max,chroma_cens_min,chroma_cens_max,melspectogram_min,melspectogram_max,mfcc_min,mfcc_max,...,zero_crossing_rate_min,zero_crossing_rate_max,tempogram_min,tempogram_max,delta_mfcc_min,delta_mfcc_max,mel_to_stft_min,mel_to_stft_max,class,song
0,0.001296,1,0.033154,1,0.003514,0.739581,8.89e-06,6547.407,-162.60739,148.07231,...,0.02002,0.305176,-2.85e-16,1,-27.087835,25.198893,0,18.772789,1,Aa To Sahii (sahi)_shortened.wav
1,0.002739,1,0.062056,1,0.020606,0.682328,1.99e-09,3179.2095,-243.84023,156.03381,...,0.008301,0.543457,-2.85e-16,1,-24.83185,26.813145,0,14.955276,1,Aadat (23)_shortened.wav
2,0.003432,1,0.056286,1,0.02501,0.674345,1.47e-06,367.87683,-197.41306,134.92323,...,0.054688,0.480957,-3.32e-16,1,-14.765142,14.908866,0,9.169767,1,Aag Chahat Ki Lag Jayegi (1)_shortened.wav
3,0.000696,1,0.049335,1,0.0,0.777123,8.43e-07,5928.974,-204.6526,162.19836,...,0.004883,0.195801,-2.44e-16,1,-29.71674,21.724106,0,17.88996,1,Aahista Aahista (16)_shortened.wav
4,0.000197,1,0.02621,1,0.0,0.782509,2.6e-11,722.85565,-351.27094,223.6753,...,0.024902,0.239258,-3.61e-16,1,-22.297218,16.177706,0,10.572831,1,Aaiye Meharban (23)_shortened.wav


In [37]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['class','song'], axis=1),
    data['class'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((140, 34), (60, 34))

In [38]:
# I keep a copy of the dataset with all the variables
# to compare the performance of machine learning models
# at the end of the notebook

X_train_original = X_train.copy()
X_test_original = X_test.copy()

### Remove constant features

In [39]:
constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0
]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((140, 30), (60, 30))

### Remove quasi-constant features

In [40]:
sel = VarianceThreshold(threshold=0.01)

sel.fit(X_train) # finds the features with low variance

sum(sel.get_support()) # how many not quasi-constant?

21

In [41]:
features_to_keep = X_train.columns[sel.get_support()]

In [42]:
# remove the features

X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((140, 21), (60, 21))

In [43]:
# sklearn transformations lead to numpy arrays
# here I transform the arrays back to dataframes

X_train= pd.DataFrame(X_train)
X_train.columns = features_to_keep

X_test= pd.DataFrame(X_test)
X_test.columns = features_to_keep

### Remove duplicated features

In [44]:
# check for duplicated features in the training set

duplicated_feat = []
for i in range(0, len(X_train.columns)):
    if i % 10 == 0:  # this helps me understand how the loop is going
        print(i)

    col_1 = X_train.columns[i]

    for col_2 in X_train.columns[i + 1:]:
        if X_train[col_1].equals(X_train[col_2]):
            duplicated_feat.append(col_2)
            
len(duplicated_feat)

0
10
20


0

In [45]:
# remove duplicated features
X_train.drop(labels=duplicated_feat, axis=1, inplace=True)
X_test.drop(labels=duplicated_feat, axis=1, inplace=True)

X_train.shape, X_test.shape

((140, 21), (60, 21))

In [46]:
# I keep a copy of the dataset except constant and duplicated variables
# to measure the performance of machine learning models
# at the end of the notebook

X_train_basic_filter = X_train.copy()
X_test_basic_filter = X_test.copy()

### Remove correlated features

In [47]:
# find and remove correlated features
def correlation(dataset, threshold):
    
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            # we are interested in absolute coeff value
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    
    return col_corr


corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)))

correlated features:  10


In [48]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((140, 11), (60, 11))

In [49]:
# keep a copy of the dataset at  this stage
X_train_corr = X_train.copy()
X_test_corr = X_test.copy()

### Select features based of anova

In [50]:
sel_ = SelectKBest(f_classif, k=10).fit(X_train, y_train)

# capture selected feature names
features_to_keep = X_train.columns[sel_.get_support()]

# select features
X_train_anova = sel_.transform(X_train)
X_test_anova = sel_.transform(X_test)

# numpy array to dataframe
X_train_anova = pd.DataFrame(X_train_anova)
X_train_anova.columns = features_to_keep

X_test_anova = pd.DataFrame(X_test_anova)
X_test_anova.columns = features_to_keep

X_train_anova.shape, X_test_anova.shape
features_to_keep

  msb = ssbn / float(dfbn)


Index(['mfcc_min', 'mfcc_max', 'spectral_centroid_min',
       'spectral_centroid_max', 'spectral_contrast_min',
       'spectral_contrast_max', 'tonnetz_min', 'tonnetz_max', 'delta_mfcc_min',
       'delta_mfcc_max'],
      dtype='object')

In [211]:
trainy, testy = y_train, y_test
# define outlier detection model
trainX = X_train[features_to_keep]
testX =  X_test[features_to_keep]

In [212]:
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
from sklearn.metrics import precision_score, recall_score, accuracy_score
# generate dataset

# split into train/test sets

model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class

model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(trainX)
# mark inliers 1, outliers -1

# calculate score

print('Accuracy Score: %.3f' % accuracy_score(y_train, yhat))
print('F1 Score: %.3f' % f1_score(y_train, yhat, pos_label=1))
print('Precision Score: %.3f' % precision_score(y_train, yhat, average='micro'))
print('Recall Score: %.3f' % recall_score(y_train, yhat, average='micro'))

Accuracy Score: 0.986
F1 Score: 0.993
Precision Score: 0.986
Recall Score: 0.986


In [214]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.covariance import EllipticEnvelope
# generate dataset

# define outlier detection model
model = EllipticEnvelope(contamination=0.01)
# fit on majority class

model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(trainX)
# mark inliers 1, outliers -1

# calculate score
print('Accuracy Score: %.3f' % accuracy_score(y_train, yhat))
print('F1 Score: %.3f' % f1_score(y_train, yhat, pos_label=1))
print('Precision Score: %.3f' % precision_score(y_train, yhat, average='micro'))
print('Recall Score: %.3f' % recall_score(y_train, yhat, average='micro'))

Accuracy Score: 0.986
F1 Score: 0.993
Precision Score: 0.986
Recall Score: 0.986


In [215]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
# generate dataset

# split into train/test sets

# define outlier detection model
model = IsolationForest(contamination=0.01)
# fit on majority class

model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(trainX)
# mark inliers 1, outliers -1

# calculate score
score = f1_score(trainy, yhat, pos_label=1)
print('F1 Score: %.3f' % score)

F1 Score: 0.993


In [223]:
from numpy import vstack
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.neighbors import LocalOutlierFactor

# make a prediction with a lof model
def lof_predict(model, trainX, testX):
	# create one large dataset
	composite = vstack((trainX, testX))
	# make prediction on composite dataset
	yhat = model.fit_predict(composite)
	# return just the predictions on the test set
	return yhat[len(trainX):]

# generate dataset

# split into train/test sets

# define outlier detection model
model = LocalOutlierFactor(contamination=0.01)
# get examples for just the majority class

# detect outliers in the test set
yhat = lof_predict(model,testX,trainX)
# mark inliers 1, outliers -1

# calculate score
score = f1_score(trainy, yhat, pos_label=1)
print('F1 Score: %.3f' % score)

F1 Score: 0.993


### Compare the performance in machine learning algorithms

In [140]:
# create a function to build random forests and
# compare its performance in train and test sets

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred)))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred)))

In [141]:
# original
run_randomForests(X_train_original,
                  X_test_original,
                  y_train, y_test)

Train set


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [143]:
X_train = X_train[features_to_keep]
X_test =  X_test[features_to_keep]
from sklearn.svm import OneClassSVM
clf = OneClassSVM(gamma='auto').fit(X_train)
y_pred = clf.predict(X_test)
print('Ada Boost roc-auc: {}'.format(score = f1_score(y_test, y_pred, pos_label=1)))

IndexError: tuple index out of range

In [191]:
# one-class svm for imbalanced binary classification
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
from sklearn.metrics import precision_score, recall_score
# generate dataset

# split into train/test sets
trainy, testy = y_train, y_test
# define outlier detection model
trainX = X_train[features_to_keep]
testX =  X_test[features_to_keep]
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class

model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(trainX)
# mark inliers 1, outliers -1

# calculate score

print('Accuracy Score: %.3f' % accuracy_score(y_train, yhat))
print('F1 Score: %.3f' % f1_score(y_train, yhat, pos_label=1))
print('Precision Score: %.3f' % precision_score(y_train, yhat, average='micro'))
print('Recall Score: %.3f' % recall_score(y_train, yhat, average='micro'))

Accuracy Score: 0.986
F1 Score: 0.993
Precision Score: 0.986
Recall Score: 0.986


In [79]:
# filter methods - basic
run_randomForests(X_train_basic_filter,
                  X_test_basic_filter,
                  y_train, y_test)

Train set


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [20]:
# filter methods - correlation
run_randomForests(X_train_corr,
                  X_test_corr,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.8066004772684517
Test set
Random Forests roc-auc: 0.7859521124929707


In [21]:
# filter methods - univariate roc-auc
run_randomForests(X_train_anova,
                  X_test_anova,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.8181634778452822
Test set
Random Forests roc-auc: 0.7994720109870546


As we see, the 20 features we selected using the univariate anova are doing a good job, as the final model does not show a decrease in performance compared to that one using all features.

In [22]:
# create a function to build logistic regression
# and compare its performance in train and test sets

def run_logistic(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(penalty='l1', random_state=44, max_iter=1000, solver='liblinear')
    logit.fit(X_train, y_train)
    
    print('Train set')
    pred = logit.predict_proba(scaler.transform(X_train))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = logit.predict_proba(scaler.transform(X_test))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [23]:
# original
run_logistic(X_train_original,
             X_test_original,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7430426412785165
Test set
Logistic Regression roc-auc: 0.7514165331434336


In [24]:
# filter methods - basic

run_logistic(X_train_basic_filter,
             X_test_basic_filter,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7410468829538979
Test set
Logistic Regression roc-auc: 0.7489081614486635


In [25]:
# filter methods - correlation

run_logistic(X_train_corr,
             X_test_corr,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7307283864065812
Test set
Logistic Regression roc-auc: 0.7227227435986561


In [26]:
# filter methods - univariate anova

run_logistic(X_train_anova,
             X_test_anova,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.7385311277520487
Test set
Logistic Regression roc-auc: 0.7256599156189685


For logistic regression, we see that when we removed correlated features, we seemed to have removed some features that were good at predicting the target, as the performance dropped a bit.

Why don't you try to apply the univariate anova without removing features by correlation, to see if the selected features are good enough?

That is all for this lecture. I hope you enjoyed it!