<a href="https://colab.research.google.com/github/rodriguessdeyson/feature-comparison/blob/master-artefacts/feature-selection/FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seleção de features dados estatísticos
Nesta etapa, dois arquivos são carregados com features obtidas dos dados puros de vibração. Esta etapa realiza a comparação entre dois conjuntos de features: as comumente extraídas de dados de vibração e as features da biblioteca pyAudio (avaliar outra).

Após a extração, um processo de seleção de features é aplicado aos dois conjuntos, utilizando os métodos Pearson, Gini Gain, Information Gain e Lasso. Ao final, as features selecionadas serão submetidas a dois classificadores e suas características serão avaliadas.

Carga de dados dos dataset com as rotulações de cada falha.

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from google.colab import drive
import os
from sklearn.utils import shuffle

# Initialize the Google Drive service by mounting it at /content/drive.
drive.mount('/content/drive')

Carrega os dados e realiza a separação entre features e classes.

In [None]:
# Load the statistical-features CSV from the mounted Google Drive into a DataFrame.
data_statistics = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Master Final Project/V2/statistics_features.csv')

# Shuffle the rows with a fixed seed so every "Restart & Run All" sees the same
# row order. Row order matters downstream: LassoCV(cv=<int>) splits the rows into
# consecutive, unshuffled folds. 42 matches the random_state already used for the
# RandomForestClassifier cells in this notebook.
data_statistics = shuffle(data_statistics, random_state=42)

# Separate the features and the target variable.
X = data_statistics.drop('class', axis=1)  # Features: every column except 'class'
y = data_statistics['class']  # Target variable: the 'class' column

In [None]:
# Show the shuffled statistics DataFrame as this cell's output for a quick visual check.
data_statistics

# Feature Selection com Pearson’s Correlation Coefficient


Utilizando a seleção de features com Pearson, conseguimos remover features que são, de certa forma, semelhantes, mantendo apenas uma delas e evitando a sobrecarga do classificador. Para este modo de seleção, extrairemos as 15 principais features e as testaremos no classificador.

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Pearson correlation between every feature column and the class label.
pearson_correlation = X.corrwith(y)

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
abs_correlation = pearson_correlation.abs()
top_features = abs_correlation.sort_values(ascending=False).head(15).index

# Fit an ANOVA F-test selector (f_classif) on the shortlist, using the same k=15
# as the shortlist cap.
shortlist = X[top_features]
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(shortlist, y)

# Names and F-scores of the shortlist columns kept by the selector.
kept_positions = pearson_features.get_support(indices=True)
pearson_selected_features = shortlist.columns[kept_positions]
pearson_feature_scores = pearson_features.scores_[kept_positions]

print(pearson_selected_features)

In [None]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest, f_classif

# X (feature matrix) and y (target variable) are expected from the data-loading cell.

# Pearson r of each column against the target; only the first element of
# pearsonr's result (the correlation coefficient) is kept.
pearson_correlation = X.apply(lambda column: pearsonr(column, y)[0])

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
abs_correlation = pearson_correlation.abs()
top_features = abs_correlation.sort_values(ascending=False).head(15).index

# Fit an ANOVA F-test selector (f_classif) on the shortlist with the same k=15.
shortlist = X[top_features]
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(shortlist, y)

# Names and F-scores of the shortlist columns kept by the selector.
kept_positions = pearson_features.get_support(indices=True)
pearson_selected_features = shortlist.columns[kept_positions]
pearson_feature_scores = pearson_features.scores_[kept_positions]

# Report the selection together with its scores.
print("Selected features using Pearson correlation:", pearson_selected_features)
print("Feature scores:", pearson_feature_scores)

# Feature Selection com Information Gain Ratio

In [None]:
# 2. Feature selection using Information Gain (mutual information with the class label).
# mutual_info_classif is stochastic, so the score function is wrapped with a fixed
# random_state to make the selection reproducible across runs.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,  # keep the 15 highest-scoring features
)
information_gain_features.fit(X, y)

# get_support(indices=True) lists the kept columns in column order. Re-sort them by
# score, highest first, so a feature's position in the result is its rank (the
# rank-aggregation cell relies on that ordering).
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = X.columns[ig_order]

print("Selected features using Information Gain:", information_gain_selected_features)


# Feature Selection com Gini Gain

In [None]:
# Fit a seeded 100-tree random forest; its feature_importances_ are used to rank the columns.
forest = RandomForestClassifier(random_state=42, n_estimators=100)
forest.fit(X, y)

# Column positions sorted by importance, highest first; keep the first 15.
importance_order = forest.feature_importances_.argsort()[::-1]
gini_selected_features = X.columns[importance_order[:15]]

print("Selected features using Gini gain:", gini_selected_features)

# Feature Selection com LassoCV

In [None]:
# Feature selection using LassoCV.
lasso_cv = LassoCV(cv=15)  # 15 cross-validation folds; adjust as needed
lasso_cv.fit(X, y)

# Keep at most 15 features, matching the other selection methods and the other
# LassoCV cells in this notebook (which all pass max_features=15).
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)

# get_support(indices=True) returns column order. Re-sort by absolute coefficient,
# largest first, so a feature's position in the result is its rank.
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = X.columns[lasso_order]
print("Selected features using LASSO:", lasso_selected_features)

# Rank das features selecionadas

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# X (features) and y (class labels) are expected from the data-loading cell.

# 'Pearson' panel: top 15 features by ANOVA F-test over all columns.
# NOTE(review): f_classif computes ANOVA F-values; no Pearson correlation is
# computed in this cell, although the panel is labelled 'Pearson'.
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(X, y)
# get_support(indices=True) returns column order; sort the kept columns by score,
# highest first, so the bars really form a ranking.
pearson_support = pearson_features.get_support(indices=True)
pearson_order = pearson_support[pearson_features.scores_[pearson_support].argsort()[::-1]]
pearson_selected_features = X.columns[pearson_order]
pearson_feature_scores = pearson_features.scores_[pearson_order]

# Information Gain: top 15 features by mutual information. The scorer is seeded
# because mutual_info_classif is stochastic.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,
)
information_gain_features.fit(X, y)
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = X.columns[ig_order]
information_gain_scores = information_gain_features.scores_[ig_order]

# Gini gain: top 15 features by random-forest importance, highest first.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)
gini_order = forest.feature_importances_.argsort()[::-1][:15]
gini_selected_features = X.columns[gini_order]
gini_importance_scores = forest.feature_importances_[gini_order]

# LassoCV: at most 15 features, ordered by absolute coefficient (largest first).
# The plotted values are the signed coefficients.
lasso_cv = LassoCV(cv=15)  # 15 cross-validation folds; adjust as needed
lasso_cv.fit(X, y)
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = X.columns[lasso_order]
lasso_coefficients = lasso_cv.coef_[lasso_order]

# One bar chart per method on a 2x2 grid.
methods = ['Pearson', 'Information Gain', 'Gini Importance', 'LassoCV Coefficient']
all_selected_features = [pearson_selected_features, information_gain_selected_features, gini_selected_features, lasso_selected_features]
all_scores = [pearson_feature_scores, information_gain_scores, gini_importance_scores, lasso_coefficients]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

for ax, method, selected, scores in zip(axes.flat, methods, all_selected_features, all_scores):
    positions = list(range(len(selected)))
    ax.bar(positions, scores, color='skyblue')
    ax.set_title(f'{method} Feature Ranking')
    ax.set_xlabel('Features')
    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(selected, rotation=45, ha='right')

# Lay out once, after every panel has its rotated tick labels.
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# X (features) and y (class labels) are expected from the data-loading cell.

# Pearson r of each column against the target (first element of pearsonr's result).
pearson_correlation = X.apply(lambda x: pearsonr(x, y)[0])

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
top_features = pearson_correlation.abs().sort_values(ascending=False).head(15).index

# Score the shortlist with the ANOVA F-test.
# NOTE(review): the values plotted in the 'Pearson' panel are f_classif F-values
# for the correlation shortlist, not the correlation coefficients themselves.
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(X[top_features], y)
pearson_support = pearson_features.get_support(indices=True)
pearson_selected_features = X[top_features].columns[pearson_support]
pearson_feature_scores = pearson_features.scores_[pearson_support]

# Information Gain: top 15 features by mutual information. The scorer is seeded
# because mutual_info_classif is stochastic. get_support(indices=True) returns
# column order, so the kept columns are re-sorted by score, highest first.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,
)
information_gain_features.fit(X, y)
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = X.columns[ig_order]
information_gain_scores = information_gain_features.scores_[ig_order]

# Gini gain: top 15 features by random-forest importance, highest first.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)
gini_order = forest.feature_importances_.argsort()[::-1][:15]
gini_selected_features = X.columns[gini_order]
gini_importance_scores = forest.feature_importances_[gini_order]

# LassoCV: at most 15 features, ordered by absolute coefficient (largest first).
# The plotted values are the signed coefficients.
lasso_cv = LassoCV(cv=15)  # 15 cross-validation folds; adjust as needed
lasso_cv.fit(X, y)
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = X.columns[lasso_order]
lasso_coefficients = lasso_cv.coef_[lasso_order]

# One bar chart per method on a 2x2 grid.
methods = ['Pearson', 'Information Gain', 'Gini Importance', 'LassoCV Coefficient']
all_selected_features = [pearson_selected_features, information_gain_selected_features, gini_selected_features, lasso_selected_features]
all_scores = [pearson_feature_scores, information_gain_scores, gini_importance_scores, lasso_coefficients]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

for ax, method, selected, scores in zip(axes.flat, methods, all_selected_features, all_scores):
    positions = list(range(len(selected)))
    ax.bar(positions, scores, color='skyblue')
    ax.set_title(f'{method} Feature Ranking')
    ax.set_xlabel('Features')
    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(selected, rotation=45, ha='right')

# Lay out once, after every panel has its rotated tick labels.
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# X (features) and y (class labels) are expected from the data-loading cell.

# Pearson correlation between every feature column and the class label.
pearson_correlation = X.corrwith(y)

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
top_features = pearson_correlation.abs().sort_values(ascending=False).head(15).index

# Score the shortlist with the ANOVA F-test.
# NOTE(review): the values plotted in the 'Pearson' panel are f_classif F-values
# for the correlation shortlist, not the correlation coefficients themselves.
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(X[top_features], y)
pearson_support = pearson_features.get_support(indices=True)
pearson_selected_features = X[top_features].columns[pearson_support]
pearson_feature_scores = pearson_features.scores_[pearson_support]

# Information Gain: top 15 features by mutual information. The scorer is seeded
# because mutual_info_classif is stochastic. get_support(indices=True) returns
# column order, so the kept columns are re-sorted by score, highest first; the
# rank-aggregation cell reads a feature's position as its rank.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,
)
information_gain_features.fit(X, y)
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = X.columns[ig_order]
information_gain_scores = information_gain_features.scores_[ig_order]

# Gini gain: top 15 features by random-forest importance, highest first.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)
gini_order = forest.feature_importances_.argsort()[::-1][:15]
gini_selected_features = X.columns[gini_order]
gini_importance_scores = forest.feature_importances_[gini_order]

# LassoCV: at most 15 features, ordered by absolute coefficient (largest first).
# The plotted values are the signed coefficients.
lasso_cv = LassoCV(cv=10)  # 10 cross-validation folds; adjust as needed
lasso_cv.fit(X, y)
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = X.columns[lasso_order]
lasso_coefficients = lasso_cv.coef_[lasso_order]

# One bar chart per method, stacked vertically.
methods = ['Pearson', 'Information Gain', 'Gini Importance', 'LassoCV Coefficient']
all_selected_features = [pearson_selected_features, information_gain_selected_features, gini_selected_features, lasso_selected_features]
all_scores = [pearson_feature_scores, information_gain_scores, gini_importance_scores, lasso_coefficients]

fig, axes = plt.subplots(4, 1, figsize=(10, 16))

for ax, method, selected, scores in zip(axes.flat, methods, all_selected_features, all_scores):
    positions = list(range(len(selected)))
    ax.bar(positions, scores, color='skyblue')
    ax.set_title(f'{method} Feature Ranking')
    ax.set_xlabel('Features')
    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(selected, rotation=45, ha='right')

# Lay out once, after every panel has its rotated tick labels.
fig.tight_layout()
plt.show()


In [None]:
# Dictionary that accumulates, per feature, the sum of its ranks across the four
# selection methods (lower total = better).
feature_ranks = {}

# One list per selection method; a feature's 1-based position in a list is used as
# its rank for that method.
# NOTE(review): this only makes sense if each list is ordered best-first by the
# cell that produced it - confirm for every method.
ranked_lists = [
    pearson_selected_features,
    information_gain_selected_features,
    lasso_selected_features,
    gini_selected_features,
]

# Every feature chosen by at least one method, in first-seen order so that ties in
# the final (stable) sort resolve the same way on every run.
candidate_features = list(dict.fromkeys(
    feature for selected in ranked_lists for feature in selected
))

for selected in ranked_lists:
    positions = {feature: idx + 1 for idx, feature in enumerate(selected)}
    # A feature this method did not select is charged one rank past the method's
    # worst. Without the penalty it would contribute 0, and a feature picked by a
    # single method would outrank features picked by all four.
    missing_rank = len(selected) + 1
    for feature in candidate_features:
        feature_ranks[feature] = feature_ranks.get(feature, 0) + positions.get(feature, missing_rank)

# Sort the features by their summed rank, best (lowest) first.
sorted_features = sorted(feature_ranks, key=feature_ranks.get)

# Print the ranked features.
print("Ranked Features:")
for rank, feature in enumerate(sorted_features, 1):
    print(f"Rank {rank}: {feature}")

# Seleção de features dados pyAudio
Nesta etapa, dois arquivos são carregados com features obtidas dos dados puros de vibração. Esta etapa realiza a comparação entre dois conjuntos de features: as comumente extraídas de dados de vibração e as features da biblioteca pyAudio (avaliar outra).

Após a extração, um processo de seleção de features é aplicado aos dois conjuntos, utilizando os métodos Pearson, Gini Gain, Information Gain e Lasso. Ao final, as features selecionadas serão submetidas a dois classificadores e suas características serão avaliadas.

In [None]:
# Load the pyAudio-features CSV from the mounted Google Drive into a DataFrame.
data_pyaudio = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Master Final Project/V2/vib_pyaudio_features.csv')

# Shuffle the rows with a fixed seed so every "Restart & Run All" sees the same
# row order. Row order matters downstream: LassoCV(cv=<int>) splits the rows into
# consecutive, unshuffled folds. 42 matches the random_state already used for the
# RandomForestClassifier cells in this notebook.
data_pyaudio = shuffle(data_pyaudio, random_state=42)

# Separate the features and the target variable.
XX = data_pyaudio.drop('class', axis=1)  # Features: every column except 'class'
yy = data_pyaudio['class']  # Target variable: the 'class' column

In [None]:
# Show the shuffled pyAudio DataFrame as this cell's output for a quick visual check.
data_pyaudio

# Feature Selection com Pearson’s Correlation Coefficient


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Pearson correlation between every pyAudio feature column and the class label.
pearson_correlation = XX.corrwith(yy)

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
abs_correlation = pearson_correlation.abs()
top_features = abs_correlation.sort_values(ascending=False).head(15).index

# Fit an ANOVA F-test selector (f_classif) on the shortlist, using the same k=15
# as the shortlist cap.
shortlist = XX[top_features]
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(shortlist, yy)

# Names and F-scores of the shortlist columns kept by the selector.
kept_positions = pearson_features.get_support(indices=True)
pearson_selected_features = shortlist.columns[kept_positions]
pearson_feature_scores = pearson_features.scores_[kept_positions]

print(pearson_selected_features)

# Feature Selection com Information Gain Ratio

In [None]:
# 2. Feature selection using Information Gain (mutual information with the class label).
# mutual_info_classif is stochastic, so the score function is wrapped with a fixed
# random_state to make the selection reproducible across runs.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,  # keep the 15 highest-scoring features
)
information_gain_features.fit(XX, yy)

# get_support(indices=True) lists the kept columns in column order. Re-sort them by
# score, highest first, so a feature's position in the result is its rank (the
# rank-aggregation cell relies on that ordering).
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = XX.columns[ig_order]

print("Selected features using Information Gain:", information_gain_selected_features)


# Feature Selection com Gini Gain

In [None]:
# Fit a seeded 100-tree random forest; its feature_importances_ are used to rank the columns.
forest = RandomForestClassifier(random_state=42, n_estimators=100)
forest.fit(XX, yy)

# Column positions sorted by importance, highest first; keep the first 15.
importance_order = forest.feature_importances_.argsort()[::-1]
gini_selected_features = XX.columns[importance_order[:15]]

print("Selected features using Gini gain:", gini_selected_features)

# Feature Selection com LassoCV

In [None]:
# Feature selection using LassoCV.
lasso_cv = LassoCV(cv=10)  # 10 cross-validation folds; adjust as needed
lasso_cv.fit(XX, yy)

# Keep at most 15 features (max_features=15 already enforces the cap, so no
# further slicing is needed when printing).
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)

# get_support(indices=True) returns column order. Re-sort by absolute coefficient,
# largest first, so a feature's position in the result is its rank (the
# rank-aggregation cell relies on that ordering).
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = XX.columns[lasso_order]
print("Selected features using LASSO:", lasso_selected_features)

# Rank das features selecionadas

In [None]:
# Dictionary that accumulates, per feature, the sum of its ranks across the four
# selection methods (lower total = better).
feature_ranks_pyAudio = {}

# One list per selection method; a feature's 1-based position in a list is used as
# its rank for that method.
# NOTE(review): this only makes sense if each list is ordered best-first by the
# cell that produced it - confirm for every method.
ranked_lists = [
    pearson_selected_features,
    information_gain_selected_features,
    lasso_selected_features,
    gini_selected_features,
]

# Every feature chosen by at least one method, in first-seen order so that ties in
# the final (stable) sort resolve the same way on every run.
candidate_features = list(dict.fromkeys(
    feature for selected in ranked_lists for feature in selected
))

for selected in ranked_lists:
    positions = {feature: idx + 1 for idx, feature in enumerate(selected)}
    # A feature this method did not select is charged one rank past the method's
    # worst. Without the penalty it would contribute 0, and a feature picked by a
    # single method would outrank features picked by all four.
    missing_rank = len(selected) + 1
    for feature in candidate_features:
        feature_ranks_pyAudio[feature] = feature_ranks_pyAudio.get(feature, 0) + positions.get(feature, missing_rank)

# Sort the features by their summed rank, best (lowest) first.
sorted_features = sorted(feature_ranks_pyAudio, key=feature_ranks_pyAudio.get)

# Print the ranked features.
print("Ranked Features:")
for rank, feature in enumerate(sorted_features, 1):
    print(f"Rank {rank}: {feature}")

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# XX (pyAudio features) and yy (class labels) are expected from the pyAudio
# data-loading cell.

# Pearson correlation between every feature column and the class label.
pearson_correlation = XX.corrwith(yy)

# Shortlist up to 15 columns with the largest absolute correlation, strongest first.
top_features = pearson_correlation.abs().sort_values(ascending=False).head(15).index

# Score the shortlist with the ANOVA F-test.
# NOTE(review): the values plotted in the 'Pearson' panel are f_classif F-values
# for the correlation shortlist, not the correlation coefficients themselves.
pearson_features = SelectKBest(score_func=f_classif, k=15)
pearson_features.fit(XX[top_features], yy)
pearson_support = pearson_features.get_support(indices=True)
pearson_selected_features = XX[top_features].columns[pearson_support]
pearson_feature_scores = pearson_features.scores_[pearson_support]

# Information Gain: top 15 features by mutual information. The scorer is seeded
# because mutual_info_classif is stochastic. get_support(indices=True) returns
# column order, so the kept columns are re-sorted by score, highest first.
information_gain_features = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,
)
information_gain_features.fit(XX, yy)
ig_support = information_gain_features.get_support(indices=True)
ig_order = ig_support[information_gain_features.scores_[ig_support].argsort()[::-1]]
information_gain_selected_features = XX.columns[ig_order]
information_gain_scores = information_gain_features.scores_[ig_order]

# Gini gain: top 15 features by random-forest importance, highest first.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(XX, yy)
gini_order = forest.feature_importances_.argsort()[::-1][:15]
gini_selected_features = XX.columns[gini_order]
gini_importance_scores = forest.feature_importances_[gini_order]

# LassoCV: at most 15 features, ordered by absolute coefficient (largest first).
# The plotted values are the signed coefficients.
lasso_cv = LassoCV(cv=10)  # 10 cross-validation folds; adjust as needed
lasso_cv.fit(XX, yy)
lasso_support = SelectFromModel(lasso_cv, prefit=True, max_features=15).get_support(indices=True)
lasso_order = lasso_support[abs(lasso_cv.coef_[lasso_support]).argsort()[::-1]]
lasso_selected_features = XX.columns[lasso_order]
lasso_coefficients = lasso_cv.coef_[lasso_order]

# One bar chart per method, stacked vertically.
methods = ['Pearson', 'Information Gain', 'Gini Importance', 'LassoCV Coefficient']
all_selected_features = [pearson_selected_features, information_gain_selected_features, gini_selected_features, lasso_selected_features]
all_scores = [pearson_feature_scores, information_gain_scores, gini_importance_scores, lasso_coefficients]

fig, axes = plt.subplots(4, 1, figsize=(10, 16))

for ax, method, selected, scores in zip(axes.flat, methods, all_selected_features, all_scores):
    positions = list(range(len(selected)))
    ax.bar(positions, scores, color='skyblue')
    ax.set_title(f'{method} Feature Ranking')
    ax.set_xlabel('Features')
    ax.set_ylabel('Scores')
    ax.set_xticks(positions)
    ax.set_xticklabels(selected, rotation=45, ha='right')

# Lay out once, after every panel has its rotated tick labels.
fig.tight_layout()
plt.show()