In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score

### Carregar e preparar dados - Classificação

In [2]:
# Load the fruit-quality dataset from the local datasets folder
df = pd.read_csv('./datasets/fruit_quality.csv')

In [3]:
# Inspect the structure: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   int64  
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 281.4+ KB


In [4]:
# Preview the first 5 rows of the DataFrame
df.head(5)

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [5]:
# Adjust the DataFrame (written so the cell can be re-run safely)

# Drop the 'A_id' identifier column; errors='ignore' turns a second run
# into a no-op instead of a KeyError, and no in-place mutation is needed.
df = df.drop(columns='A_id', errors='ignore')

# Encode 'Quality' as binary: 'good' -> 1, anything else -> 0.
# Skip the step when the column is already numeric; otherwise a re-run would
# compare integers against 'good' and silently zero every label.
if not pd.api.types.is_numeric_dtype(df['Quality']):
  df['Quality'] = (df['Quality'] == 'good').astype(int)
df.head(5)

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1


### Treinar modelo de regressão logística com RFE

In [6]:
# Split the frame into the target (y) and the feature matrix (X)
y = df['Quality']
X = df.drop(columns='Quality')

In [7]:
# Split into train and test sets: 30% of the rows are held out for testing,
# with a fixed random_state so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=51)

In [8]:
# Train the model with RFE
# RFE (Recursive Feature Elimination)
# It takes an estimator and a target number of features (two hyperparameters)
# RFE iterates starting from all features, eliminating some on each iteration until the requested number remains
# The least important features are the ones eliminated
rfe_method = RFE(estimator=LogisticRegression(), n_features_to_select=2)
rfe_method.fit(X_train, y_train)

In [9]:
# Names of the features kept by RFE (boolean support mask applied to the column index)
X_train.columns[rfe_method.get_support()]

Index(['Size', 'Sweetness'], dtype='object')

In [10]:
# Feature ranking helper
def mostrar_ranking(metodo_fs, X_train):
  """Print the feature ranking of a fitted selector, best-ranked first.

  Args:
    metodo_fs: Fitted feature-selection object exposing a ``ranking_``
      attribute with one rank per column of ``X_train`` (such as the
      fitted RFE used in this notebook).
    X_train: DataFrame whose columns are the features being ranked, in the
      same order as ``metodo_fs.ranking_``.

  Returns:
    None. The ranking table is printed to stdout.
  """
  # Read the ranking from the selector that was passed in, not from a
  # notebook-level variable, so the helper reports the right selector.
  ranking = metodo_fs.ranking_

  # Feature names, aligned positionally with the ranking
  nomes_features = X_train.columns.to_list()

  # Build a two-column table and order it by rank (1 = selected)
  df_ranking = pd.DataFrame({'Features': nomes_features, 'Ranking': ranking})
  df_ranking = df_ranking.sort_values(by='Ranking')

  print(df_ranking)

In [11]:
# Feature ranking from the RFE fitted with logistic regression
mostrar_ranking(rfe_method, X_train)

      Features  Ranking
0         Size        1
2    Sweetness        1
4    Juiciness        2
1       Weight        3
6      Acidity        4
5     Ripeness        5
3  Crunchiness        6


In [12]:
# Helper to evaluate classification performance
def performance_classificacao(modelo, X_test, y_test):
  """Return the F1 score of ``modelo`` on a held-out split.

  Args:
    modelo: Fitted object exposing ``predict(X)``.
    X_test: Features of the evaluation split.
    y_test: True labels of the evaluation split.

  Returns:
    The value of ``f1_score(y_test, predictions)``.
  """
  # Predict on the test features and score against the true labels in one step
  return f1_score(y_test, modelo.predict(X_test))

In [13]:
# F1 score on the test set for the logistic regression wrapped in RFE
performance_classificacao(rfe_method, X_test, y_test)

0.697171381031614

### Treinar modelo sem RFE

In [14]:
# Train a baseline logistic regression on all features (no RFE)
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

In [15]:
# Evaluate the baseline model: F1 score on the test set
performance_classificacao(model_lr, X_test, y_test)

0.7787903893951947

### Treinar modelo de regressão logística com RFECV

In [16]:
# Train logistic regression with RFECV (recursive feature elimination with cross-validation).
# Settings: keep at least 2 features, 5-fold CV, candidates scored with weighted F1.
# NOTE(review): selection is scored with 'f1_weighted' while performance_classificacao
# reports plain f1_score — confirm the mismatch is intended.
rfe_method_cv = RFECV(estimator=LogisticRegression(), min_features_to_select=2, cv=5, scoring='f1_weighted')
rfe_method_cv.fit(X_train, y_train)

In [17]:
# F1 score on the test set for the RFECV-fitted selector
performance_classificacao(rfe_method_cv, X_test, y_test)

0.7767634854771784

In [18]:
# Names of the features kept by RFECV (boolean support mask applied to the column index)
X_train.columns[rfe_method_cv.get_support()]

Index(['Size', 'Weight', 'Sweetness', 'Juiciness', 'Ripeness', 'Acidity'], dtype='object')

In [19]:
# How many features RFECV selected
rfe_method_cv.n_features_

6

### Treinar modelo de regressão logística com SelectFromModel

In [20]:
# SelectFromModel configured with the logistic regression `model_lr`,
# at most 5 features and a threshold of 0.1, then fitted on the training split
sfm_method = SelectFromModel(estimator=model_lr, max_features=5, threshold=0.1)
sfm_method.fit(X_train, y_train)

In [21]:
# Names of the features kept by SelectFromModel (boolean support mask applied to the column index)
X_train.columns[sfm_method.get_support()]

Index(['Size', 'Weight', 'Sweetness', 'Juiciness', 'Acidity'], dtype='object')

In [22]:
# Train the model using only the selected features
# NOTE(review): this refits `model_lr` in place on the reduced feature set, replacing the
# all-features baseline trained earlier; re-running the baseline evaluation cell afterwards
# would pass it the full X_test — consider a separate variable for this model.
X_train_ajustado_class = sfm_method.transform(X_train)
X_test_ajustado_class = sfm_method.transform(X_test)
model_lr.fit(X_train_ajustado_class, y_train)

In [23]:
# F1 score on the test set for the model trained on the SelectFromModel features
performance_classificacao(model_lr, X_test_ajustado_class, y_test)

0.7738193869096934