# Perform feature selection on regression data

## 0. Introduction

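This notebook compares two univariate feature selection methods on a synthetic regression dataset, scoring each by the mean absolute error (MAE) of a linear regression model trained on the selected features:
  1. Correlation feature selection
  2. Mutual information feature selection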

## 1. Correlation feature selection

In [None]:
from functools import partial

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
def get_dataset(n_samples=1000, n_features=100, n_informative=10, noise=0.1, random_state=42):
  """Generate a synthetic regression dataset with `make_regression`.

  Called without arguments, this produces the same dataset as the previously
  hard-coded call (1000 samples, 100 features, 10 informative, noise 0.1,
  seed 42). Every setting can now be overridden by the caller.

  Args:
    n_samples: Number of rows to generate.
    n_features: Total number of feature columns.
    n_informative: Number of features actually used to build the target.
    noise: Standard deviation of the Gaussian noise added to the target.
    random_state: Seed forwarded to `make_regression` for reproducibility.

  Returns:
    Tuple `(X, y)` holding the feature matrix and the target values.
  """
  X, y = make_regression(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    noise=noise,
    random_state=random_state,
  )
  return X, y

In [None]:
def corr_feature_selection(X_train, y_train, X_test, k=88):
  """Keep the `k` features that score highest under `f_regression`.

  The selector is fitted on the training split only and then applied as-is to
  the test split, so the test data never influences which columns are kept.

  Args:
    X_train: Training feature matrix used to fit the selector.
    y_train: Training targets.
    X_test: Test feature matrix; reduced to the columns chosen on `X_train`.
    k: Number of top-scoring features to keep, passed straight to
      `SelectKBest`. Defaults to 88, the value that used to be hard-coded.

  Returns:
    Tuple `(X_train_fs, X_test_fs, fs)`: the reduced training matrix, the
    reduced test matrix, and the fitted `SelectKBest` instance.
  """
  fs = SelectKBest(score_func=f_regression, k=k)
  # Fit and reduce the training split in one call.
  X_train_fs = fs.fit_transform(X_train, y_train)
  X_test_fs = fs.transform(X_test)
  return X_train_fs, X_test_fs, fs


In [None]:
# Build the dataset and hold out 20% of the rows for evaluation.
X, y = get_dataset()
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

# Reduce both splits with the selector fitted on the training split.
X_train_fs, X_test_fs, fs = corr_feature_selection(X_train, y_train, X_test)

# Train a linear model on the selected features and score it on the hold-out rows.
model = LinearRegression().fit(X_train_fs, y_train)
y_preds = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, y_preds)
print(f"MAE: {mae:.5f}")

MAE: 0.08060


## 2. Mutual information feature selection

In [None]:
def mi_feature_selection(X_train, y_train, X_test, k=88, random_state=42):
  """Keep the `k` features that score highest under `mutual_info_regression`.

  The selector is fitted on the training split only and then applied as-is to
  the test split, so the test data never influences which columns are kept.

  Args:
    X_train: Training feature matrix used to fit the selector.
    y_train: Training targets.
    X_test: Test feature matrix; reduced to the columns chosen on `X_train`.
    k: Number of top-scoring features to keep, passed straight to
      `SelectKBest`. Defaults to 88, the value that used to be hard-coded.
    random_state: Seed forwarded to `mutual_info_regression`. Pass `None` to
      restore the previous unseeded behaviour.

  Returns:
    Tuple `(X_train_fs, X_test_fs, fs)`: the reduced training matrix, the
    reduced test matrix, and the fitted `SelectKBest` instance.
  """
  # The mutual information estimator uses random numbers internally, so an
  # unseeded call can rank features differently on every run. Binding the seed
  # here keeps the selected columns (and downstream metrics) reproducible.
  score_func = partial(mutual_info_regression, random_state=random_state)
  fs = SelectKBest(score_func=score_func, k=k)
  X_train_fs = fs.fit_transform(X_train, y_train)
  X_test_fs = fs.transform(X_test)
  return X_train_fs, X_test_fs, fs

In [None]:
# Rebuild the dataset and the same 80/20 split so this cell runs on its own.
X, y = get_dataset()
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

# Reduce both splits with the mutual-information selector fitted on the training split.
X_train_fs, X_test_fs, fs = mi_feature_selection(X_train, y_train, X_test)

# Train a linear model on the selected features and score it on the hold-out rows.
model = LinearRegression().fit(X_train_fs, y_train)
y_preds = model.predict(X_test_fs)
mae = mean_absolute_error(y_test, y_preds)
print(f"MAE: {mae:.5f}")

MAE: 7.45128
