In [1]:
# (Re)load the nb_black IPython extension for this kernel session.
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd

from sklearn.feature_selection import RFE, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

<IPython.core.display.Javascript object>

In [3]:
# Labelled data: the `price_range` column is the prediction target.
df = pd.read_csv("data/train.csv")

# test.csv comes without labels, so predictions on it could not be scored.
# Evaluation therefore uses a train/test split of train.csv instead.
test = pd.read_csv("data/test.csv")

# Separate the target from the predictor columns.
y = df["price_range"]
X = df.drop("price_range", axis="columns")

<IPython.core.display.Javascript object>

In [4]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

<IPython.core.display.Javascript object>

In [5]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<IPython.core.display.Javascript object>

## RFE with RandomForest

(using feature importance)

In [6]:
# Recursive feature elimination driven by a random forest, keeping 3 features.
# The forest is seeded so the selected subset is reproducible across runs,
# consistent with the seeded forest used in the RFE pipeline further down.
selector = RFE(RandomForestClassifier(random_state=42), n_features_to_select=3)
selector.fit(X_train_scaled, y_train)

# Apply the boolean support mask to X_train's columns to recover the names
# of the retained features.
selected_names = X_train.columns[selector.get_support()]
selected = selector.transform(X_train_scaled)

# Preview the retained (scaled) features under their original column names.
rfe_selected_df = pd.DataFrame(selected, columns=selected_names)
rfe_selected_df.head()

Unnamed: 0,battery_power,px_height,ram
0,1.54836,0.275712,-0.580476
1,-1.379535,-0.258762,1.336453
2,-0.010914,-0.285711,-0.657302
3,-1.043621,-0.764042,1.288322
4,0.488416,0.40147,-0.489767


<IPython.core.display.Javascript object>

## RFE with LogisticRegression

(using coefficient magnitude)

In [7]:
# Recursive feature elimination driven by a logistic-regression estimator,
# keeping 3 features. Fit and reduce the training matrix in one call.
selector = RFE(LogisticRegression(), n_features_to_select=3)
selected = selector.fit_transform(X_train_scaled, y_train)

# The fitted support mask marks which of X_train's columns were retained.
selected_names = X_train.columns[selector.support_]

rfe_selected_df = pd.DataFrame(selected, columns=selected_names)
rfe_selected_df.head()

Unnamed: 0,battery_power,px_height,ram
0,1.54836,0.275712,-0.580476
1,-1.379535,-0.258762,1.336453
2,-0.010914,-0.285711,-0.657302
3,-1.043621,-0.764042,1.288322
4,0.488416,0.40147,-0.489767


<IPython.core.display.Javascript object>

## SelectKBest

(using ANOVA)

In [8]:
# Univariate selection: keep the 3 highest-scoring features under
# SelectKBest's default score function (ANOVA F-test, per the section note).
selector = SelectKBest(k=3)
selector.fit(X_train_scaled, y_train)

# Apply the boolean support mask to X_train's columns to recover the names
# of the retained features.
selected_names = X_train.columns[selector.get_support()]
selected = selector.transform(X_train_scaled)

# Stored under its own name: this frame comes from SelectKBest, not RFE, so
# it must not overwrite (or be mistaken for) the RFE result.
skb_selected_df = pd.DataFrame(selected, columns=selected_names)
skb_selected_df.head()

Unnamed: 0,battery_power,px_width,ram
0,1.54836,1.181997,-0.580476
1,-1.379535,-0.556547,1.336453
2,-0.010914,-1.020159,-0.657302
3,-1.043621,0.984962,1.288322
4,0.488416,-0.336332,-0.489767


<IPython.core.display.Javascript object>

### Using these results to predict

In [9]:
# Split the raw (unscaled) X and y again with the same arguments and seed as
# the earlier split; the pipelines below do their own scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

<IPython.core.display.Javascript object>

In [10]:
# Scale -> RFE (seeded random forest, keep 4 features) -> logistic regression.
# `fit` returns the pipeline itself, so construction and fitting are chained.
rfe_pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        (
            "rfe",
            RFE(
                estimator=RandomForestClassifier(random_state=42),
                n_features_to_select=4,
            ),
        ),
        ("classifier", LogisticRegression(random_state=42)),
    ]
).fit(X_train, y_train)

# Boolean mask over the input columns marking the features RFE kept.
rfe_mask = rfe_pipeline.named_steps["rfe"].get_support()
print(f"selected features: {list(X.columns[rfe_mask])}")

# Score on both partitions of the split.
train_score = rfe_pipeline.score(X_train, y_train)
test_score = rfe_pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

selected features: ['battery_power', 'px_height', 'px_width', 'ram']
Train score: 0.959375
Test score: 0.9725


<IPython.core.display.Javascript object>

In [11]:
# Scale -> SelectKBest (keep 4 features) -> logistic regression.
skb_pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("selectkbest", SelectKBest(k=4)),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)
skb_pipeline.fit(X_train, y_train)

# Boolean mask over the input columns marking the features SelectKBest kept.
skb_mask = skb_pipeline.named_steps["selectkbest"].get_support()
print(f"selected features: {list(X.columns[skb_mask])}")

# Score on both partitions of the split.
train_score = skb_pipeline.score(X_train, y_train)
test_score = skb_pipeline.score(X_test, y_test)

print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

selected features: ['battery_power', 'px_height', 'px_width', 'ram']
Train score: 0.959375
Test score: 0.9725


<IPython.core.display.Javascript object>