In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

In [None]:
# Fetch the breast-cancer dataset bundled with scikit-learn and pull out
# the feature matrix (X) and the class labels (y) from the returned object.
df = load_breast_cancer()
X, y = df.data, df.target

In [None]:
# Display the dimensions of the feature matrix (rows = samples, columns = features).
X.shape

## Model performance with all 30 features

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the labels
# so both parts keep the class proportions, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Build an entropy-criterion decision tree with a fixed seed and train it on
# the training split. `fit` returns the estimator itself, so the construction
# and the fit can be chained into a single assignment.
model = DecisionTreeClassifier(criterion='entropy', random_state=1).fit(X_train, y_train)

## Model Evaluation

In [None]:
# Predict the class of every held-out row with the fitted tree.
y_pred = model.predict(X_test)
# Accuracy = share of held-out rows whose predicted label matches the true one.
data_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Data Accuracy:", data_accuracy)

In [None]:
# Per-class precision, recall and F1 for the predictions on the test split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        digits=2,
        output_dict=False,
    )
)

# Wrapper method: forward selection

In [None]:
# Perform forward feature selection.
#
# The selector is fitted on the training rows only. Fitting it on the full
# (X, y) lets the held-out test rows influence which features are kept, so the
# test accuracy measured afterwards on the selected subset is optimistically
# biased (data leakage).
#
# The training rows are rebuilt here from the full feature matrix, using the
# same test_size / stratify / random_state as the evaluation splits, so this
# cell does not depend on whatever X_train currently holds.
selection_split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_select_train, y_select_train = selection_split[0], selection_split[2]

# Starting from an empty set, greedily add the feature that most improves the
# cross-validated score until 10 features are selected.
forward_selector = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward')
forward_selector.fit(X_select_train, y_select_train)

# Get the selected features (column indices into X)
selected_features_forward = forward_selector.get_support(indices=True)
print("Selected features (forward selection):", selected_features_forward)

## Model performance with the subset

In [None]:
# Re-split using only the forward-selected columns; same 70/30 stratified,
# seeded split parameters as before.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, selected_features_forward],
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Refit the tree on the current training split and predict the held-out rows.
# `fit` returns the same estimator, so `model` ends up fitted as before.
y_pred = model.fit(X_train, y_train).predict(X_test)
# Share of held-out rows classified correctly.
data_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Data Accuracy:", data_accuracy)

In [None]:
# Per-class precision, recall and F1 for the predictions on the test split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        digits=2,
        output_dict=False,
    )
)

# Wrapper method: backward selection

In [None]:
# Perform backward feature selection.
#
# The selector is fitted on the training rows only. Fitting it on the full
# (X, y) lets the held-out test rows influence which features are kept, so the
# test accuracy measured afterwards on the selected subset is optimistically
# biased (data leakage).
#
# The training rows are rebuilt here from the full feature matrix, using the
# same test_size / stratify / random_state as the evaluation splits. X_train is
# deliberately not reused: it may hold a reduced feature subset from an earlier
# cell rather than all of the original columns.
selection_split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_select_train, y_select_train = selection_split[0], selection_split[2]

# Starting from all features, greedily drop the feature whose removal hurts the
# cross-validated score least until 10 features remain.
backward_selector = SequentialFeatureSelector(model, n_features_to_select=10, direction='backward')
backward_selector.fit(X_select_train, y_select_train)

# Get the selected features (column indices into X)
selected_features_backward = backward_selector.get_support(indices=True)
print("Selected features (backward selection):", selected_features_backward)

## Model performance with the subset

In [None]:
# Re-split using only the backward-selected columns; same 70/30 stratified,
# seeded split parameters as before.
X_train, X_test, y_train, y_test = train_test_split(
    X[:, selected_features_backward],
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Train the tree again, this time on whatever X_train / y_train now contain.
model.fit(X_train, y_train)
# Label the held-out rows with the refitted tree.
y_pred = model.predict(X_test)
# Compare predictions against the true test labels.
data_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Data Accuracy:", data_accuracy)

In [None]:
# Per-class precision, recall and F1 for the predictions on the test split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        digits=2,
        output_dict=False,
    )
)

# Filter methods: feature selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Applying Variance Threshold
#
# The per-feature variances are estimated from the training rows only, and the
# resulting column mask is then applied to every row of X. Estimating them on
# the full X lets the held-out test rows influence which columns are kept.
# The training rows are rebuilt from the full feature matrix with the same
# test_size / stratify / random_state as the evaluation splits, so this cell
# does not depend on whatever X_train currently holds.
selection_split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_select_train = selection_split[0]

# NOTE(review): the threshold is applied to the raw, unscaled features, so
# which columns survive depends on each feature's measurement scale.
selector = VarianceThreshold(threshold=0.2)
selector.fit(X_select_train)
X_variance_filtered = selector.transform(X)

print("Selected features (variance threshold):", selector.get_support(indices=True))
X_variance_filtered.shape

## Model performance with the subset

In [None]:
# Re-split using the columns that passed the variance filter; same 70/30
# stratified, seeded split parameters as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_variance_filtered,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Refit on the current training split and predict the held-out rows in one
# chained call (`fit` hands back the estimator it was called on).
y_pred = model.fit(X_train, y_train).predict(X_test)
# Proportion of correct predictions on the test split.
data_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Data Accuracy:", data_accuracy)

In [None]:
# Per-class precision, recall and F1 for the predictions on the test split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        digits=2,
        output_dict=False,
    )
)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Both the scaler and the chi-squared ranking are fitted on the training rows
# only. Fitting them on the full (X, y) lets the held-out test rows and their
# labels influence which features are kept, which inflates the test accuracy
# measured afterwards (data leakage).
# The training rows are rebuilt from the full feature matrix with the same
# test_size / stratify / random_state as the evaluation splits, so this cell
# does not depend on whatever X_train currently holds.
selection_split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_select_train, y_select_train = selection_split[0], selection_split[2]

# Normalise the data to the range [0, 1] for chi-squared test (chi2 requires
# non-negative inputs). The min/max are learned from the training rows, so
# test rows in X_scaled may fall slightly outside [0, 1]; that is harmless
# because chi2 is only ever computed on the training rows below.
scaler = MinMaxScaler()
scaler.fit(X_select_train)
X_scaled = scaler.transform(X)

# Applying Chi-Squared test: rank features on the scaled training rows, keep
# the 10 best, then apply that column mask to every row.
chi2_selector = SelectKBest(chi2, k=10)
chi2_selector.fit(scaler.transform(X_select_train), y_select_train)
X_chi2_filtered = chi2_selector.transform(X_scaled)

print("Selected features (chi-squared test):", chi2_selector.get_support(indices=True))

## Model performance with the subset

In [None]:
# Re-split using the chi-squared-selected (scaled) columns; same 70/30
# stratified, seeded split parameters as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_chi2_filtered,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [None]:
# Train the tree once more on the current X_train / y_train.
model.fit(X_train, y_train)
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)
# Fraction of held-out rows whose prediction matches the true label.
data_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Data Accuracy:", data_accuracy)

In [None]:
# Per-class precision, recall and F1 for the predictions on the test split.
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        digits=2,
        output_dict=False,
    )
)

# Comment on: 1. Does feature subset selection improve the decision tree model's performance?

# 2. If so, which feature selection method best suits this dataset?