# COMP3009 – Assignment 2  
feature selection for regression and classification
Steps:

1. Load the cleaned dataset.
2. Define:
   - pCR, classification target
   - RelapseFreeSurvival, regression target
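3. Split the data into training and test sets  
   80% train / 20% test (stratified on pCR)
4. Apply a filter method on the training set:
   univariate F-test via `SelectKBest` (top 20 features) — `f_classif` for pCR, `f_regression` for RFS
5. Always keep core clinical features:
   Age, ER, PgR, HER2, TrippleNegative, ChemoGrade, Proliferation, Gene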
6. Train simple baseline models:
   - Logistic Regression for pCR
   - Random Forest Regressor for RFS
7. Save the final selected feature lists to:
   - Dataset/SelectedFeaturePCR.csv
   - Dataset/SelectedFeaturesRFS.csv



In [9]:
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import balanced_accuracy_score, mean_absolute_error

# Single seed reused by the train/test split and by every estimator below.
RANDOM_STATE = 42

# Names of the identifier column and of the two outcome columns in the dataset.
ID_COL, PCR_COL, RFS_COL = (
    "ID",
    "pCR (outcome)",
    "RelapseFreeSurvival (outcome)",
)

# Clinical variables that are always retained, whatever the filter method picks.
CORE_CLINICAL_FEATURES = [
    "Age",
    "ER", "PgR", "HER2",
    "TrippleNegative",
    "ChemoGrade", "Proliferation",
    "Gene",
]

In [10]:
# Read the cleaned dataset; the path is relative to the notebook's working directory.
data_path = Path("Dataset/preprocessed_dataset_cleaned.csv")
df = pd.read_csv(filepath_or_buffer=data_path)

# Report (rows, columns), then preview the first five rows as the cell output.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
df.head(n=5)

Shape: (400, 121)


Unnamed: 0,ID,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1.0,144.0,41.0,0.0,0.0,0.0,1.0,3.0,3.0,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0.0,142.0,39.0,1.0,1.0,0.0,0.0,3.0,3.0,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1.0,135.0,31.0,0.0,0.0,0.0,1.0,2.0,1.0,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0.0,12.0,35.0,0.0,0.0,0.0,1.0,3.0,3.0,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0.0,109.0,61.0,1.0,0.0,0.0,0.0,2.0,1.0,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [11]:
# Build the feature matrix and both targets in this cell so the notebook runs
# top-to-bottom on a fresh kernel (no cell above defines X / y_pcr / y_rfs).
# NOTE(review): features are taken to be every column except the ID and the two
# outcome columns -- confirm this matches the intended feature set.
feature_cols = [c for c in df.columns if c not in (ID_COL, PCR_COL, RFS_COL)]
X = df[feature_cols]
y_pcr = df[PCR_COL]
y_rfs = df[RFS_COL]

# One shared 80/20 split for both tasks, stratified on the classification
# target so the pCR class balance is the same in the train and test parts.
X_train, X_test, y_pcr_train, y_pcr_test, y_rfs_train, y_rfs_test = train_test_split(
    X,
    y_pcr,
    y_rfs,
    test_size=0.2,
    stratify=y_pcr,
    random_state=RANDOM_STATE
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

Train size: 320
Test size: 80


In [22]:
# Persist the split so other notebooks can reuse exactly the same rows.
# The directory is given relative to the working directory (like the input
# dataset path) instead of an absolute, machine-specific location, so the
# notebook can be run from any checkout of the project.
split_dir = Path("Dataset") / "testTrainSPlit"
split_dir.mkdir(parents=True, exist_ok=True)

# Feature matrices.
X_train.to_csv(split_dir / "XTrain.csv", index=False)
X_test.to_csv(split_dir / "XTest.csv", index=False)

# Classification target, written as a single-column table named "pCR".
pd.DataFrame({"pCR": y_pcr_train}).to_csv(split_dir / "y_pcr_train.csv", index=False)
pd.DataFrame({"pCR": y_pcr_test}).to_csv(split_dir / "y_pcr_test.csv", index=False)

# Regression target, written as a single-column table named "RFS".
pd.DataFrame({"RFS": y_rfs_train}).to_csv(split_dir / "y_rfs_train.csv", index=False)
pd.DataFrame({"RFS": y_rfs_test}).to_csv(split_dir / "y_rfs_test.csv", index=False)

print("saved")

saved


In [12]:
def get_available_core_features(columns, core_list):
    """Return the entries of ``core_list`` that also occur in ``columns``.

    Parameters
    ----------
    columns : iterable of hashable
        Names that are actually available (e.g. DataFrame column labels).
    core_list : iterable
        Candidate names to keep.

    Returns
    -------
    list
        The matching candidates, in the order they appear in ``core_list``.
    """
    present = frozenset(columns)
    kept = []
    for name in core_list:
        if name in present:
            kept.append(name)
    return kept

# Keep only the core clinical features that really exist in the training
# matrix. The lookup uses X_train.columns rather than a separately maintained
# column list, so this cell runs on a fresh kernel and every returned name is
# guaranteed to be selectable from the training/test frames later on.
available_core = get_available_core_features(X_train.columns, CORE_CLINICAL_FEATURES)

Available core clinical features: ['Age', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'Gene']


In [13]:
# Filter-based selection for the classification target (pCR): univariate
# ANOVA F-test scores, fitted on the training split only.

X_train_num, X_test_num = X_train.copy(), X_test.copy()

# Ask for 20 features, or for every column if the matrix has fewer than 20.
K_PCR = min(20, X_train_num.shape[1])

selector_pcr = SelectKBest(score_func=f_classif, k=K_PCR).fit(X_train_num, y_pcr_train)

# Boolean mask over the columns -> names of the retained columns.
mask_pcr = selector_pcr.get_support()
kbest_features_pcr = X_train_num.columns[mask_pcr].tolist()

print("Top-K features for pCR (from F-test):", kbest_features_pcr, sep="\n")

# Final set = F-test picks plus the always-keep clinical features, sorted by name.
# TODO (original author's note): the union with the core features may need to be removed.
final_features_pcr = sorted(set(kbest_features_pcr).union(available_core))

print("\nFinal selected features for pCR (union of F-test + core clinical):", final_features_pcr, sep="\n")
print("Total pCR features selected:", len(final_features_pcr))

Top-K features for pCR (from F-test):
['ER', 'PgR', 'HER2', 'Proliferation', 'LNStatus', 'TumourStage', 'Gene', 'original_firstorder_10Percentile', 'original_firstorder_Skewness', 'original_firstorder_Variance', 'original_glcm_DifferenceAverage', 'original_glcm_InverseVariance', 'original_glcm_MaximumProbability', 'original_gldm_DependenceEntropy', 'original_gldm_DependenceNonUniformityNormalized', 'original_gldm_LargeDependenceEmphasis', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_glrlm_RunEntropy', 'original_glrlm_RunLengthNonUniformityNormalized', 'original_glrlm_RunPercentage']

Final selected features for pCR (union of F-test + core clinical):
['Age', 'ChemoGrade', 'ER', 'Gene', 'HER2', 'LNStatus', 'PgR', 'Proliferation', 'TrippleNegative', 'TumourStage', 'original_firstorder_10Percentile', 'original_firstorder_Skewness', 'original_firstorder_Variance', 'original_glcm_DifferenceAverage', 'original_glcm_InverseVariance', 'original_glcm_MaximumProbability', 'orig

In [14]:
# Filter-based selection for the regression target (RFS): univariate
# F-test for regression, fitted on the training split only.

# Ask for 20 features, or for every column if the matrix has fewer than 20.
K_RFS = min(20, X_train_num.shape[1])

selector_rfs = SelectKBest(score_func=f_regression, k=K_RFS).fit(X_train_num, y_rfs_train)

# Boolean mask over the columns -> names of the retained columns.
mask_rfs = selector_rfs.get_support()
kbest_features_rfs = X_train_num.columns[mask_rfs].tolist()

print("Top-K features for RFS (from F-test):", kbest_features_rfs, sep="\n")

# Final set = F-test picks plus the always-keep clinical features, sorted by name.
final_features_rfs = sorted(set(kbest_features_rfs).union(available_core))

print("\nFinal selected features for RFS (union of F-test + core clinical):", final_features_rfs, sep="\n")
print("Total RFS features selected:", len(final_features_rfs))

Top-K features for RFS (from F-test):
['TumourStage', 'Gene', 'original_shape_MajorAxisLength', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterSlice', 'original_shape_Maximum3DDiameter', 'original_shape_MinorAxisLength', 'original_shape_Sphericity', 'original_firstorder_90Percentile', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'original_firstorder_Maximum', 'original_firstorder_MeanAbsoluteDeviation', 'original_firstorder_Range', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_firstorder_RootMeanSquared', 'original_firstorder_Variance', 'original_glcm_ClusterTendency', 'original_gldm_DependenceEntropy', 'original_glszm_ZonePercentage']

Final selected features for RFS (union of F-test + core clinical):
['Age', 'ChemoGrade', 'ER', 'Gene', 'HER2', 'PgR', 'Proliferation', 'TrippleNegative', 'TumourStage', 'original_firstorder_90Percentile', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'origin

In [17]:
def _make_pcr_pipeline():
    """Return a fresh, unfitted pipeline: standardisation followed by L2 logistic regression."""
    classifier = LogisticRegression(
        penalty="l2",
        solver="lbfgs",
        max_iter=5000,
        random_state=RANDOM_STATE,
    )
    return Pipeline([("scaler", StandardScaler()), ("clf", classifier)])


# --- Baseline: every feature ---
pipe_all_pcr = _make_pcr_pipeline().fit(X_train_num, y_pcr_train)
y_pred_all = pipe_all_pcr.predict(X_test_num)
bal_acc_all = balanced_accuracy_score(y_pcr_test, y_pred_all)

print("PCR baseline (all features)")
print(f"Balanced accuracy on test set: {bal_acc_all:.3f}")

# --- Same model restricted to the selected pCR features ---
X_train_pcr_sel = X_train_num.loc[:, final_features_pcr]
X_test_pcr_sel = X_test_num.loc[:, final_features_pcr]

pipe_sel_pcr = _make_pcr_pipeline().fit(X_train_pcr_sel, y_pcr_train)
y_pred_sel = pipe_sel_pcr.predict(X_test_pcr_sel)
bal_acc_sel = balanced_accuracy_score(y_pcr_test, y_pred_sel)

print("PCR with selected features")
print(f"Balanced accuracy on test set: {bal_acc_sel:.3f}")

PCR baseline (all features)
Balanced accuracy on test set: 0.607
PCR with selected features
Balanced accuracy on test set: 0.669


In [18]:
def _make_rfs_forest():
    """Return a fresh, unfitted 500-tree random-forest regressor with the shared seed."""
    return RandomForestRegressor(
        n_estimators=500,
        max_depth=None,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )


# --- Baseline: every feature ---
rf_all = _make_rfs_forest()
rf_all.fit(X_train_num, y_rfs_train)
y_pred_all_rfs = rf_all.predict(X_test_num)
mae_all = mean_absolute_error(y_rfs_test, y_pred_all_rfs)

print("RFS baseline (all features)")
print(f"MAE on test set: {mae_all:.3f}")

# --- Same model restricted to the selected RFS features ---
X_train_rfs_sel = X_train_num.loc[:, final_features_rfs]
X_test_rfs_sel = X_test_num.loc[:, final_features_rfs]

rf_sel = _make_rfs_forest()
rf_sel.fit(X_train_rfs_sel, y_rfs_train)
y_pred_sel_rfs = rf_sel.predict(X_test_rfs_sel)
mae_sel = mean_absolute_error(y_rfs_test, y_pred_sel_rfs)

print("RFS with selected features")
print(f"MAE on test set: {mae_sel:.3f}")

RFS baseline (all features)
MAE on test set: 18.824
RFS with selected features
MAE on test set: 18.781


In [20]:
# Output locations for the two feature lists (relative to the working directory).
pcr_feat_path = "Dataset/SelectedFeaturePCR.csv"
rfs_feat_path = "Dataset/SelectedFeaturesRFS.csv"

# Each list is written as a one-column CSV with the header "feature".
for selected_names, out_path in (
    (final_features_pcr, pcr_feat_path),
    (final_features_rfs, rfs_feat_path),
):
    pd.Series(selected_names, name="feature").to_csv(out_path, index=False)

print("Saved pCR feature list to:", pcr_feat_path)
print("Saved RFS feature list to:", rfs_feat_path)

Saved pCR feature list to: Dataset/SelectedFeaturePCR.csv
Saved RFS feature list to: Dataset/SelectedFeaturesRFS.csv
