In [1]:
import pandas as pd
import sklearn
import numpy as np

#PREPROCESSING
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

#LOADING MODEL
import pickle


from sklearn.feature_selection import SelectFromModel

In [2]:
#Read the test dataset from the Excel workbook
TEST_DATASET_PATH = 'FinalTestDataset2024.xls'  #point this at the file to score
all_df = pd.read_excel(TEST_DATASET_PATH, index_col=False)
all_df.head()  #last expression of the cell: rendered as the preview table

Unnamed: 0,ID,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002219,47.0,1,1,0,0,3,2,1,1,...,0.49835,0.49835,3.144594,0.003447,8257693.277,150.048587,0.001753,0.03711,0.001369,0.001513
1,TRG002222,41.0,1,1,0,0,3,2,1,0,...,0.622381,0.622381,2.061654,0.006535,1568441.643,26.484938,0.009649,0.019352,0.000321,0.008285
2,TRG002223,53.0,0,0,0,1,2,1,1,1,...,0.412482,0.412482,3.440353,0.005391,2656924.827,174.606929,0.001594,0.075152,0.005255,0.001444
3,TRG002235,46.0,1,1,0,0,2,1,1,1,...,0.378333,0.378333,3.531715,0.007102,1714787.173,96.787378,0.002772,0.053377,0.002666,0.002406
4,TRG002240,39.0,0,0,1,0,2,2,1,1,...,0.524767,0.524767,2.186214,0.007896,510479.346,12.789071,0.020072,0.02314,0.000463,0.017172


In [3]:
#Keep the sample identifiers aside so they can be written next to the predictions,
#then build the feature frame from every remaining column.
#NOTE(review): the preview of all_df also shows an 'Unnamed: 0' column; only 'ID' is
#dropped here, so that column stays in X as a feature - confirm this is intended.
ID = all_df["ID"]
X = all_df.drop(columns=["ID"])

In [4]:
#Entries equal to the sentinel 999 are treated as missing and replaced by the
#median of their column. The imputer is fitted on this dataset and is configured
#to hand back a DataFrame rather than a NumPy array.
imputer = SimpleImputer(missing_values=999, strategy="median").set_output(transform="pandas")
X = imputer.fit_transform(X)

In [5]:
#Replace values outside the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) with a stored per-column median

columns = X.columns  #NOTE(review): not read again in the visible cells (the function below takes df.columns itself)
medians = {}  #placeholder; overwritten by the pickle load below
#Loading the per-column medians from file (looked up by column name in the function below)
#NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted artifacts
with open("medians_RFS.pkl", "rb") as f:
    medians = pickle.load(f)




def replace_outliers_with_median(df, reference_medians=None):
    """Replace outliers in every column of ``df`` with a reference median, in place.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, with the quartiles computed from the column itself.
    The replacement value is looked up by column name in ``reference_medians``.
    NaN values compare as neither below nor above the fences, so they are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    reference_medians : mapping, optional
        Maps each column name of ``df`` to its replacement value. When omitted,
        the module-level ``medians`` object is used.

    Returns
    -------
    None
        The result is written back into ``df``.

    Raises
    ------
    KeyError
        If any column of ``df`` has no entry in ``reference_medians``. This is
        checked before anything is modified, so ``df`` is never left
        half-processed.
    """
    if reference_medians is None:
        reference_medians = medians

    # Validate up front: failing midway through the loop would leave some
    # columns cleaned and others not.
    missing = [column for column in df.columns if column not in reference_medians]
    if missing:
        raise KeyError(f"No reference median for column(s): {missing}")

    for column in df.columns:
        values = df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Vectorised equivalent of the per-element check
        # `median if x < lower_bound or x > upper_bound else x`.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        df[column] = values.mask(is_outlier, reference_medians[column])



#Cleans X in place: the function assigns back into the frame it is given and returns nothing
replace_outliers_with_median(X)

In [6]:
#Using One-hot encoding to deal with categorical values
categorical_columns = ['ChemoGrade','Proliferation','HistologyType','TumourStage']

#NOTE(review): the encoder is fitted on the data being scored, so the set of dummy
#columns depends on which category values occur in this file. TODO confirm it
#matches the encoding used when the downstream model was trained.
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(X[categorical_columns])

#Reuse X's index: the concat below pairs rows by index label, so a default
#RangeIndex here would misalign rows (and introduce NaNs) if X were indexed differently.
one_hot_df = pd.DataFrame(one_hot_encoded,
                          columns=encoder.get_feature_names_out(categorical_columns),
                          index=X.index)

#X after all categorical values have been one hot encoded
one_hot_X = pd.concat([X.drop(columns=categorical_columns), one_hot_df], axis=1)

In [7]:
#Rescale every column into the range [0.05, 0.95] without clipping; the transformer
#is configured to return a DataFrame.
#NOTE(review): the min/max are fitted on the data being scored - confirm this
#matches the scaling applied at training time.
scaler = MinMaxScaler(feature_range=(0.05, 0.95), clip=False).set_output(transform="pandas")
Xs = scaler.fit_transform(one_hot_X)

In [8]:
#Using polynomial features to find interactions between features
#Degree 2 with a bias column gives (n + 1)(n + 2) / 2 output columns for n input columns
poly = PolynomialFeatures(degree=2, include_bias=True)
X_poly = poly.fit_transform(Xs)

#X here is the frame before one-hot encoding; Xs (after encoding) is what gets expanded
print("Original number of features:", X.shape[1])
#NOTE(review): 8001 corresponds to 125 input columns, but the recorded output of this
#cell is 7875, i.e. 124 input columns - the preprocessing appears to produce one
#column fewer than expected. TODO investigate before trusting the predictions.
print("Expanded number of features:", X_poly.shape[1]) #Should be 8001


Original number of features: 118
Expanded number of features: 7875


In [9]:
#Load the fitted estimator whose feature importances drive the selection
#NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted artifacts
with open("important_features_RFS.pkl", "rb") as f:
    feature_selecter = pickle.load(f)

#The selected column indices are positions in the matrix the estimator was fitted on.
#If X_poly has a different width, those positions refer to different features, so
#fail loudly instead of silently selecting the wrong columns.
expected_n_features = getattr(feature_selecter, "n_features_in_", None)
if expected_n_features is not None and expected_n_features != X_poly.shape[1]:
    raise ValueError(
        f"Feature selector was fitted on {expected_n_features} features, "
        f"but X_poly has {X_poly.shape[1]}; preprocessing does not match training."
    )

#Identify features with best relation (at most 50 are kept)
selector = SelectFromModel(feature_selecter, prefit=True, max_features=50)

#Despite the name, this holds the integer column indices of the selected features
feature_importance = selector.get_support(indices=True)

#reduce to only these features
reduced_Xs = X_poly[:, feature_importance]



In [10]:
#Load the prediction model from disk (presumably a fitted regressor, going by the file name)
#NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted artifacts
with open("model_regression.pkl", "rb") as f:
    model = pickle.load(f)
    
#Run the loaded model on the selected feature columns
y_pred = model.predict(reduced_Xs)

In [11]:
#Pair each ID with its prediction (joined on the row index) and write the result
#as a CSV without index or header row
predictions_df = pd.DataFrame(y_pred)
output_df = ID.to_frame().join(predictions_df)
output_df.to_csv("RFSPrediction.csv", index=False, header=False)