In [1]:
import pandas as pd
import sklearn
import numpy as np

#PREPROCESSING
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

#LOADING MODEL
import pickle


from sklearn.feature_selection import SelectFromModel

In [4]:
# Read the feature table to score from an Excel workbook.
# NOTE: change the file name below to point at the workbook you want to score.
all_df = pd.read_excel(
    'test_x.xlsx',
    index_col=False,
)
# Preview the first rows as the cell output.
all_df.head()

Unnamed: 0,ID,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002796,33.7,1,1,1,0,3,1,1,0,...,0.335649,0.335649,2.459148,0.002885,1015285.576,76.59191,0.003979,0.120866,0.014279,0.00355
1,TRG002206,35.0,0,0,0,1,3,3,1,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
2,TRG002753,51.285421,0,0,0,1,3,3,1,0,...,0.366012,0.366012,3.3042,0.006157,2478844.146,95.5941,0.002761,0.043729,0.00169,0.002422
3,TRG002620,61.943874,1,0,0,0,2,1,1,0,...,0.391121,0.391121,3.463022,0.00546,3178794.052,162.425278,0.001675,0.060932,0.003558,0.001466
4,TRG002720,63.238877,0,0,1,0,2,3,1,1,...,0.409377,0.409376,3.305045,0.00173,6755623.084,476.373341,0.000653,0.117835,0.014701,0.000594


In [5]:
# Set the identifier column aside (it is re-attached to the predictions at the
# end) and keep every remaining column as the feature matrix.
ID = all_df.loc[:, "ID"]
X = all_df.drop(columns=['ID'])

In [6]:
# Missing entries are encoded with the sentinel value 999; swap each one for
# the median of its column.
# NOTE(review): fit_transform computes the medians from the file being scored,
# not from stored training statistics -- confirm this is intended.
imputer = SimpleImputer(
    missing_values=999,
    strategy="median",
).set_output(transform="pandas")  # keep DataFrame output (column names, index)
X = imputer.fit_transform(X)

In [7]:
# Replace values lying outside the 1.5*IQR fences of their column with a
# stored per-column median (loaded below from a pickle file).
# NOTE(review): the stored medians presumably come from the training data --
# confirm against the notebook that wrote medians_RFS.pkl.

# Column labels of the imputed feature frame.
columns = X.columns
# Placeholder; rebound just below to the object unpickled from disk.
medians = {}
# Load the stored medians (indexed by column name in replace_outliers_with_median).
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open("medians_RFS.pkl", "rb") as f:
    medians = pickle.load(f)




def replace_outliers_with_median(df, replacement_medians=None):
    """Overwrite IQR outliers in every column of ``df`` with a stored median.

    For each column, the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed from the values currently in ``df``. Every value strictly outside
    the fences is replaced with that column's entry in the medians mapping.
    NaN values compare false against both fences and are therefore left as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to clean. It is modified in place.
    replacement_medians : mapping, optional
        Maps column name -> replacement value. When omitted, the notebook-level
        ``medians`` object is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    None
        The result is written back into ``df``.

    Raises
    ------
    KeyError
        If any column of ``df`` has no entry in the medians mapping. The check
        runs before any column is touched, so ``df`` is never left
        half-modified.
    """
    if replacement_medians is None:
        replacement_medians = medians  # notebook-level mapping loaded from disk

    # Validate up front so a missing entry cannot leave df partially rewritten.
    missing = [column for column in df.columns if column not in replacement_medians]
    if missing:
        raise KeyError(f"No stored median for column(s): {missing}")

    for column in df.columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Vectorised equivalent of the element-wise
        # "median if x < lower or x > upper else x" rule.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        df[column] = values.mask(is_outlier, replacement_medians[column])



# Clean X in place: the function assigns the corrected values back into each
# column of the frame it is given and returns nothing.
replace_outliers_with_median(X)

In [8]:
# One-hot encode the categorical fields so each category gets its own 0/1 column.
categorical_columns = ['ChemoGrade','Proliferation','HistologyType','TumourStage']

# NOTE(review): the encoder is fitted on the file being scored, so the set of
# generated columns depends on which categories appear in this file -- confirm
# it matches the columns used when the downstream selector/model were trained.
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(X[categorical_columns])

# Reuse X's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# RangeIndex here would misalign rows (and inject NaNs) whenever X carries a
# non-default index.
one_hot_df = pd.DataFrame(one_hot_encoded,
                          index=X.index,
                          columns=encoder.get_feature_names_out(categorical_columns))

# X with the categorical columns replaced by their one-hot encoded columns
one_hot_X = pd.concat([X.drop(categorical_columns, axis=1), one_hot_df], axis=1)

In [9]:
# Rescale every feature into the range [0.05, 0.95] before it is fed to the model.
# NOTE(review): fit_transform derives each column's min/max from the file being
# scored rather than from stored training statistics -- confirm this is intended.
scaler = MinMaxScaler(
    feature_range=(0.05,0.95),
    clip=False,
).set_output(transform="pandas")  # keep DataFrame output
Xs = scaler.fit_transform(one_hot_X)

In [10]:
# Expand the scaled features with all degree-2 terms (bias, originals, squares
# and pairwise products) to expose interactions between features.
poly = PolynomialFeatures(degree=2, include_bias=True)
X_poly = poly.fit_transform(Xs)

print("Original number of features:", X.shape[1])
print("Expanded number of features:", X_poly.shape[1]) #Should be 8001

# The next cell selects columns of X_poly by position, so a different width
# would silently pick the wrong features. Fail loudly instead of relying on
# someone reading the printed count.
assert X_poly.shape[1] == 8001, (
    f"Expected 8001 polynomial features but got {X_poly.shape[1]}; "
    "the input columns (e.g. one-hot categories) differ from what the "
    "stored feature selector expects."
)


Original number of features: 118
Expanded number of features: 8001


In [11]:
# Load the previously fitted estimator used to rank the features.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
with open("important_features_RFS.pkl", "rb") as f:
    feature_selecter = pickle.load(f)

# Wrap the already-fitted estimator (prefit=True, so nothing is refitted here)
# and cap the selection at 50 features.
model = SelectFromModel(
    estimator=feature_selecter,
    max_features=50,
    prefit=True,
)

# Positional indices of the columns the selector keeps.
feature_importance = model.get_support(indices=True)

# Keep only those columns of the polynomial feature matrix.
reduced_Xs = X_poly[:, feature_importance]



In [12]:
# Load the stored regression model and predict on the reduced feature matrix.
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
# Note: this rebinds `model`, which held the SelectFromModel wrapper until now.
with open("model_regression.pkl", "rb") as f:
    model = pickle.load(f)

# One prediction per row of reduced_Xs.
y_pred = model.predict(reduced_Xs)

In [13]:
# Pair every ID with its prediction (joined on the row index) and write the
# result as a CSV without header row or index column.
id_frame = ID.to_frame()
prediction_frame = pd.DataFrame(y_pred)
output_df = id_frame.join(prediction_frame)
output_df.to_csv("RFSPrediction.csv", header=False, index=False)