<a href="https://colab.research.google.com/github/rebeccamess/tm10007_group6/blob/main/TM10007_group6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the course repository; the CSV file that is read further down
# ('TM10007_ML/worcliver/Liver_radiomicFeatures.csv') lives inside this clone.
# The leading '!' hands the line to the shell (notebook syntax, not plain Python).
!git clone https://github.com/jveenland/TM10007_ML.git

# Install the Boruta package that provides the BorutaPy import below
!pip install boruta # use this if boruta is not installed
# Import the needed modules
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, accuracy_score
from boruta import BorutaPy

# Silence FutureWarning only (announcements of upcoming API changes) instead
# of every warning, so that genuine problems reported by the libraries used
# below stay visible.
warnings.filterwarnings("ignore", category=FutureWarning)

# Seed numpy's global random number generator for reproducibility
np.random.seed(42)

# Load the radiomic feature table from the cloned repository
data = pd.read_csv('TM10007_ML/worcliver/Liver_radiomicFeatures.csv')

# Summary of the raw data. Two columns are subtracted from the feature count
# because 'ID' and 'label' are dropped as non-feature columns further down.
print(f'The number of samples: {len(data.index)}')
print(f'The number of features: {len(data.columns)-2}')
print(f'The number of missing data: {data.isnull().values.sum()}')
print(f'The number of zeros in the data: {(data == 0).sum().sum()}')

# Class balance: number of malignant and benign samples in the dataset
count_mal = (data['label'] == 'malignant').sum()
count_ben = (data['label'] == 'benign').sum()
print(f'The number malignent tumors are: {count_mal}')
print(f'The number benign tumors are: {count_ben}')


# Step 1
# Hold out 20% of the raw samples as a test set. The split is stratified on
# the label and uses a fixed random state; the held-out rows are written to
# 'test_df.csv'.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)
test_df.to_csv('test_df.csv')


# Step 2
# Remove features that carry no information. A standard deviation of exactly
# zero means every value in the column is identical (the constant need not be
# zero, despite the wording of the printed messages).
# numeric_only=True restricts the computation to the numeric columns: the
# string columns 'ID' and 'label' have no standard deviation, and newer pandas
# releases raise a TypeError for them instead of silently skipping them.
# NOTE(review): the statistics are computed on the full dataset, i.e. also on
# the samples that later end up in the validation and test sets.
std_colums = data.std(numeric_only=True)
print(f'The number of colums with only zeros: {(std_colums == 0).sum().sum()}')

# Names of the constant columns (compared against 0 explicitly rather than
# relying on 0 == False)
std_colums = std_colums[std_colums == 0].index.tolist()

# Drop the constant columns from the data
pre_processed_data = data.drop(columns=std_colums)

# Standard deviation of every remaining numeric column; this Series is reused
# further down to build the outlier thresholds
std_colums = pre_processed_data.std(numeric_only=True)
print(f'The number of colums with only zeros in the pre_processed data: {(std_colums == 0).sum().sum()}')
print(f'The number of zeros in the pre_processed data: {(pre_processed_data == 0).sum().sum()}')





# Step 3
# Impute missing values (NaN) with the mean of their column.
# NOTE(review): the means are computed on the full dataset, i.e. also on the
# samples that later end up in the validation and test sets.
pre_processed_data_nan = pre_processed_data.copy()

# Mean of every numeric column. DataFrame.mean is called directly with
# numeric_only=True so the result is always one value per numeric column;
# np.mean(DataFrame) reduces over both axes on newer pandas releases and
# would return a single scalar. NaN values are skipped, zeros are NOT
# excluded from the mean.
mean = pre_processed_data_nan.mean(numeric_only=True)

# Boolean frame that is True where a cell equals zero. It is kept because the
# outlier step reuses this frame as its mask.
pre_processed_data_mask = (pre_processed_data == 0)

# Fill every NaN with the mean of its own column. Passing the Series of means
# lets pandas match on column name, so only numeric columns are touched and
# existing values (including genuine 1.0 entries, which a True placeholder
# could be confused with) are never altered.
pre_processed_data = pre_processed_data.fillna(mean)






# Step 4
# Detect outliers and replace them with the mean of their column.
# An outlier is a value that lies at least two standard deviations away from
# the column mean. Mean and standard deviation were computed before any
# replacement, so the outliers themselves contribute to both.

# Upper and lower outlier threshold per column
outlier = std_colums*2 + mean
outlier_low = mean - std_colums*2

for col, value in mean.items():
    column = pre_processed_data[col]

    # True where the value lies on or beyond one of the two thresholds
    is_outlier = (column >= outlier[col]) | (column <= outlier_low[col])
    pre_processed_data_mask[col] = is_outlier

    # Write the column mean straight into the flagged cells. Series.mask
    # replaces exactly the cells where the condition is True, so no NaN/True
    # placeholder round trip is needed and no other value can be hit.
    pre_processed_data[col] = column.mask(is_outlier, value)

# Total number of flagged cells over all columns
count_true = pre_processed_data_mask.sum().sum()

print(f"There are {count_true} outliers in the dataset that are being replaced")
# NOTE(review): the denominator uses the feature count of the original data,
# whereas the outliers are counted after the constant columns were removed -
# confirm that this is the intended reference.
percentage_outliers = (count_true/(len(data.index)*(len(data.columns)-2)))*100
print(f"{percentage_outliers:.2f}% of the data consists of outliers")





# Step 5
# Rescale every numeric feature to the range [0, 100] (min-max scaling).
# NOTE(review): the scaler is fitted on all samples, before the
# train/validation/test split is made further down.

# Only the numeric columns can be scaled
pre_processed_data_numeric = pre_processed_data.select_dtypes(include=[float, int])

scaler = MinMaxScaler(feature_range=(0, 100))

# fit_transform returns a bare array, so the DataFrame is rebuilt with the
# original column names AND the original row index. Without the index the
# frame would get a fresh 0..n-1 index and the concat below would pair rows
# by label, misaligning them whenever the data is not indexed 0..n-1.
df_scaled = pd.DataFrame(
    scaler.fit_transform(pre_processed_data_numeric),
    columns=pre_processed_data_numeric.columns,
    index=pre_processed_data_numeric.index,
)

# Put the non-numeric columns back in front of the scaled features
pre_processed_data = pd.concat([pre_processed_data.select_dtypes(exclude=[float, int]), df_scaled], axis=1)





# Step 6
# Split the pre-processed data into a train, validation and test set and
# separate the features from the target for each of them.


def _features_and_target(frame):
    """Split *frame* into its feature matrix and its binary target.

    The feature matrix is *frame* without the 'ID' and 'label' columns. The
    target encodes 'benign' as 0 and 'malignant' as 1 with an explicit integer
    dtype; any other label value raises an error because it cannot be
    converted to an integer.
    """
    features = frame.drop(['ID', 'label'], axis=1)
    target = frame['label'].map({'benign': 0, 'malignant': 1}).astype(int)
    return features, target


# 80% train+validation / 20% test, stratified on the label of the frame that
# is actually being split
train_df_pre, test_df_over = train_test_split(
    pre_processed_data,
    test_size=0.2,
    random_state=42,
    stratify=pre_processed_data['label'],
)
train_df_pre.to_csv('train_df_pre.csv')

# 75% / 25% of the remainder, i.e. 60% train and 20% validation overall
train_df, valid_df = train_test_split(
    train_df_pre,
    test_size=0.25,
    random_state=42,
    stratify=train_df_pre['label'],
)
train_df.to_csv('train_df.csv')
valid_df.to_csv('valid_df.csv')

# x holds the features, y the 0/1 target. The dropped* names are independent
# copies of the feature frames; they are not used in this script and are only
# kept so that code referring to them keeps working.
# train
x, y = _features_and_target(train_df)
dropped = x.copy()

# validation
x_val, y_val = _features_and_target(valid_df)
dropped_val = x_val.copy()

# test
x_test, y_test = _features_and_target(test_df_over)
dropped_test = x_test.copy()


# Step 7
# Boruta feature selection wrapped around a random forest, followed by
# training that forest on the selected features only.

# Random forest that is handed to Boruta and afterwards used as final model
model = RandomForestRegressor(
    n_estimators=8,
    max_depth=5,
    max_features=23,
    min_samples_split=3,
    random_state=42,
)

# Boruta feature selector built around the forest
feat_selector = BorutaPy(
    estimator=model,
    n_estimators=240,
    perc=85,
    alpha=0.105,
    max_iter=15,
    random_state=42,
    verbose=0,
)

# Fit Boruta on the training data; it is given plain numpy arrays
feat_selector.fit(np.array(x), np.array(y))

# Report every feature that Boruta supports, together with its ranking
print("\n------Support and Ranking for each feature------")
for feature_name, is_supported, rank in zip(
    x.columns, feat_selector.support_, feat_selector.ranking_
):
    if is_supported:
        print("Passes the test: ", feature_name, " - Ranking: ", rank)

# Reduce the train and validation features to the selected columns
X_filtered = feat_selector.transform(np.array(x))
X_filtered_val = feat_selector.transform(np.array(x_val))
print("\n------Selected Features------\n")

# Train the forest on the selected features
model.fit(X_filtered, y)

# Continuous predictions for the train and validation set
predictions = model.predict(X_filtered)
predictions_val = model.predict(X_filtered_val)

# Predictions next to the observed 0/1 labels
df = pd.DataFrame({'pred': predictions, 'observed': y})
df_val = pd.DataFrame({'pred': predictions_val, 'observed': y_val})


# Step 8
# Accuracy on the train and validation set. The continuous model outputs are
# turned into class labels with a 0.5 cut-off (>= 0.5 -> 1, otherwise 0).

# train
true_values = df['observed'].values
predictions = (df['pred'].values >= 0.5).astype(int)
pred_train = accuracy_score(true_values, predictions)
print(f'The accuracy on the train set is: {pred_train}')

# validation
true_values_val = df_val['observed'].values
predictions_val = (df_val['pred'].values >= 0.5).astype(int)
pred_val = accuracy_score(true_values_val, predictions_val)
print(f'The accuracy on the val set is: {pred_val}')

# Step 9
# Accuracy on the held-out test set, using the feature selection and the
# model that were fitted on the training data.

# Keep only the selected features and predict
x_test_filtered = feat_selector.transform(np.array(x_test))
predictions_test = model.predict(x_test_filtered)
df_test = pd.DataFrame({'pred': predictions_test, 'observed': y_test})

# Same 0.5 cut-off as for the other sets: >= 0.5 -> 1, otherwise 0
true_values_test = df_test['observed'].values
predictions_test = (df_test['pred'].values >= 0.5).astype(int)

pred_test = accuracy_score(true_values_test, predictions_test)
print(f'The accuracy on the test set is: {pred_test}')

The number of samples: 186
The number of features: 493
The number of missing data: 0
The number of zeros in the data: 4341
The number malignent tumors are: 94
The number benign tumors are: 92
The number of colums with only zeros: 16
The number of colums with only zeros in the pre_processed data: 0
The number of zeros in the pre_processed data: 3039
There are 3750 outliers in the dataset that are being replaced
4.09% of the data consists of outliers

------Support and Ranking for each feature------
Passes the test:  PREDICT_original_sf_convexity_avg_2.5D  - Ranking:  1
Passes the test:  PREDICT_original_sf_area_avg_2.5D  - Ranking:  1
Passes the test:  PREDICT_original_sf_area_min_2.5D  - Ranking:  1
Passes the test:  PREDICT_original_logf_kurtosis_sigma1  - Ranking:  1
Passes the test:  PREDICT_original_logf_range_sigma1  - Ranking:  1
Passes the test:  PREDICT_original_logf_quartile_range_sigma5  - Ranking:  1
Passes the test:  PREDICT_original_logf_quartile_range_sigma10  - Ranking: 