## Cleanup Process

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from copy import deepcopy

### Dataframe Initialization and Preliminary Analysis

In [5]:
# Initializing the pandas dataframe
df = pd.read_csv('meta_info_2d.csv')

# Number of columns for each data type
print("\nNumber of Columns for Each Data Type:")
print(df.dtypes.value_counts())

print('-----------------------------------')

# Only showing columns with missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]  # Filter to show only columns with missing values
print("\nColumns with Missing Values:")
print(missing_values)

print('-----------------------------------')

# Number of unique values per column
print("\nNumber of Unique Values per Column:")
print(df.nunique())

print('-----------------------------------')

df.head(8)


Number of Columns for Each Data Type:
float64    106
int64        4
object       3
bool         1
Name: count, dtype: int64
-----------------------------------

Columns with Missing Values:
Series([], dtype: int64)
-----------------------------------

Number of Unique Values per Column:
patient_id                              872
nodule_no                                23
original_image                         2621
mask_image                             2621
malignancy                                5
                                       ... 
radiomics_original_ngtdm_Busyness         1
radiomics_original_ngtdm_Coarseness       1
radiomics_original_ngtdm_Complexity       1
radiomics_original_ngtdm_Contrast         1
radiomics_original_ngtdm_Strength         1
Length: 114, dtype: int64
-----------------------------------


Unnamed: 0,patient_id,nodule_no,original_image,mask_image,malignancy,is_cancer,is_clean,radiomics_original_shape_Elongation,radiomics_original_shape_Flatness,radiomics_original_shape_LeastAxisLength,...,radiomics_original_glszm_SmallAreaHighGrayLevelEmphasis,radiomics_original_glszm_SmallAreaLowGrayLevelEmphasis,radiomics_original_glszm_ZoneEntropy,radiomics_original_glszm_ZonePercentage,radiomics_original_glszm_ZoneVariance,radiomics_original_ngtdm_Busyness,radiomics_original_ngtdm_Coarseness,radiomics_original_ngtdm_Complexity,radiomics_original_ngtdm_Contrast,radiomics_original_ngtdm_Strength
0,1,0,0001_NI000.npy,0001_MA000.npy,5,True,False,0.972773,0.858091,19.588162,...,3.394066e-08,3.394066e-08,-3.203427e-16,0.000184,0.0,0.0,1000000.0,0.0,0.0,0.0
1,2,0,0002_NI000.npy,0002_MA000.npy,5,True,False,0.795595,0.747403,21.632902,...,4.92321e-09,4.92321e-09,-3.203427e-16,7e-05,0.0,0.0,1000000.0,0.0,0.0,0.0
2,3,0,0003_NI000.npy,0003_MA000.npy,2,False,False,0.768615,0.549141,14.561133,...,1.547565e-07,1.547565e-07,-3.203427e-16,0.000393,0.0,0.0,1000000.0,0.0,0.0,0.0
3,3,1,0003_NI001.npy,0003_MA001.npy,5,True,False,0.836843,0.690356,16.647824,...,9.520109e-08,9.520109e-08,-3.203427e-16,0.000309,0.0,0.0,1000000.0,0.0,0.0,0.0
4,3,2,0003_NI002.npy,0003_MA002.npy,4,True,False,0.66754,0.638922,7.316637,...,1.467976e-05,1.467976e-05,-3.203427e-16,0.003831,0.0,0.0,1000000.0,0.0,0.0,0.0
5,3,3,0003_NI003.npy,0003_MA003.npy,4,True,False,0.875257,0.787447,10.55981,...,2.017691e-06,2.017691e-06,-3.203427e-16,0.00142,0.0,0.0,1000000.0,0.0,0.0,0.0
6,4,0,0004_NI000.npy,0004_MA000.npy,1,False,False,0.689709,0.566921,3.500944,...,0.0001876525,0.0001876525,-3.203427e-16,0.013699,0.0,0.0,1000000.0,0.0,0.0,0.0
7,5,0,0005_NI000.npy,0005_MA000.npy,3,Ambiguous,False,0.864986,0.779798,4.126333,...,0.0001983733,0.0001983733,-3.203427e-16,0.014085,0.0,0.0,1000000.0,0.0,0.0,0.0


A preliminary examination of our dataframe indicates that it contains a significant number of columns, primarily consisting of floating-point values. Currently, there are no missing values; however, we need to conduct a more in-depth analysis to explore the distribution of data, identify potential outliers, examine correlations between variables, and assess overall data quality.

This Jupyter notebook allows you to review all the modifications made to the dataset.

We will now apply various data analysis techniques to assess the relevance and utility of these columns.

### Simple Considerations

First, we will identify and remove obviously irrelevant columns:

 - `nodule_idx` and `patient_id` do not hold significant relevance for our analysis.
 - Since `is_cancer` captures the essential aspect of `malignancy` and will serve as our target variable, we can exclude the `malignancy` column.
 - `original_image` and `mask_image` are simply composed of the names of files used to determine radiomics features, and therefore irrelevant. 

Additionally, we determined that adding a new column to represent the total number of nodules per patient could be informative. This new feature is important as it might indicate whether having multiple nodules increases the likelihood of some being cancerous, or perhaps raises the overall risk.

In [6]:
df.drop("nodule_no", axis=1, inplace=True)
df.drop("malignancy", axis=1, inplace=True)

df.drop("original_image", axis=1, inplace=True)
df.drop("mask_image", axis=1, inplace=True)

# Creates a new column that counts the occurrences of each patient_id
df['patient_nodule_count'] = df.groupby('patient_id').cumcount() + 1
insert_position = df.columns.get_loc('patient_id') +1
# Places new column in front of patient_id
df.insert(insert_position, 'patient_nodule_count', df.pop('patient_nodule_count'))
df.drop("patient_id", axis=1, inplace=True)

df.head()

Unnamed: 0,patient_nodule_count,is_cancer,is_clean,radiomics_original_shape_Elongation,radiomics_original_shape_Flatness,radiomics_original_shape_LeastAxisLength,radiomics_original_shape_MajorAxisLength,radiomics_original_shape_Maximum2DDiameterColumn,radiomics_original_shape_Maximum2DDiameterRow,radiomics_original_shape_Maximum2DDiameterSlice,...,radiomics_original_glszm_SmallAreaHighGrayLevelEmphasis,radiomics_original_glszm_SmallAreaLowGrayLevelEmphasis,radiomics_original_glszm_ZoneEntropy,radiomics_original_glszm_ZonePercentage,radiomics_original_glszm_ZoneVariance,radiomics_original_ngtdm_Busyness,radiomics_original_ngtdm_Coarseness,radiomics_original_ngtdm_Complexity,radiomics_original_ngtdm_Contrast,radiomics_original_ngtdm_Strength
0,1,True,False,0.972773,0.858091,19.588162,22.827597,31.038346,32.075141,26.977005,...,3.394066e-08,3.394066e-08,-3.203427e-16,0.000184,0.0,0.0,1000000.0,0.0,0.0,0.0
1,1,True,False,0.795595,0.747403,21.632902,28.944112,36.659875,30.9453,37.724947,...,4.92321e-09,4.92321e-09,-3.203427e-16,7e-05,0.0,0.0,1000000.0,0.0,0.0,0.0
2,1,False,False,0.768615,0.549141,14.561133,26.5162,26.873199,31.855156,25.552264,...,1.547565e-07,1.547565e-07,-3.203427e-16,0.000393,0.0,0.0,1000000.0,0.0,0.0,0.0
3,2,True,False,0.836843,0.690356,16.647824,24.114835,26.959597,30.922607,24.750701,...,9.520109e-08,9.520109e-08,-3.203427e-16,0.000309,0.0,0.0,1000000.0,0.0,0.0,0.0
4,3,True,False,0.66754,0.638922,7.316637,11.451529,11.145619,12.548361,12.934109,...,1.467976e-05,1.467976e-05,-3.203427e-16,0.003831,0.0,0.0,1000000.0,0.0,0.0,0.0


Next, we will eliminate any columns that contain only a single unique value or duplicate the exact values found in another column.

In [7]:
df = df.loc[:, df.nunique() > 1]
df = df.loc[:,~df.T.duplicated()] # treats columns as lines in order to find duplicates
df.head()

Unnamed: 0,patient_nodule_count,is_cancer,radiomics_original_shape_Elongation,radiomics_original_shape_Flatness,radiomics_original_shape_LeastAxisLength,radiomics_original_shape_MajorAxisLength,radiomics_original_shape_Maximum2DDiameterColumn,radiomics_original_shape_Maximum2DDiameterRow,radiomics_original_shape_Maximum2DDiameterSlice,radiomics_original_shape_Maximum3DDiameter,...,radiomics_original_glrlm_RunVariance,radiomics_original_glrlm_ShortRunEmphasis,radiomics_original_glszm_GrayLevelNonUniformity,radiomics_original_glszm_LargeAreaEmphasis,radiomics_original_glszm_SizeZoneNonUniformity,radiomics_original_glszm_SizeZoneNonUniformityNormalized,radiomics_original_glszm_SmallAreaEmphasis,radiomics_original_glszm_ZoneEntropy,radiomics_original_glszm_ZonePercentage,radiomics_original_glszm_ZoneVariance
0,1,True,0.972773,0.858091,19.588162,22.827597,31.038346,32.075141,26.977005,34.406945,...,30.375465,0.190547,1.0,29463184.0,1.0,1.0,3.394066e-08,-3.203427e-16,0.000184,0.0
1,1,True,0.795595,0.747403,21.632902,28.944112,36.659875,30.9453,37.724947,40.384819,...,51.76955,0.211422,1.0,203119504.0,1.0,1.0,4.92321e-09,-3.203427e-16,7e-05,0.0
2,1,False,0.768615,0.549141,14.561133,26.5162,26.873199,31.855156,25.552264,32.630357,...,16.420147,0.232529,1.0,6461764.0,1.0,1.0,1.547565e-07,-3.203427e-16,0.000393,0.0
3,2,True,0.836843,0.690356,16.647824,24.114835,26.959597,30.922607,24.750701,30.922607,...,16.833843,0.14264,1.0,10504081.0,1.0,1.0,9.520109e-08,-3.203427e-16,0.000309,0.0
4,3,True,0.66754,0.638922,7.316637,11.451529,11.145619,12.548361,12.934109,13.26797,...,2.783696,0.287526,1.0,68121.0,1.0,1.0,1.467976e-05,-3.203427e-16,0.003831,0.0


### Correlation


To assess the degree of correlation between the columns and the target variable, and gain better insight about the feature's importance, we decided to create a new DataFrame. We chose the interval of 0.45 and -0.45 as it effectively captures some correlation, both positive and negative. That interval can be changed below.

However, to facilitate this analysis, we must first convert the `patient_id` and `is_cancer` columns into tangible numerical formats. This conversion is essential because correlation calculations require numeric inputs to assess the relationship between variables accurately. 

In [8]:
# Map is_cancer values to numeric representations
df['is_cancer'] = df['is_cancer'].map({'False': 0, 'Ambiguous': 1, 'True': 2})

df.head()

Unnamed: 0,patient_nodule_count,is_cancer,radiomics_original_shape_Elongation,radiomics_original_shape_Flatness,radiomics_original_shape_LeastAxisLength,radiomics_original_shape_MajorAxisLength,radiomics_original_shape_Maximum2DDiameterColumn,radiomics_original_shape_Maximum2DDiameterRow,radiomics_original_shape_Maximum2DDiameterSlice,radiomics_original_shape_Maximum3DDiameter,...,radiomics_original_glrlm_RunVariance,radiomics_original_glrlm_ShortRunEmphasis,radiomics_original_glszm_GrayLevelNonUniformity,radiomics_original_glszm_LargeAreaEmphasis,radiomics_original_glszm_SizeZoneNonUniformity,radiomics_original_glszm_SizeZoneNonUniformityNormalized,radiomics_original_glszm_SmallAreaEmphasis,radiomics_original_glszm_ZoneEntropy,radiomics_original_glszm_ZonePercentage,radiomics_original_glszm_ZoneVariance
0,1,2,0.972773,0.858091,19.588162,22.827597,31.038346,32.075141,26.977005,34.406945,...,30.375465,0.190547,1.0,29463184.0,1.0,1.0,3.394066e-08,-3.203427e-16,0.000184,0.0
1,1,2,0.795595,0.747403,21.632902,28.944112,36.659875,30.9453,37.724947,40.384819,...,51.76955,0.211422,1.0,203119504.0,1.0,1.0,4.92321e-09,-3.203427e-16,7e-05,0.0
2,1,0,0.768615,0.549141,14.561133,26.5162,26.873199,31.855156,25.552264,32.630357,...,16.420147,0.232529,1.0,6461764.0,1.0,1.0,1.547565e-07,-3.203427e-16,0.000393,0.0
3,2,2,0.836843,0.690356,16.647824,24.114835,26.959597,30.922607,24.750701,30.922607,...,16.833843,0.14264,1.0,10504081.0,1.0,1.0,9.520109e-08,-3.203427e-16,0.000309,0.0
4,3,2,0.66754,0.638922,7.316637,11.451529,11.145619,12.548361,12.934109,13.26797,...,2.783696,0.287526,1.0,68121.0,1.0,1.0,1.467976e-05,-3.203427e-16,0.003831,0.0


In [9]:
df.to_csv('./2d_semiclean.csv', index=False)

We are now able to compute the correlation matrix effectively and gain insights into the relationships within the dataset:

In [10]:
df2 = deepcopy(df)
correlation_matrix = df2.corr()
is_cancer_corr = correlation_matrix['is_cancer'] 

# Identifying strong correlations with 'is_cancer'
strong_corr_with_is_cancer = is_cancer_corr[(is_cancer_corr > 0.45) | (is_cancer_corr < -0.45)] # can be changed later

# Dropping the 'is_cancer' entry from the results
strong_corr_with_is_cancer = strong_corr_with_is_cancer.drop('is_cancer')
print('There are', strong_corr_with_is_cancer.shape[0], 'columns with a correlation in that range.')
strong_corr_with_is_cancer.head(20)

There are 10 columns with a correlation in that range.


radiomics_original_shape_LeastAxisLength            0.506082
radiomics_original_shape_MajorAxisLength            0.470183
radiomics_original_shape_Maximum2DDiameterColumn    0.478529
radiomics_original_shape_Maximum2DDiameterRow       0.508987
radiomics_original_shape_Maximum2DDiameterSlice     0.473662
radiomics_original_shape_Maximum3DDiameter          0.481364
radiomics_original_shape_MinorAxisLength            0.515905
radiomics_original_shape_SurfaceVolumeRatio        -0.482464
radiomics_original_gldm_LargeDependenceEmphasis     0.456971
radiomics_original_glrlm_RunEntropy                 0.494140
Name: is_cancer, dtype: float64

We only want to retain the `patient_nodule_count`, and `is_cancer` columns, along with those that have demonstrated strong correlation values.

In [12]:
# List of columns to keep
columns_to_keep = ['patient_nodule_count', 'is_cancer'] + strong_corr_with_is_cancer.index.tolist()

# New DataFrame with the selected columns
filtered_df = df[columns_to_keep]

filtered_df.head()

Unnamed: 0,patient_nodule_count,is_cancer,radiomics_original_shape_LeastAxisLength,radiomics_original_shape_MajorAxisLength,radiomics_original_shape_Maximum2DDiameterColumn,radiomics_original_shape_Maximum2DDiameterRow,radiomics_original_shape_Maximum2DDiameterSlice,radiomics_original_shape_Maximum3DDiameter,radiomics_original_shape_MinorAxisLength,radiomics_original_shape_SurfaceVolumeRatio,radiomics_original_gldm_LargeDependenceEmphasis,radiomics_original_glrlm_RunEntropy
0,1,2,19.588162,22.827597,31.038346,32.075141,26.977005,34.406945,22.206076,0.368609,541.648121,3.473935
1,1,2,21.632902,28.944112,36.659875,30.9453,37.724947,40.384819,23.027779,0.388414,602.337075,4.231633
2,1,0,14.561133,26.5162,26.873199,31.855156,25.552264,32.630357,20.380759,0.472904,464.245476,3.180046
3,2,2,16.647824,24.114835,26.959597,30.922607,24.750701,30.922607,20.180327,0.361115,529.237581,3.246592
4,3,2,7.316637,11.451529,11.145619,12.548361,12.934109,13.26797,7.644348,0.816013,335.911877,2.094209


In [13]:
filtered_df.to_csv('C:\\Users\\hasht\\Desktop\\Cloned Repositories\\Lung-Cancer-Classification-using-CT\\model_development\\2d_corr.csv', index=False)

### 2. Recursive Feature Elimination (RFE)

 **Recursive Feature Elimination** (RFE) is a feature selection method that works by **recursively eliminating less important features** to help improve model performance and reduce dimensionality. 
    
It involves training a machine learning model, in this case, a **Random Forest Classifier** and using its performance to evaluate the feature's importance. It starts by training the model on the full set of features.
     
After training, the model assigns importance scores to each feature, based on how much they contribute to reduce the impurity or error when used in splits.
     

In [10]:
X = df.drop(columns=['is_cancer'])  # All columns except the target
y = df['is_cancer']  # Target variable

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate RF model
model = RandomForestClassifier(random_state=42)

# Apply RFE to select the top N features
rfe = RFE(estimator=model, n_features_to_select=50)  # Select top N features
X_train_rfe = rfe.fit_transform(X_train, y_train)  # Fit RFE and transform training data
X_test_rfe = rfe.transform(X_test)  # Transform test data using the selected features

# Fit Random Forest model on the selected features
model.fit(X_train_rfe, y_train)

# Predict on the test set
y_pred = model.predict(X_test_rfe)


# Print the selected columns
selected_columns = X.columns[rfe.support_]
print("Selected Features:", selected_columns)


Selected Features: Index(['patient_nodule_count', 'radiomics_original_shape_Elongation',
       'radiomics_original_shape_Flatness',
       'radiomics_original_shape_LeastAxisLength',
       'radiomics_original_shape_MajorAxisLength',
       'radiomics_original_shape_Maximum2DDiameterColumn',
       'radiomics_original_shape_Maximum2DDiameterRow',
       'radiomics_original_shape_Maximum2DDiameterSlice',
       'radiomics_original_shape_Maximum3DDiameter',
       'radiomics_original_shape_MeshVolume',
       'radiomics_original_shape_MinorAxisLength',
       'radiomics_original_shape_Sphericity',
       'radiomics_original_shape_SurfaceArea',
       'radiomics_original_shape_SurfaceVolumeRatio',
       'radiomics_original_shape_VoxelVolume',
       'radiomics_original_firstorder_10Percentile',
       'radiomics_original_firstorder_90Percentile',
       'radiomics_original_firstorder_Energy',
       'radiomics_original_firstorder_InterquartileRange',
       'radiomics_original_firstorde

Then we can **evaluate the performance of the dataset after the RFE**, when the random forest is applied.

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score


# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report (includes Precision, Recall, and F1 Score)
class_report = classification_report(y_test, y_pred, target_names=['False', 'Ambiguous', 'True'])
print("\nClassification Report:")
print(class_report)

# Macro-Averaged F1 Score
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f"Macro-Averaged F1 Score: {f1_macro:.4f}")

# Weighted-Averaged F1 Score
f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(f"Weighted-Averaged F1 Score: {f1_weighted:.4f}")


# ROC-AUC score (One-vs-Rest for multi-class classification)
# We need the probability estimates for calculating ROC-AUC.
y_pred_proba = model.predict_proba(X_test_rfe)
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
print(f"ROC-AUC Score (One-vs-Rest): {roc_auc:.4f}")


Confusion Matrix:
[[ 81  78   5]
 [ 19 218  19]
 [  2  33  74]]

Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.49      0.61       164
   Ambiguous       0.66      0.85      0.75       256
        True       0.76      0.68      0.71       109

    accuracy                           0.71       529
   macro avg       0.74      0.67      0.69       529
weighted avg       0.72      0.71      0.70       529

Macro-Averaged F1 Score: 0.6898
Weighted-Averaged F1 Score: 0.6968
ROC-AUC Score (One-vs-Rest): 0.8360


For reference, the confusion matrix is formatted as shown below.

| Actual \ Predicted | False (0) | Ambiguous (1) | True (2) |
|--------------------|-----------|---------------|----------|
| **False (0)**       | .       | .             | .       |
| **Ambiguous (1)**   | .         | .            | .        |
| **True (2)**        |      .| .            | .        |


