# Lung Cancer

## Get Radiomic Features from LIDC-IDRI

### Imports

In [None]:
import pylidc as pl
from pylidc.utils import consensus
import SimpleITK as sitk
from radiomics import featureextractor
import pandas as pd

In [None]:
# Initialize the feature extractor
extractor = featureextractor.RadiomicsFeatureExtractor()

# Get the list of additional features from pl.annotation_feature_names
additional_features = pl.annotation_feature_names

# Query the LIDC-IDRI dataset for scans with annotations
scans_with_annotations = pl.query(pl.Scan).filter(pl.Scan.annotations.any()).all()

# Lists to store the extracted features and patient IDs
features_list = []

# Variable to create unique IDs for the nodules
nodule_id_counter = 1

# Iterating through all scans with annotations
for scan in scans_with_annotations:
    # Get the patient ID
    patient_id = scan.patient_id

    # Clusterize the annotations for the scan and retrieve all annotations
    nods = scan.cluster_annotations()

    # Iterating through all nodules of the patient
    for anns in nods:
        # Check if the current nodule has annotations
        if anns:
            # Convert consensus annotations into a mask
            cmask, _, _ = pl.utils.consensus(anns, clevel=0.5, pad=[(20, 20), (20, 20), (0, 0)])

            # Convert the pixel array to a SimpleITK image
            image = sitk.GetImageFromArray(cmask.astype(float))

            # Extract radiomic features using PyRadiomics
            features = extractor.execute(image, image, label=1)  # Use label 1 for the nodule

            # Add the patient ID to the features
            features['Patient_ID'] = patient_id

            # Add a unique ID for the nodule
            features['Nodule_ID'] = f'Nodule_{nodule_id_counter}'
            nodule_id_counter += 1

            # Add the additional features to the features dictionary
            for feature_name in additional_features:
                # Access the feature values from the annotation
                if hasattr(anns[0], feature_name):
                    features[feature_name] = getattr(anns[0], feature_name)
                else:
                    # Handle cases where the feature is not available
                    features[feature_name] = None

            # Add the features to the list
            features_list.append(features) 

# Create a DataFrame to store the features
features_df = pd.DataFrame(features_list)

# Save the features to a CSV file
features_df.to_csv('radiomic_features_lidc_all_with_additional.csv', index=False)


## Data Clean-up

In [194]:
df = pd.read_csv('radiomic_features_lidc_all_with_additional.csv')
df

Unnamed: 0,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,diagnostics_Image-original_Hash,diagnostics_Image-original_Dimensionality,diagnostics_Image-original_Spacing,...,Nodule_ID,subtlety,internalStructure,calcification,sphericity,margin,lobulation,spiculation,texture,malignancy
0,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},966db6c8d814983bbfe7125907b2ae5008c7994e,3D,"(1.0, 1.0, 1.0)",...,Nodule_1,4,1,6,4,4,1,2,5,3
1,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},d750f93365c734f6f1972e806b046bb1a203f47f,3D,"(1.0, 1.0, 1.0)",...,Nodule_2,5,1,6,3,4,1,1,5,3
2,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},58d9ee5073ad7215556c1e295a67e9204a623b2d,3D,"(1.0, 1.0, 1.0)",...,Nodule_3,4,1,5,5,5,1,1,5,1
3,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},312913e3e22bc6f2e6cffc8abd1120da5fa413fd,3D,"(1.0, 1.0, 1.0)",...,Nodule_4,5,1,4,3,5,2,3,5,4
4,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},08706c40effcb9f7489dc9ce0139a265b8dc6048,3D,"(1.0, 1.0, 1.0)",...,Nodule_5,3,1,6,5,5,5,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2646,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},db18dcbfdf1ebd1c1390db9dd1fbfd85cce4724b,3D,"(1.0, 1.0, 1.0)",...,Nodule_2647,5,1,6,4,4,2,2,5,4
2647,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},fe63df501e056c7f875ab771151194481c70a0b1,3D,"(1.0, 1.0, 1.0)",...,Nodule_2648,1,1,6,4,1,1,1,1,4
2648,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},e25d0da08cbc45ea3587c9426cc102940176a4de,3D,"(1.0, 1.0, 1.0)",...,Nodule_2649,2,1,6,4,3,1,1,5,3
2649,v3.1.0,1.23.5,2.3.0,1.3.0,3.9.13,"{'minimumROIDimensions': 2, 'minimumROISize': ...",{'Original': {}},5888efc13966913742433819826d00f4d79f0c24,3D,"(1.0, 1.0, 1.0)",...,Nodule_2650,5,1,6,4,5,1,1,5,2


In [195]:
df.isna().sum()

diagnostics_Versions_PyRadiomics    0
diagnostics_Versions_Numpy          0
diagnostics_Versions_SimpleITK      0
diagnostics_Versions_PyWavelet      0
diagnostics_Versions_Python         0
                                   ..
margin                              0
lobulation                          0
spiculation                         0
texture                             0
malignancy                          0
Length: 140, dtype: int64

In [196]:
df.columns.tolist()

['diagnostics_Versions_PyRadiomics',
 'diagnostics_Versions_Numpy',
 'diagnostics_Versions_SimpleITK',
 'diagnostics_Versions_PyWavelet',
 'diagnostics_Versions_Python',
 'diagnostics_Configuration_Settings',
 'diagnostics_Configuration_EnabledImageTypes',
 'diagnostics_Image-original_Hash',
 'diagnostics_Image-original_Dimensionality',
 'diagnostics_Image-original_Spacing',
 'diagnostics_Image-original_Size',
 'diagnostics_Image-original_Mean',
 'diagnostics_Image-original_Minimum',
 'diagnostics_Image-original_Maximum',
 'diagnostics_Mask-original_Hash',
 'diagnostics_Mask-original_Spacing',
 'diagnostics_Mask-original_Size',
 'diagnostics_Mask-original_BoundingBox',
 'diagnostics_Mask-original_VoxelNum',
 'diagnostics_Mask-original_VolumeNum',
 'diagnostics_Mask-original_CenterOfMassIndex',
 'diagnostics_Mask-original_CenterOfMass',
 'original_shape_Elongation',
 'original_shape_Flatness',
 'original_shape_LeastAxisLength',
 'original_shape_MajorAxisLength',
 'original_shape_Maximum

In [197]:
df.nunique().tolist()

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 2651,
 1,
 1,
 1339,
 2334,
 1,
 1,
 2651,
 1,
 1339,
 1856,
 951,
 6,
 2651,
 2651,
 2651,
 2570,
 2570,
 2651,
 325,
 394,
 314,
 658,
 2347,
 2651,
 2650,
 2650,
 2650,
 951,
 1,
 1,
 951,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 951,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2647,
 2483,
 2536,
 2648,
 951,
 1,
 1,
 2641,
 2641,
 2641,
 1,
 2650,
 2650,
 2650,
 1668,
 1,
 1,
 1,
 2651,
 2651,
 2651,
 1,
 2651,
 2651,
 2651,
 2546,
 2651,
 2651,
 2651,
 2651,
 6,
 1,
 1,
 1,
 962,
 962,
 962,
 1,
 7,
 9,
 962,
 962,
 962,
 9,
 951,
 48,
 1,
 1,
 1,
 1,
 1,
 875,
 2651,
 5,
 4,
 6,
 5,
 5,
 5,
 5,
 5,
 5]

In [198]:
unique_value_counts = df.nunique()
columns_with_single_unique_value = unique_value_counts[unique_value_counts == 1].index
print(columns_with_single_unique_value)

Index(['diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy',
       'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet',
       'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings',
       'diagnostics_Configuration_EnabledImageTypes',
       'diagnostics_Image-original_Dimensionality',
       'diagnostics_Image-original_Spacing',
       'diagnostics_Image-original_Minimum',
       'diagnostics_Image-original_Maximum',
       'diagnostics_Mask-original_Spacing', 'original_firstorder_10Percentile',
       'original_firstorder_90Percentile', 'original_firstorder_Entropy',
       'original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis', 'original_firstorder_Maximum',
       'original_firstorder_MeanAbsoluteDeviation', 'original_firstorder_Mean',
       'original_firstorder_Median', 'original_firstorder_Minimum',
       'original_firstorder_Range',
       'original_firstorder_RobustMeanAbsoluteDeviation',
       'original_fir

In [199]:
df.drop(columns=columns_with_single_unique_value,inplace=True)

In [200]:
df.drop(df.columns[0], axis=1, inplace=True)
df.drop(df.columns[0], axis=1, inplace=True)
df.drop(df.columns[1], axis=1, inplace=True)
df.drop(df.columns[1], axis=1, inplace=True)
df.drop(df.columns[1], axis=1, inplace=True)
df.drop(df.columns[3], axis=1, inplace=True)
df.drop(df.columns[3], axis=1, inplace=True)
df.drop(df.columns[3], axis=1, inplace=True)
df.drop("Patient_ID", axis=1, inplace=True)
df.drop("Nodule_ID", axis=1, inplace=True)

In [201]:
df["malignancy"].value_counts()

malignancy
3    989
2    728
1    418
4    301
5    215
Name: count, dtype: int64

In [202]:
df = df[df['malignancy'] != 3]
df.loc[:, 'malignancy'] = df['malignancy'].replace({1: 0, 2: 0})
df.loc[:, 'malignancy'] = df['malignancy'].replace({4: 1, 5: 1})

In [203]:
df["malignancy"].value_counts()

malignancy
0    1146
1     516
Name: count, dtype: int64

In [204]:
df

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,...,original_glszm_ZoneVariance,subtlety,internalStructure,calcification,sphericity,margin,lobulation,spiculation,texture,malignancy
2,0.007535,34,1,0.289526,1.632763,5.639429,6.000000,6.324555,5.099020,6.324555,...,0.000000e+00,4,1,5,5,5,1,1,5,0
3,0.063299,3646,3,0.216692,6.435946,29.700850,37.054015,43.566042,35.227830,49.284886,...,2.949211e+06,5,1,4,3,5,2,3,5,1
4,0.014866,521,1,0.595806,7.374952,12.378110,15.524175,14.142136,12.041595,15.588457,...,0.000000e+00,3,1,6,5,5,5,5,5,1
5,0.006938,15,1,0.000000,0.000000,5.009955,4.000000,5.385165,5.000000,5.385165,...,0.000000e+00,2,1,6,4,4,5,5,5,0
6,0.025858,1629,1,0.261163,6.327166,24.226851,22.090722,31.256999,31.064449,32.264532,...,0.000000e+00,5,1,6,4,5,5,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2644,0.028955,287,1,0.231112,2.892066,12.513718,12.041595,15.231546,15.033296,16.186414,...,0.000000e+00,5,1,3,5,4,2,1,5,0
2646,0.010188,325,1,0.375133,4.534490,12.087678,12.806248,11.401754,13.928388,14.628739,...,0.000000e+00,5,1,6,4,4,2,2,5,1
2647,0.019069,623,1,0.687954,8.589183,12.485120,14.422205,14.317821,12.649111,15.165751,...,0.000000e+00,1,1,6,4,1,1,1,1,1
2649,0.007404,77,1,0.247844,2.471175,9.970706,7.280110,7.211103,7.615773,10.440307,...,0.000000e+00,5,1,6,4,5,1,1,5,0


## Data Analysis

## Imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df.select_dtypes(include=['object'])

In [None]:
numeric_columns = df.select_dtypes(exclude=['object'])
correlation_matrix = numeric_columns.corr()

print(correlation_matrix)

In [None]:

plt.figure(figsize=(15, 13))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.show()

In [None]:
correlation_matrix = numeric_columns.corr()

threshold = 0.7

strong_correlations = correlation_matrix[
    (correlation_matrix > threshold) & (correlation_matrix < 1)
]

print(strong_correlations)

## Classification

### Imports

In [234]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [235]:
df

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,...,original_glszm_ZoneVariance,subtlety,internalStructure,calcification,sphericity,margin,lobulation,spiculation,texture,malignancy
2,0.007535,34,1,0.289526,1.632763,5.639429,6.000000,6.324555,5.099020,6.324555,...,0.000000e+00,4,1,5,5,5,1,1,5,0
3,0.063299,3646,3,0.216692,6.435946,29.700850,37.054015,43.566042,35.227830,49.284886,...,2.949211e+06,5,1,4,3,5,2,3,5,1
4,0.014866,521,1,0.595806,7.374952,12.378110,15.524175,14.142136,12.041595,15.588457,...,0.000000e+00,3,1,6,5,5,5,5,5,1
5,0.006938,15,1,0.000000,0.000000,5.009955,4.000000,5.385165,5.000000,5.385165,...,0.000000e+00,2,1,6,4,4,5,5,5,0
6,0.025858,1629,1,0.261163,6.327166,24.226851,22.090722,31.256999,31.064449,32.264532,...,0.000000e+00,5,1,6,4,5,5,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2644,0.028955,287,1,0.231112,2.892066,12.513718,12.041595,15.231546,15.033296,16.186414,...,0.000000e+00,5,1,3,5,4,2,1,5,0
2646,0.010188,325,1,0.375133,4.534490,12.087678,12.806248,11.401754,13.928388,14.628739,...,0.000000e+00,5,1,6,4,4,2,2,5,1
2647,0.019069,623,1,0.687954,8.589183,12.485120,14.422205,14.317821,12.649111,15.165751,...,0.000000e+00,1,1,6,4,1,1,1,1,1
2649,0.007404,77,1,0.247844,2.471175,9.970706,7.280110,7.211103,7.615773,10.440307,...,0.000000e+00,5,1,6,4,5,1,1,5,0


In [236]:
X = df.iloc[:, :-1]  # Features (all columns except the last one)
y = df.iloc[:, -1]   # Target (the last column)

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [260]:
def testing(test,pred):
    accuracy = accuracy_score(test, pred)
    report = classification_report(test, pred, zero_division=1)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)


#### Random Forest Classifier

In [261]:
from sklearn.ensemble import RandomForestClassifier

In [262]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees (n_estimators)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

In [263]:
testing(y_test,y_pred)

Accuracy: 0.8918918918918919
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.93       239
           1       0.83      0.78      0.80        94

    accuracy                           0.89       333
   macro avg       0.87      0.86      0.86       333
weighted avg       0.89      0.89      0.89       333


#### Support Vector Machines (SVM)

In [264]:
from sklearn.svm import SVR
from sklearn.svm import SVC

#### Naive Bayes

##### GaussianNB

In [265]:
from sklearn.naive_bayes import GaussianNB

In [266]:
gauss = GaussianNB()
gauss.fit(X_train, y_train)
y_pred = gauss.predict(X_test)

In [267]:
testing(y_test,y_pred)

Accuracy: 0.7627627627627628
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.97      0.86       239
           1       0.78      0.22      0.35        94

    accuracy                           0.76       333
   macro avg       0.77      0.60      0.60       333
weighted avg       0.77      0.76      0.71       333


##### MultinomialNB

In [268]:
from sklearn.naive_bayes import MultinomialNB

In [269]:
mult = MultinomialNB()
mult.fit(X_train, y_train)
y_pred = mult.predict(X_test)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [None]:
testing(y_test,y_pred)

##### BernoulliNB

In [270]:
from sklearn.naive_bayes import BernoulliNB
bern = BernoulliNB()
bern.fit(X_train, y_train)
y_pred = bern.predict(X_test)

In [271]:
testing(y_test,y_pred)

Accuracy: 0.7267267267267268
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.99      0.84       239
           1       0.71      0.05      0.10        94

    accuracy                           0.73       333
   macro avg       0.72      0.52      0.47       333
weighted avg       0.72      0.73      0.63       333


#### Logistic Regression

In [272]:
from sklearn.linear_model import LogisticRegression

In [273]:
lregress = LogisticRegression()
lregress.fit(X_train, y_train)
y_pred = lregress.predict(X_test)

In [274]:
testing(y_test,y_pred)

Accuracy: 0.2822822822822823
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00       239
           1       0.28      1.00      0.44        94

    accuracy                           0.28       333
   macro avg       0.64      0.50      0.22       333
weighted avg       0.80      0.28      0.12       333


#### XGBoost/LightGBM (Gradient Boosting)

##### XGBoost

In [275]:
import xgboost as xgb

In [276]:
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [277]:
testing(y_test,y_pred)

Accuracy: 0.8828828828828829
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.92       239
           1       0.80      0.78      0.79        94

    accuracy                           0.88       333
   macro avg       0.86      0.85      0.85       333
weighted avg       0.88      0.88      0.88       333


##### LightGBM

In [278]:
import lightgbm as lgb

In [279]:
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [280]:
testing(y_test,y_pred)

Accuracy: 0.8918918918918919
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.93       239
           1       0.82      0.79      0.80        94

    accuracy                           0.89       333
   macro avg       0.87      0.86      0.86       333
weighted avg       0.89      0.89      0.89       333
