# Supported Vector Machine

# !!!!!!!!! Criar csvs limpos para testar depois !!!!!!!!!!!

# Base Idea: [link to study](https://pubs.aip.org/aip/acp/article-abstract/2655/1/020103/2888254/Classification-of-normal-and-nodule-lung-images?redirectedFrom=fulltext)

This code utilizes a Support Vector Machine (SVM) for classification of data extracted from the LIDC-IDRI dataset.

The `.csv` file employed in this version contains a **clean and analyzed** dataset derived from the raw data using the `pylidc`, `pyradiomics`, and deep feature extraction methods.

The relevant methods can be found in the **csv_cleanup** folder.

## Importing libraries and Datasets

We will begin by importing the relevant and necessary libraries.

In [20]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

Next, we will convert the three datasets into pandas DataFrames for further processing.

In [23]:
#df = pd.read_csv('2d_semiclean.csv')
df = pd.read_csv('semi_clean_cnn.csv')
df.head()

Unnamed: 0,patient_nodule_count,is_cancer,resnet3d_feature_0,resnet3d_feature_1,resnet3d_feature_2,resnet3d_feature_3,resnet3d_feature_4,resnet3d_feature_5,resnet3d_feature_6,resnet3d_feature_7,...,resnet3d_feature_499,resnet3d_feature_500,resnet3d_feature_501,resnet3d_feature_502,resnet3d_feature_503,resnet3d_feature_506,resnet3d_feature_507,resnet3d_feature_508,resnet3d_feature_509,resnet3d_feature_510
0,1,2,1.999915,0.654481,0.010412,0.677332,4.946937,2.85274,1.334359,4.105425,...,0.418391,0.739559,1.52253,0.883103,0.585353,0.0,0.83262,0.063217,0.425889,0.825663
1,1,2,1.722624,0.566323,0.012774,0.905643,4.9225,2.808539,1.638239,4.224512,...,0.43404,0.693717,1.521672,0.84732,0.567338,0.0,0.818854,0.079964,0.41275,0.800591
2,1,0,1.925799,0.541719,0.030587,0.713345,4.636407,2.736021,1.203993,4.424734,...,0.426261,0.686177,1.546877,0.870471,0.601339,0.0,0.824317,0.069473,0.412042,0.795907
3,2,2,1.978363,0.523451,0.0,0.815097,4.655786,2.63535,1.258861,3.846047,...,0.382703,0.763561,1.547989,0.89858,0.598388,0.0,0.81382,0.037082,0.416101,0.844883
4,3,1,2.07902,0.590542,0.030947,0.855208,4.822085,2.612184,1.310962,4.156609,...,0.402608,0.715807,1.545234,0.858355,0.58661,0.0,0.811345,0.071713,0.407306,0.807776


In [24]:
# One hot encoding
df_encoded = df.copy()

# Select only columns with floating-point values
float_columns = df_encoded.select_dtypes(include='float').columns

# Define function to apply one-hot encoding for quantile-based intervals
def one_hot_encode_quantile_intervals(column, quantiles):
    # Get the quantile boundaries for the column
    quantile_values = df_encoded[column].quantile(quantiles).values
    
    # Create an empty DataFrame to store one-hot encoded columns
    encoded_df = pd.DataFrame()
    
    # Create binary columns for each interval based on quantiles
    for i in range(len(quantile_values) - 1):
        low = quantile_values[i]
        high = quantile_values[i + 1]
        encoded_df[f'{column}_interval_{i}'] = df_encoded[column].apply(lambda x: 1 if low <= x < high else 0)
    
    return encoded_df

# Iterate over each floating point column and apply one-hot encoding based on quantile intervals
for column in float_columns:
    # Define quantiles for dividing the data into intervals
    # (You can adjust the quantiles as desired)
    quantiles = [0.0, 0.25, 0.5, 0.75, 1.0]
    
    # Apply one-hot encoding for this column
    encoded_intervals_df = one_hot_encode_quantile_intervals(column, quantiles)
    
    # Drop the original floating-point column and concatenate the new one-hot encoded columns
    df_encoded = df_encoded.drop(columns=[column]).join(encoded_intervals_df)

# Now df_encoded contains one-hot encoded columns for each floating-point column based on quantile intervals.


In [25]:
# Step 2: Extract features (X) and labels (y)
# Assume df contains the feature columns and a label column
X = df.drop(columns=['is_cancer'])  # Drop the label column to get features
y = df['is_cancer']  # Target variable (lung nodule classification)

X_enc = df_encoded.drop(columns=['is_cancer'])  # Drop the label column to get features
y_enc = df_encoded['is_cancer']  # Target variable (lung nodule classification)

In [26]:
# Step 3: Data preprocessing (scaling)
# SVM performs better when features are standardized
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit to the data and then transform it

X_scaled_enc = scaler.fit_transform(X_enc)  # Fit to the data and then transform it

In [27]:

# Step 4: Setting up the SVM model
# We will use a basic SVM with an RBF kernel (commonly used for medical data)
svm_model = SVC(kernel='rbf', C=1, gamma='scale')  # Regularization and kernel hyperparameters


In [28]:

# Step 5: Performing 10-fold cross-validation
# Define KFold with 10 splits
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [29]:
# Cross-validation to get the score for each fold
cv_scores = cross_val_score(svm_model, X_scaled, y, cv=kfold, scoring='accuracy')

# Output the results
print(f"Cross-validation accuracy scores for each fold: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")
print(f"Standard deviation of accuracy: {np.std(cv_scores)}")

print('-------------------------------------------------------')

svm_model = SVC(kernel='rbf', C=1, gamma='scale')  # Regularization and kernel hyperparameters
cv_scores = cross_val_score(svm_model, X_scaled_enc, y_enc, cv=kfold, scoring='accuracy')

# Output the results
print(f"Cross-validation accuracy scores for each fold: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")
print(f"Standard deviation of accuracy: {np.std(cv_scores)}")

Cross-validation accuracy scores for each fold: [0.53231939 0.55513308 0.60076046 0.58555133 0.53231939 0.59695817
 0.54961832 0.58015267 0.5648855  0.57633588]
Mean accuracy: 0.567403419150727
Standard deviation of accuracy: 0.023436815776307553
-------------------------------------------------------
Cross-validation accuracy scores for each fold: [0.54372624 0.5513308  0.58555133 0.58935361 0.51330798 0.56653992
 0.54580153 0.59923664 0.54580153 0.55343511]
Mean accuracy: 0.5594084695091863
Standard deviation of accuracy: 0.024616491099290917
