# Supported Vector Machine

# !!!!!!!!! Criar csvs limpos para testar depois !!!!!!!!!!!

# Base Idea: [link to study](https://pubs.aip.org/aip/acp/article-abstract/2655/1/020103/2888254/Classification-of-normal-and-nodule-lung-images?redirectedFrom=fulltext)

This code utilizes a Support Vector Machine (SVM) for classification of data extracted from the LIDC-IDRI dataset.

The `.csv` file employed in this version contains a **clean and analyzed** dataset derived from the raw data using the `pylidc`, `pyradiomics`, and deep feature extraction methods.

The relevant methods can be found in the **csv_cleanup** folder.

## Importing libraries and Datasets

We will begin by importing the relevant and necessary libraries.

In [24]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

In [25]:
%matplotlib inline

Next, we will convert the three datasets into pandas DataFrames for further processing.

In [28]:
df = pd.read_csv('clean_rfe50.csv')
#df = pd.read_csv('semi_clean_cnn.csv')

mask = df['is_cancer'] == 1

# select all rows except the ones that contain 
df = df[~mask]

# Map 'is_cancer' values from 2 to 1
df['is_cancer'] = df['is_cancer'].replace(2, 1)

df.head()

Unnamed: 0,is_cancer,resnet3d_feature_33,resnet3d_feature_41,resnet3d_feature_52,resnet3d_feature_63,resnet3d_feature_64,resnet3d_feature_66,resnet3d_feature_67,resnet3d_feature_68,resnet3d_feature_72,...,original_firstorder_Median,original_firstorder_Minimum,original_firstorder_RobustMeanAbsoluteDeviation,original_firstorder_Skewness,original_firstorder_Variance,original_gldm_DependenceNonUniformityNormalized,original_glrlm_GrayLevelNonUniformity,original_glrlm_LongRunEmphasis,original_glrlm_RunLengthNonUniformity,original_glrlm_ShortRunEmphasis
0,1,0.339816,0.562524,0.442979,0.730204,0.333658,0.491633,0.398934,0.413691,0.399483,...,0.364344,0.011082,0.001734,0.648394,0.005157,0.21098,0.87881,0.15793,0.324954,0.496851
1,1,0.526288,0.736178,0.486639,0.796184,0.340574,0.547593,0.475688,0.532664,0.69038,...,0.161537,0.034874,0.0,0.359984,0.000511,0.465737,0.890961,0.364065,0.24092,0.253756
2,0,0.616598,0.428909,0.104794,0.619367,0.502572,0.125043,0.291206,0.484636,0.376556,...,0.224852,0.000876,0.0,0.610007,0.002463,0.448261,0.874003,0.314858,0.249673,0.428741
3,1,0.395332,0.431717,0.251262,0.73569,0.667631,0.282189,0.237252,0.245101,0.460938,...,0.368496,0.00453,0.004242,0.659629,0.006736,0.313686,0.804823,0.218568,0.269539,0.537671
6,0,0.275632,0.454981,0.361014,0.542304,0.659543,0.325105,0.451855,0.178754,0.195805,...,0.453218,0.00832,0.354365,0.870931,0.072326,0.308885,0.574462,0.183526,0.196528,0.378695


In [29]:
# One hot encoding
df_encoded = df.copy()

# Select only columns with floating-point values
float_columns = df_encoded.select_dtypes(include='float').columns

# Define function to apply one-hot encoding for quantile-based intervals
def one_hot_encode_quantile_intervals(column, quantiles):
    # Get the quantile boundaries for the column
    quantile_values = df_encoded[column].quantile(quantiles).values
    
    # Create an empty DataFrame to store one-hot encoded columns
    encoded_df = pd.DataFrame()
    
    # Create binary columns for each interval based on quantiles
    for i in range(len(quantile_values) - 1):
        low = quantile_values[i]
        high = quantile_values[i + 1]
        encoded_df[f'{column}_interval_{i}'] = df_encoded[column].apply(lambda x: 1 if low <= x < high else 0)
    
    return encoded_df

# Iterate over each floating point column and apply one-hot encoding based on quantile intervals
for column in float_columns:
    # Define quantiles for dividing the data into intervals
    # (You can adjust the quantiles as desired)
    quantiles = [0.0, 0.25, 0.5, 0.75, 1.0]
    
    # Apply one-hot encoding for this column
    encoded_intervals_df = one_hot_encode_quantile_intervals(column, quantiles)
    
    # Drop the original floating-point column and concatenate the new one-hot encoded columns
    df_encoded = df_encoded.drop(columns=[column]).join(encoded_intervals_df)

# Now df_encoded contains one-hot encoded columns for each floating-point column based on quantile intervals.


In [30]:
# Step 2: Extract features (X) and labels (y)
# Assume df contains the feature columns and a label column
X = df.drop(columns=['is_cancer'])  # Drop the label column to get features
y = df['is_cancer']  # Target variable (lung nodule classification)

X_enc = df_encoded.drop(columns=['is_cancer'])  # Drop the label column to get features
y_enc = df_encoded['is_cancer']  # Target variable (lung nodule classification)

### Scatterplot for Analysis

In [31]:
'''
target = 'is_cancer'
features = [col for col in df.columns if col != target]

import seaborn as sns

# Using seaborn's color palette for a visually distinct color scheme
sns.set_palette("Set2")

# Setting up a larger figure and adjusting the number of columns for clarity
n_cols = 2  # Change number of columns to 2 for easier reading
n_rows = (len(features) + 1) // n_cols

plt.figure(figsize=(15, 5 * n_rows))

# Generate scatter plots with enhancements
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.scatter(df[feature], df[target], color='teal', alpha=0.6, edgecolor='k', s=15)
    plt.title(f'{feature} vs {target}', fontsize=12)
    plt.xlabel(feature, fontsize=10)
    plt.ylabel(target, fontsize=10)
    plt.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

plt.tight_layout()
plt.show()'''

'\ntarget = \'is_cancer\'\nfeatures = [col for col in df.columns if col != target]\n\nimport seaborn as sns\n\n# Using seaborn\'s color palette for a visually distinct color scheme\nsns.set_palette("Set2")\n\n# Setting up a larger figure and adjusting the number of columns for clarity\nn_cols = 2  # Change number of columns to 2 for easier reading\nn_rows = (len(features) + 1) // n_cols\n\nplt.figure(figsize=(15, 5 * n_rows))\n\n# Generate scatter plots with enhancements\nfor i, feature in enumerate(features):\n    plt.subplot(n_rows, n_cols, i + 1)\n    plt.scatter(df[feature], df[target], color=\'teal\', alpha=0.6, edgecolor=\'k\', s=15)\n    plt.title(f\'{feature} vs {target}\', fontsize=12)\n    plt.xlabel(feature, fontsize=10)\n    plt.ylabel(target, fontsize=10)\n    plt.grid(True, linestyle=\'--\', linewidth=0.5, alpha=0.7)\n\nplt.tight_layout()\nplt.show()'

In [32]:
# Step 3: Data preprocessing (scaling)
# SVM performs better when features are standardized
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit to the data and then transform it

X_scaled_enc = scaler.fit_transform(X_enc)  # Fit to the data and then transform it

In [33]:

# Step 4: Setting up the SVM model
# We will use a basic SVM with an RBF kernel (commonly used for medical data)
svm_model = SVC(kernel='rbf', C=1, gamma='scale')  # Regularization and kernel hyperparameters


In [34]:

# Step 5: Performing 10-fold cross-validation
# Define KFold with 10 splits
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [35]:
# Cross-validation to get the score for each fold
cv_scores = cross_val_score(svm_model, X_scaled, y, cv=kfold, scoring='accuracy')

# Output the results
print(f"Cross-validation accuracy scores for each fold: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")
print(f"Standard deviation of accuracy: {np.std(cv_scores)}")

print('-------------------------------------------------------')

svm_model = SVC(kernel='rbf', C=0.5, gamma='scale')  # Regularization and kernel hyperparameters
cv_scores = cross_val_score(svm_model, X_scaled_enc, y_enc, cv=kfold, scoring='accuracy')

# Output the results
print(f"Cross-validation accuracy scores for each fold: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores)}")
print(f"Standard deviation of accuracy: {np.std(cv_scores)}")

Cross-validation accuracy scores for each fold: [0.86290323 0.87096774 0.90322581 0.85483871 0.84677419 0.91129032
 0.87096774 0.87903226 0.91056911 0.84552846]
Mean accuracy: 0.875609756097561
Standard deviation of accuracy: 0.02375285646810462
-------------------------------------------------------
Cross-validation accuracy scores for each fold: [0.84677419 0.87903226 0.86290323 0.85483871 0.83870968 0.90322581
 0.84677419 0.87096774 0.91056911 0.84552846]
Mean accuracy: 0.8659323367427222
Standard deviation of accuracy: 0.023672582884425888


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB



# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=700),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'SVM': SVC(),
    'Gauss': GaussianNB()
}

# Step 2: Extract features (X) and labels (y)
# Assume df contains the feature columns and a label column
X = df.drop(columns=['is_cancer'])  # Drop the label column to get features
y = df['is_cancer']  # Target variable (lung nodule classification)

# Step 3: Data preprocessing (scaling)
# SVM performs better when features are standardized
scaler = StandardScaler()

X = scaler.fit_transform(X)  # Fit to the data and then transform it

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{name} Accuracy: {accuracy:.2f}")


Logistic Regression Accuracy: 0.87
Random Forest Accuracy: 0.87


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.87
SVM Accuracy: 0.88
Gauss Accuracy: 0.83
