In [5]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def train_svm_model(file_path, column_names, target_column, one_hot_columns=None, threshold=None, cv_folds=5, random_state=42):
    """
    Train an SVM model on a given dataset using cross-validation and display its performance metrics.
    
    Parameters:
        file_path (str): Path to the dataset file.
        column_names (list): List of column names for the dataset.
        target_column (str): Name of the target column.
        one_hot_columns (list): List of column names to one-hot encode. Default is None.
        threshold (int or float): Threshold to convert target column into binary classes. Default is None.
        cv_folds (int): Number of cross-validation folds. Default is 5.
        random_state (int): Random seed for reproducibility. Default is 42.
    
    Returns:
        None
    """
    # Load dataset
    data = pd.read_csv(file_path, header=None, names=column_names)
    
    # Preprocessing: Convert target column to binary if threshold is provided
    if threshold is not None:
        data[target_column] = data[target_column] > threshold
    
    # Preprocessing: One-hot encode specified columns
    if one_hot_columns:
        data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)
    
    # Split data into features (X) and target (y)
    X = data.drop(columns=[target_column])  
    y = data[target_column] 
    
    # Cross-validation
    svm_model = SVC(kernel="rbf", probability=True, random_state=random_state)
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    
    # Calculate cross-validation scores
    accuracy_scores = cross_val_score(svm_model, X, y, cv=cv, scoring='accuracy')
    precision_scores = cross_val_score(svm_model, X, y, cv=cv, scoring='precision_weighted')
    recall_scores = cross_val_score(svm_model, X, y, cv=cv, scoring='recall_weighted')
    f1_scores = cross_val_score(svm_model, X, y, cv=cv, scoring='f1_weighted')
    
    # Display metrics
    print(f"Cross-Validation Results ({cv_folds}-folds):")
    print(f"Accuracy: {accuracy_scores.mean():.2f} ± {accuracy_scores.std():.2f}")
    print(f"Precision: {precision_scores.mean():.2f} ± {precision_scores.std():.2f}")
    print(f"Recall: {recall_scores.mean():.2f} ± {recall_scores.std():.2f}")
    print(f"F1 Score: {f1_scores.mean():.2f} ± {f1_scores.std():.2f}")


In [6]:
file_path = "../dataset abalone/abalone.data"
column_names = ["Sex", "Length", "Diameter", "Height", "Whole weight",
                "Shucked weight", "Viscera weight", "Shell weight", "Rings"]
target_column = "Rings"
one_hot_columns = ["Sex"]
threshold = 10

train_svm_model(file_path, column_names, target_column, one_hot_columns, threshold)

Cross-Validation Results (5-folds):
Accuracy: 0.78 ± 0.01
Precision: 0.77 ± 0.01
Recall: 0.78 ± 0.01
F1 Score: 0.77 ± 0.01


In [7]:
file_path = "../dataset balance+scale/balance-scale.data"
column_names = ["Class", "Left-Weight", "Left-Distance", "Right-Weight", "Right-Distance"]
target_column = "Class"
one_hot_columns = None
threshold = None

train_svm_model(file_path, column_names, target_column, one_hot_columns, threshold)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross-Validation Results (5-folds):
Accuracy: 0.90 ± 0.01
Precision: 0.83 ± 0.01
Recall: 0.90 ± 0.01
F1 Score: 0.87 ± 0.01


In [8]:
file_path = "../dataset breast+cancer/breast-cancer.data"
column_names = ["Class", "Age", "Menopause", "Tumor-Size", "Inv-Nodes", "Node-Caps",
                "Deg-Malig", "Breast", "Breast-Quad", "Irradiat"]
target_column = "Class"
one_hot_columns=["Age", "Menopause", "Tumor-Size", "Inv-Nodes", "Node-Caps", "Breast", "Breast-Quad", "Irradiat"]
threshold = None

train_svm_model(file_path, column_names, target_column, one_hot_columns, threshold)

Cross-Validation Results (5-folds):
Accuracy: 0.72 ± 0.04
Precision: 0.71 ± 0.05
Recall: 0.72 ± 0.04
F1 Score: 0.68 ± 0.04


In [9]:
file_path = "../dataset iris/iris.data"
column_names = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Class"]
target_column = "Class"
one_hot_columns = None
threshold = None

train_svm_model(file_path, column_names, target_column, one_hot_columns, threshold)

Cross-Validation Results (5-folds):
Accuracy: 0.97 ± 0.03
Precision: 0.97 ± 0.03
Recall: 0.97 ± 0.03
F1 Score: 0.97 ± 0.03


In [10]:
file_path = "../dataset wine/wine.data"
column_names = ["Class", "Alcohol", "MalicAcid", "Ash", "AlcalinityOfAsh", "Magnesium",
                "TotalPhenols", "Flavanoids", "NonflavanoidPhenols", "Proanthocyanins",
                "ColorIntensity", "Hue", "OD280_OD315", "Proline"]
target_column = "Class"
one_hot_columns=None
threshold=None

train_svm_model(file_path, column_names, target_column, one_hot_columns, threshold)

Cross-Validation Results (5-folds):
Accuracy: 0.67 ± 0.04
Precision: 0.64 ± 0.08
Recall: 0.67 ± 0.04
F1 Score: 0.65 ± 0.06
