<a href="https://colab.research.google.com/github/rachelinaputri/data-mini-project/blob/main/10_Fold_CV_(CPB1_7).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, recall_score, f1_score, confusion_matrix

def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=np.unique(y_true)).ravel()
    return tn / (tn + fp)

# Function to preprocess data: split columns, convert data types
def preprocess_data(df):
    df = df[0].str.split(';', expand=True)
    df.columns = df.iloc[0]  # Use the first row as column headers
    df = df.drop(df.index[0])
    features = df.iloc[:, :-1].apply(pd.to_numeric)  # Convert feature columns to numeric
    target = df.iloc[:, -1].astype('category')  # Convert class label to categorical
    return features, target

# Load the data files
cpa_file_paths = ['CPB1.csv', 'CPB2.csv', 'CPB3.csv', 'CPB4.csv', 'CPB5.csv', 'CPB6.csv', 'CPB7.csv']
cpa_dataframes = [pd.read_csv(file, header=None) for file in cpa_file_paths]

# Preprocess the data
cpa_features_targets = [preprocess_data(df) for df in cpa_dataframes]

# Define classifiers
svm_classifier = SVC(kernel='linear', random_state=42)
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Scoring metrics
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'sensitifity': make_scorer(recall_score, average='weighted'),  # Sensitivity
    'specificity': make_scorer(specificity_score),  # Specificity
    'f1': make_scorer(f1_score, average='weighted')
}

# Perform classification and evaluation
results = []
for features, target in cpa_features_targets:
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # SVM results
    svm_results = cross_validate(svm_classifier, features_scaled, target, cv=cv, scoring=scoring_metrics)
    svm_summary = {metric: scores.mean() for metric, scores in svm_results.items() if 'test_' in metric}

    # K-NN results
    knn_results = cross_validate(knn_classifier, features_scaled, target, cv=cv, scoring=scoring_metrics)
    knn_summary = {metric: scores.mean() for metric, scores in knn_results.items() if 'test_' in metric}

    results.append((svm_summary, knn_summary))

# Display results
for result in results:
    print("SVM Results:", result[0])
    print("K-NN Results:", result[1])


SVM Results: {'test_accuracy': 0.875, 'test_sensitifity': 0.875, 'test_specificity': 0.8400000000000001, 'test_f1': 0.8735844280157108}
K-NN Results: {'test_accuracy': 0.86, 'test_sensitifity': 0.86, 'test_specificity': 0.86, 'test_f1': 0.8585468340307484}
SVM Results: {'test_accuracy': 0.8699999999999999, 'test_sensitifity': 0.8699999999999999, 'test_specificity': 0.8200000000000001, 'test_f1': 0.8675597212048956}
K-NN Results: {'test_accuracy': 0.86, 'test_sensitifity': 0.86, 'test_specificity': 0.8099999999999999, 'test_f1': 0.8572019213343752}
SVM Results: {'test_accuracy': 0.8999999999999998, 'test_sensitifity': 0.8999999999999998, 'test_specificity': 0.8400000000000001, 'test_f1': 0.8986584908562968}
K-NN Results: {'test_accuracy': 0.8800000000000001, 'test_sensitifity': 0.8800000000000001, 'test_specificity': 0.8400000000000001, 'test_f1': 0.8790877837855897}
SVM Results: {'test_accuracy': 0.89, 'test_sensitifity': 0.89, 'test_specificity': 0.8400000000000001, 'test_f1': 0.88877