In [1]:
# FAZENDO IMPORTS

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from google.colab import drive

In [2]:
#  LOADING THE DATASET
# Loading the real CSV from Google Drive is currently disabled (both lines are
# commented out); the notebook runs on the mock dataset generated further down.

#drive.mount('/content/drive')
#data = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/Classification/Final_Grads_SJCU.csv')


In [3]:
#   RANDOMNESS CONTROL
# One fixed seed for the whole notebook: it is passed as random_state to the
# train/test split, the stratified folds and the SVC in the cells below.

seed = 42
# Also seed NumPy's global RNG, which np.random.rand uses to build the mock data.
np.random.seed(seed)

In [4]:
#  GENERATING A MOCK DATASET

num_samples = 100

# Three random features per sample, drawn uniformly from [0, 1).
features = np.random.rand(num_samples, 3)

# Labelling rule, combining each row i with the row before it:
#   f1[i] + f1[i-1] - f2[i] - f2[i-1] + f3[i] + f3[i-1] > 1   ->  class 1, else 0
#
# np.roll shifts the rows down by one, so prev_rows[i] == features[i - 1].
# Row 0 is therefore paired with the LAST row (wrap-around), which is exactly
# what the negative index features[-1] produces in an explicit index loop.
prev_rows = np.roll(features, 1, axis=0)
rule_score = (
    features[:, 0] + prev_rows[:, 0]
    - features[:, 1] - prev_rows[:, 1]
    + features[:, 2] + prev_rows[:, 2]
)

# Float 0.0/1.0 label vector, one entry per sample.
HANS = (rule_score > 1).astype(float)

# Feature table plus the integer 0/1 target column.
data = pd.DataFrame(features, columns=['feature1', 'feature2', 'feature3'])
data['HANS'] = HANS.astype(int)


In [5]:
#  SPLITTING THE DATA INTO FEATURES AND TARGET

# Model inputs: every column except the 'HANS' label.
data_features = data.drop('HANS', axis=1)

# Classification target: the 'HANS' column on its own.
data_target = data['HANS']

# Preview of the first rows. It must be the cell's last expression for Jupyter
# to render it; as a bare statement earlier in the cell its result was
# silently discarded.
data.head()

In [6]:
#   TRAINING - PREDICTION - METRICS
#   Single hold-out split: 80% of the rows for training, 20% for testing

print('Direct training')

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_target,
    test_size=0.2,
    random_state=seed,
)

# Linear-kernel SVM. class_weight='balanced' asks scikit-learn to weight each
# class inversely to its frequency in the training labels. Other kernels such
# as 'rbf' or 'poly' can be substituted; drop class_weight to compare against
# an unweighted model.
model = SVC(kernel='linear', class_weight='balanced', random_state=seed)

# Train on the 80% part and predict the held-out 20%.
pred_target = model.fit(X_train, y_train).predict(X_test)

# Each metric is printed as a title line followed by its value.
print('Accuracy:', accuracy_score(y_test, pred_target), sep='\n')
print('Classification Report:', classification_report(y_test, pred_target), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, pred_target), sep='\n')

Direct training
Accuracy:
0.75
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.55      0.71        11
           1       0.64      1.00      0.78         9

    accuracy                           0.75        20
   macro avg       0.82      0.77      0.74        20
weighted avg       0.84      0.75      0.74        20

Confusion Matrix:
[[6 5]
 [0 9]]


In [7]:
#   TRAINING - PREDICTION - METRICS
#   Stratified 5-fold cross-validation: every fold trains on 80% and tests on 20% of the rows, with the class proportions kept the same across folds

print('5-fold Stratified Cross Validation')

# Same estimator configuration as the hold-out experiment (other kernels such
# as 'rbf' or 'poly' can be substituted).
model = SVC(kernel='linear', class_weight='balanced', random_state=seed)

# Shuffled, seeded stratified splitter -> the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Out-of-fold ground truth and predictions, pooled over all folds.
true_labels = []
predicted_labels = []

for fold, (train_index, test_index) in enumerate(skf.split(data_features, data_target), start=1):
    # Positional row selection for this fold's train and test partitions.
    X_train = data_features.iloc[train_index]
    X_test = data_features.iloc[test_index]
    y_train = data_target.iloc[train_index]
    y_test = data_target.iloc[test_index]

    # Fit the model on this fold's training part, then predict its test part.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Keep the per-fold results so the overall metrics cover every sample once.
    true_labels.extend(y_test)
    predicted_labels.extend(y_pred)

    print(f"Confusion Matrix Fold {fold}:")
    print(confusion_matrix(y_test, y_pred))

# Metrics over the pooled out-of-fold predictions, each printed as a title
# line followed by its value.
print('Accuracy:', accuracy_score(true_labels, predicted_labels), sep='\n')
print('Classification Report:', classification_report(true_labels, predicted_labels), sep='\n')
print("Confusion Matrix Total:", confusion_matrix(true_labels, predicted_labels), sep='\n')

5-fold Stratified Cross Validation
Confusion Matrix Fold 1:
[[8 3]
 [3 6]]
Confusion Matrix Fold 2:
[[7 4]
 [0 9]]
Confusion Matrix Fold 3:
[[7 3]
 [3 7]]
Confusion Matrix Fold 4:
[[7 3]
 [1 9]]
Confusion Matrix Fold 5:
[[ 9  1]
 [ 0 10]]
Accuracy:
0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.73      0.78        52
           1       0.75      0.85      0.80        48

    accuracy                           0.79       100
   macro avg       0.79      0.79      0.79       100
weighted avg       0.80      0.79      0.79       100

Confusion Matrix Total:
[[38 14]
 [ 7 41]]


In [12]:
#   TRAINING - PREDICTION - METRICS
#   Stratified 5-fold cross-validation (80% train / 20% test per fold, class proportions kept the same across folds), with random oversampling to balance the training data

print('5-fold Stratified Cross Validation')

# Linear-kernel SVM (other kernels such as 'rbf' or 'poly' can be substituted).
# NOTE(review): the oversampler below already brings the training classes to a
# 1:1 ratio, so class_weight='balanced' is presumably redundant here - confirm
# whether it should stay.
model = SVC(kernel='linear',  class_weight='balanced',random_state=seed)
#model = SVC(kernel='linear',  random_state=seed)

# Resample the minority class up to a 1:1 ratio with the majority class.
# random_state is fixed so the duplicated rows are the same on every run;
# without it the sampler draws from NumPy's global RNG and the result depends
# on how many cells were executed before this one.
oversampler = RandomOverSampler(sampling_strategy=1.0, random_state=seed)

# imblearn pipeline: the sampler is applied when fitting, so only the training
# part of each fold is oversampled; predict() goes straight to the classifier.
pipeline = make_pipeline(oversampler, model)

# Shuffled, seeded stratified splitter -> the fold assignment is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Out-of-fold ground truth and predictions, pooled over all folds.
true_labels = []
predicted_labels = []
fold = 0

# Loop over the folds
for train_index, test_index in skf.split(data_features, data_target):
    # Split the data into train and test sets
    X_train, X_test = data_features.iloc[train_index], data_features.iloc[test_index]
    y_train, y_test = data_target.iloc[train_index], data_target.iloc[test_index]

    # Fit the pipeline (oversample + SVM) on the training part, predict the test part
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Append true and predicted labels for this fold
    true_labels.extend(y_test)
    predicted_labels.extend(y_pred)

    fold = fold + 1
    print(f"Confusion Matrix Fold {fold}:")
    print(confusion_matrix(y_test, y_pred))

# Accuracy over the pooled out-of-fold predictions
print('Accuracy:')
print(accuracy_score(true_labels, predicted_labels))

# Per-class precision / recall / F1 over the pooled predictions
print('Classification Report:')
print(classification_report(true_labels, predicted_labels))

# Confusion matrix summed over all folds
print("Confusion Matrix Total:")
print(confusion_matrix(true_labels, predicted_labels))

5-fold Stratified Cross Validation
Confusion Matrix Fold 1:
[[8 3]
 [3 6]]
Confusion Matrix Fold 2:
[[7 4]
 [0 9]]
Confusion Matrix Fold 3:
[[7 3]
 [3 7]]
Confusion Matrix Fold 4:
[[7 3]
 [1 9]]
Confusion Matrix Fold 5:
[[9 1]
 [1 9]]
Accuracy:
0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.73      0.78        52
           1       0.74      0.83      0.78        48

    accuracy                           0.78       100
   macro avg       0.78      0.78      0.78       100
weighted avg       0.79      0.78      0.78       100

Confusion Matrix Total:
[[38 14]
 [ 8 40]]
