In [34]:
# FAZENDO IMPORTS

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim

In [35]:
#  CARREGANDO BASE DE DADOS

#drive.mount('/content/drive')
#data = pd.read_csv('/content/drive/MyDrive/Machine Learning Project/Regression/Final_Grads_SJCU.csv')

#feature_data = data.iloc[:, :3]
#target_data = data.iloc[:, -1]

#feature_tensor = torch.FloatTensor(feature_data.values)
#target_tensor = torch.FloatTensor(target_data.values).view(-1, 1)  # Ensure target_tensor is a 2D tensor with shape (N, 1)

#print("Feature tensor shape:", feature_tensor.shape)
#print("Target tensor shape:", target_tensor.shape)

In [36]:
#   RANDOMNESS CONTROL
# Seed NumPy's global RNG and PyTorch's global RNG with the same fixed value.

seed = 42  # also passed as random_state to KFold in the cross-validation cells
np.random.seed(seed)  # fixes the np.random.rand draw that builds the mock features
torch.manual_seed(seed)  # returns the torch Generator, which the notebook echoes as this cell's output

<torch._C.Generator at 0x7df2e6fc1210>

In [37]:
#  GENERATING A MOCK DATASET

num_samples = 100

# Random features drawn by np.random.rand: one row per sample, three columns.
features = np.random.rand(num_samples, 3)

# Continuous regression target (not a binary label): each sample combines its
# own row with the previous row as 400 * (f1 - f2 + f3), summed over both rows.
# np.roll shifts the rows down by one, so row 0 is paired with the LAST row --
# the same wrap-around that indexing with i-1 == -1 produces.
previous = np.roll(features, 1, axis=0)
HANS = 400 * (
    features[:, 0] + previous[:, 0]
    - features[:, 1] - previous[:, 1]
    + features[:, 2] + previous[:, 2]
)

# Wrap features and target in DataFrames.
data = pd.DataFrame(features, columns=['feature1', 'feature2', 'feature3'])
target = pd.DataFrame(HANS, columns=['target'])

# Float32 tensors for PyTorch; the target is reshaped to a column of shape (N, 1).
features_tensor = torch.FloatTensor(data.values)
target_tensor = torch.FloatTensor(target.values).view(-1, 1)

# Show the frames followed by the tensor shapes.
for item in (data, target, features_tensor.shape, target_tensor.shape):
    print(item)

    feature1  feature2  feature3
0   0.374540  0.950714  0.731994
1   0.598658  0.156019  0.155995
2   0.058084  0.866176  0.601115
3   0.708073  0.020584  0.969910
4   0.832443  0.212339  0.181825
..       ...       ...       ...
95  0.035942  0.465598  0.542645
96  0.286541  0.590833  0.030500
97  0.037348  0.822601  0.360191
98  0.127061  0.522243  0.769994
99  0.215821  0.622890  0.085347

[100 rows x 3 columns]
        target
0   -66.360892
1   301.781647
2   156.662737
3   580.168166
4   983.730573
..         ...
95  389.148018
96  -64.321147
97 -279.541396
98  -20.100370
99   21.235529

[100 rows x 1 columns]
torch.Size([100, 3])
torch.Size([100, 1])


In [38]:
#  SEPARAÇÃO DOS DADOS

#data.head()

#data_features = data.drop('HANS', axis=1)
#data_target = data['HANS']

#X_train, X_test, y_train, y_test = train_test_split(data_features, data_target, test_size=0.2, random_state=seed)

#print(f"Number of samples: {X_train.shape[0]}")

In [39]:
#   CLASSES

# Define the PyTorch linear regression model
class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a single ``nn.Linear`` layer.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of values predicted per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The attribute name ``linear`` determines the parameter names
        # ("linear.weight", "linear.bias"), so it is kept as is.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the output of the linear layer applied to ``x``."""
        prediction = self.linear(x)
        return prediction

# Custom weighted MSE loss function
class WeightedMSELoss(nn.Module):
    """Mean squared error with a heavier weight on samples above a threshold.

    A sample whose true value is strictly greater than ``threshold`` has its
    squared error multiplied by ``high_weight``; every other sample uses
    ``low_weight``. The weighted squared errors are then averaged.

    Args:
        threshold: cut-off applied to the true values.
        high_weight: weight for samples with ``y_true > threshold``.
        low_weight: weight for all remaining samples.
    """

    def __init__(self, threshold=400, high_weight=5, low_weight=1):
        super().__init__()
        self.threshold, self.high_weight, self.low_weight = (
            threshold,
            high_weight,
            low_weight,
        )

    def forward(self, y_true, y_pred):
        """Return the weighted MSE; note that the true values come first."""
        squared_error = (y_true - y_pred) ** 2
        # Strict comparison: a true value equal to the threshold gets low_weight.
        above_threshold = y_true > self.threshold
        per_sample_weight = torch.where(above_threshold, self.high_weight, self.low_weight)
        return torch.mean(per_sample_weight * squared_error)

In [40]:
#   THRESHOLD RULE USED TO BUILD THE CONFUSION MATRIX

def apply_rule(y_test, y_pred, threshold=400):
    """Binarize true and predicted values against a threshold.

    Args:
        y_test: true values (NumPy array, torch tensor, or sequence of numbers).
        y_pred: predicted values, same kinds of input as ``y_test``.
        threshold: cut-off; values strictly greater than it become 1 and all
            others become 0. Defaults to 400, the previously hard-coded value.

    Returns:
        Tuple ``(new_test, new_preds)`` of integer NumPy arrays holding 0/1,
        each with the same shape as the corresponding input.
    """

    def _as_numpy(values):
        # Torch tensors are detached and moved to the CPU first so that
        # tensors requiring grad (or living on another device) convert cleanly.
        if hasattr(values, "detach"):
            values = values.detach().cpu()
        return np.asarray(values)

    new_test = np.where(_as_numpy(y_test) > threshold, 1, 0)
    new_preds = np.where(_as_numpy(y_pred) > threshold, 1, 0)
    return new_test, new_preds

In [41]:
#   TRAINING - PREDICTION - METRICS
#   5-fold cross-validation (80% train / 20% test per fold) with a Mean Squared Error cost function

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Binarized true / predicted labels pooled over all folds
true_labels = []
predicted_labels = []

num_epochs = 100

# Perform 5-fold cross-validation; `fold` is the 1-based fold number used in the report
for fold, (train_index, test_index) in enumerate(kf.split(data), start=1):
    # Split the data into training and testing sets
    X_train = features_tensor[train_index]
    y_train = target_tensor[train_index]
    X_test = features_tensor[test_index]
    y_test = target_tensor[test_index]

    # A fresh model, loss function, and optimizer for every fold
    model = LinearRegressionModel(input_dim=features_tensor.shape[1], output_dim=1)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Train the model (full-batch gradient descent on the fold's training split)
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        # nn.MSELoss is documented as (input, target); the value is the same
        # in either order, but this follows the API.
        loss = criterion(outputs, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)

    # Turn the continuous values into 0/1 labels; flatten the (N, 1) columns so
    # the pooled lists hold plain scalars instead of 1-element arrays.
    new_test, new_preds = apply_rule(y_test, y_pred)
    new_test = new_test.ravel()
    new_preds = new_preds.ravel()

    # Store the true and predicted labels
    true_labels.extend(new_test)
    predicted_labels.extend(new_preds)

    # Print the confusion matrix for each fold. labels=[0, 1] keeps the matrix
    # 2x2 even when a fold happens to contain a single class.
    print(f"Confusion Matrix Fold {fold}:")
    print(confusion_matrix(new_test, new_preds, labels=[0, 1]))
    print(" ")

# Calculate the accuracy
print('Accuracy:')
print(accuracy_score(true_labels, predicted_labels))

# Generate a classification report
print('Classification Report:')
print(classification_report(true_labels, predicted_labels))

# Print the cross-validation confusion matrix
print("Confusion Matrix Total:")
print(confusion_matrix(true_labels, predicted_labels, labels=[0, 1]))

Epoch 10/100, Loss: 156912.1875
Epoch 20/100, Loss: 117799.015625
Epoch 30/100, Loss: 97705.3046875
Epoch 40/100, Loss: 87044.109375
Epoch 50/100, Loss: 81073.109375
Epoch 60/100, Loss: 77446.0859375
Epoch 70/100, Loss: 75001.796875
Epoch 80/100, Loss: 73165.1015625
Epoch 90/100, Loss: 71650.75
Epoch 100/100, Loss: 70316.8984375
Confusion Matrix Fold 20:
[[8 3]
 [5 4]]
 
Epoch 10/100, Loss: 155419.625
Epoch 20/100, Loss: 116714.5625
Epoch 30/100, Loss: 96868.796875
Epoch 40/100, Loss: 86352.375
Epoch 50/100, Loss: 80463.9765625
Epoch 60/100, Loss: 76883.796875
Epoch 70/100, Loss: 74466.7734375
Epoch 80/100, Loss: 72647.21875
Epoch 90/100, Loss: 71145.2421875
Epoch 100/100, Loss: 69821.96875
Confusion Matrix Fold 40:
[[8 0]
 [6 6]]
 
Epoch 10/100, Loss: 142833.96875
Epoch 20/100, Loss: 108464.3125
Epoch 30/100, Loss: 91249.6875
Epoch 40/100, Loss: 82307.96875
Epoch 50/100, Loss: 77367.1015625
Epoch 60/100, Loss: 74371.9921875
Epoch 70/100, Loss: 72333.65625
Epoch 80/100, Loss: 70775.773

In [42]:
#   TRAINING - PREDICTION - METRICS
#   5-fold cross-validation (80% train / 20% test per fold) with a cost function that prioritizes values greater than 400

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# Binarized true / predicted labels pooled over all folds
true_labels = []
predicted_labels = []

num_epochs = 100

# Perform 5-fold cross-validation; `fold` is the 1-based fold number used in the report
for fold, (train_index, test_index) in enumerate(kf.split(data), start=1):
    # Split the data into training and testing sets
    X_train = features_tensor[train_index]
    y_train = target_tensor[train_index]
    X_test = features_tensor[test_index]
    y_test = target_tensor[test_index]

    # A fresh model, loss function, and optimizer for every fold
    model = LinearRegressionModel(input_dim=features_tensor.shape[1], output_dim=1)
    criterion = WeightedMSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Train the model (full-batch gradient descent on the fold's training split)
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train)
        # WeightedMSELoss.forward takes (y_true, y_pred): the true values must
        # come first because the weights are derived from them.
        loss = criterion(y_train, outputs)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)

    # Turn the continuous values into 0/1 labels; flatten the (N, 1) columns so
    # the pooled lists hold plain scalars instead of 1-element arrays.
    new_test, new_preds = apply_rule(y_test, y_pred)
    new_test = new_test.ravel()
    new_preds = new_preds.ravel()

    # Store the true and predicted labels
    true_labels.extend(new_test)
    predicted_labels.extend(new_preds)

    # Print the confusion matrix for each fold. labels=[0, 1] keeps the matrix
    # 2x2 even when a fold happens to contain a single class.
    print(f"Confusion Matrix Fold {fold}:")
    print(confusion_matrix(new_test, new_preds, labels=[0, 1]))
    print(" ")

# Calculate the accuracy
print('Accuracy:')
print(accuracy_score(true_labels, predicted_labels))

# Generate a classification report
print('Classification Report:')
print(classification_report(true_labels, predicted_labels))

# Print the cross-validation confusion matrix
print("Confusion Matrix Total:")
print(confusion_matrix(true_labels, predicted_labels, labels=[0, 1]))


Epoch 10/100, Loss: 268358.21875
Epoch 20/100, Loss: 160968.375
Epoch 30/100, Loss: 144801.03125
Epoch 40/100, Loss: 138566.5
Epoch 50/100, Loss: 133739.828125
Epoch 60/100, Loss: 129398.546875
Epoch 70/100, Loss: 125417.078125
Epoch 80/100, Loss: 121755.6875
Epoch 90/100, Loss: 118385.953125
Epoch 100/100, Loss: 115282.9140625
Confusion Matrix Fold 20:
[[ 1 10]
 [ 0  9]]
 
Epoch 10/100, Loss: 281622.3125
Epoch 20/100, Loss: 163905.859375
Epoch 30/100, Loss: 144941.6875
Epoch 40/100, Loss: 138573.578125
Epoch 50/100, Loss: 134080.484375
Epoch 60/100, Loss: 130106.734375
Epoch 70/100, Loss: 126461.203125
Epoch 80/100, Loss: 123099.078125
Epoch 90/100, Loss: 119995.5234375
Epoch 100/100, Loss: 117129.671875
Confusion Matrix Fold 40:
[[ 1  7]
 [ 0 12]]
 
Epoch 10/100, Loss: 265193.8125
Epoch 20/100, Loss: 167303.53125
Epoch 30/100, Loss: 151311.109375
Epoch 40/100, Loss: 144797.5
Epoch 50/100, Loss: 139702.625
Epoch 60/100, Loss: 135103.5
Epoch 70/100, Loss: 130870.703125
Epoch 80/100, Lo