In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported source files are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

A simple case study: we start from a CSV file of wind turbine data.

# Loading Data

In [3]:
import pandas as pd
from matplotlib import pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:
power_data = pd.read_csv("artifacts/power_cleaned_data.csv")

# Width of each Patv class, in the column's own units.
bin_width = 100

# Left-closed bins [0, 100), [100, 200), ... The last edge has to lie strictly
# above the column maximum: with right=False a value equal to the final edge
# falls outside every bin and becomes NaN, which is what used to happen whenever
# the maximum was an exact multiple of the bin width.
bin_edges = range(0, int(power_data["Patv"].max()) + bin_width + 1, bin_width)

# Replace Patv with its integer bin index (labels=False). Values below 0 have no
# bin and come back as NaN.
# NOTE(review): the DataCleaner thresholds applied later in the notebook see
# these bin indices rather than the raw readings — confirm that is intended.
power_data["Patv"] = pd.cut(
    power_data["Patv"], bins=bin_edges, labels=False, right=False
)

power_data.head()

In [None]:
class DataCleaner:
    """Remove abnormal or incomplete rows from a turbine DataFrame.

    The frame must provide the columns ``Patv``, ``Wspd``, ``Wdir``, ``Etmp``,
    ``Itmp``, ``Ndir``, ``Pab1``, ``Pab2`` and ``Pab3``.
    """

    def __init__(self, data):
        """Store the DataFrame that ``clean_data`` will filter."""
        self.data = data

    def clean_data(self):
        """Return the rows that are neither abnormal nor incomplete.

        A row is removed when it matches at least one of the abnormal-condition
        rules below or holds a missing value in any column. ``Patv`` is cast to
        ``int`` in the result.

        The frame passed to the constructor is not modified. ``self.data`` is
        rebound to the cleaned frame, which is also the return value.
        """
        frame = self.data

        # Each entry is a boolean Series flagging one kind of abnormal row.
        conditions = [
            frame["Patv"] < 0,
            (frame["Wspd"] < 1) & (frame["Patv"] > 10),
            (frame["Wspd"] < 2) & (frame["Patv"] > 100),
            (frame["Wspd"] < 3) & (frame["Patv"] > 200),
            (frame["Wspd"] > 2.5) & (frame["Patv"] == 0),
            (frame["Wspd"] == 0)
            & (frame["Wdir"] == 0)
            & (frame["Etmp"] == 0),
            frame["Etmp"] < -21,
            frame["Itmp"] < -21,
            frame["Etmp"] > 60,
            frame["Itmp"] > 70,
            (frame["Wdir"] > 180) | (frame["Wdir"] < -180),
            (frame["Ndir"] > 720) | (frame["Ndir"] < -720),
            (frame["Pab1"] > 89)
            | (frame["Pab2"] > 89)
            | (frame["Pab3"] > 89),
        ]

        # A row is abnormal as soon as any single rule matches.
        abnormal = conditions[0]
        for condition in conditions[1:]:
            abnormal = abnormal | condition

        # Missing values are dropped *before* the integer cast: casting a column
        # that still contains NaN to int raises. Comparisons against NaN are
        # False, so such rows are never flagged above and are removed here.
        complete_rows = frame[~abnormal].dropna()

        # astype returns a new frame, so the caller's DataFrame stays untouched.
        self.data = complete_rows.astype({"Patv": int})
        return self.data


# Create an instance of the DataCleaner class and clean the data
cleaner = DataCleaner(power_data)
cleaned_data = cleaner.clean_data()

# Only the last expression of a cell is rendered automatically, so the preview
# of the cleaned frame has to be displayed explicitly to be visible at all.
display(cleaned_data.head())

# Hold out half of the cleaned rows for testing; the fixed random_state keeps
# the split reproducible across runs.
train, test = train_test_split(cleaned_data, test_size=0.5, random_state=42)
train

In [None]:
# Encode the Tmstamp column as whole seconds since the Unix epoch.
# - `.assign` builds new frames instead of writing into the objects returned by
#   train_test_split, which can otherwise trigger pandas' SettingWithCopyWarning.
#   The column keeps its position, so the feature order is unchanged.
# - "int64" is spelled out because a plain `int` cast can resolve to 32 bits on
#   some platforms, which is too small for these timestamps.
# The division by 10**9 assumes to_datetime yields nanosecond resolution —
# TODO confirm for the pandas version in use.
train = train.assign(
    Tmstamp=pd.to_datetime(train["Tmstamp"]).astype("int64") // 10**9
)
test = test.assign(
    Tmstamp=pd.to_datetime(test["Tmstamp"]).astype("int64") // 10**9
)

In [None]:
# Class balance of the training split. Patv holds integer class labels at this
# point, so every class gets its own bar; a default 10-bin histogram would merge
# neighbouring classes unevenly whenever there are more than 10 of them.
ax = train["Patv"].value_counts().sort_index().plot.bar()
ax.set(title="Train: rows per Patv class", xlabel="Patv class (bin index)", ylabel="Rows");

In [None]:
# Class balance of the test split. Patv holds integer class labels at this
# point, so every class gets its own bar; a default 10-bin histogram would merge
# neighbouring classes unevenly whenever there are more than 10 of them.
ax = test["Patv"].value_counts().sort_index().plot.bar()
ax.set(title="Test: rows per Patv class", xlabel="Patv class (bin index)", ylabel="Rows");

## Train Classifier

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset


# Separate the model inputs from the target column (Patv) for both splits.
X, y = train.drop(columns="Patv"), train["Patv"]
X_test, y_test = test.drop(columns="Patv"), test["Patv"]

#original_model = XGBClassifier()

# another alternative model in pytorch
from torch import nn
import torch
class OurModel(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the second linear layer; no
    softmax is applied inside the model.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of values per input row.
            hidden_size: Width of the hidden layer.
            num_classes: Number of values produced per input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a batch ``x`` through fc1, ReLU and fc2 and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    


# Seed torch so that weight initialisation and DataLoader shuffling are
# repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

# The first layer must accept exactly one input per feature column, so its width
# is read from X instead of being hard-coded.
# NOTE(review): num_classes=16 has to exceed the largest Patv class index for
# CrossEntropyLoss to accept the targets — confirm against the data.
original_model = OurModel(input_size=X.shape[1], hidden_size=100, num_classes=16)


# NOTE(review): .long() truncates any fractional feature values before the
# training loop converts each batch back to float. The evaluation cell builds
# its tensor the same way, so the two would need to change together.
X_train_tensor = torch.tensor(X.values).long()
y_train_tensor = torch.tensor(y.values).long()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
original_model.to(device)

# Mini-batches of 32 rows, reshuffled every epoch.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(original_model.parameters(), lr=0.001)

num_epochs = 10


In [None]:

# Train for num_epochs passes over train_loader; the loss of the current batch
# is logged every 100 steps.
original_model.train()
for epoch in range(num_epochs):
    for i, (X_batch, y_batch) in enumerate(train_loader):
        # The model lives on `device`, so the features (converted to float) and
        # the labels both have to be moved there. Leaving the labels on the CPU
        # makes the loss computation fail when training on CUDA.
        X_batch = X_batch.float().to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = original_model(X_batch)

        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {i+1}, Loss: {loss.item()}")


In [None]:
# NOTE(review): .long() truncates any fractional feature values; it is kept here
# so the test tensor is built the same way as the training tensor.
X_test_tensor = torch.tensor(X_test.values).long()
y_test_tensor = torch.tensor(y_test.values).long()

X_test_tensor = X_test_tensor.float().to(device)

# Inference only: eval mode plus no_grad avoids recording an autograd graph for
# the whole test set, which is pushed through the model in a single pass.
original_model.eval()
with torch.no_grad():
    outputs = original_model(X_test_tensor)

# The predicted class is the index of the largest output per row.
_, y_pred = torch.max(outputs, 1)

y_pred = y_pred.cpu().numpy()


# Calculate precision, recall, and F1 score, classification report
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print(classification_report(y_test, y_pred))


print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
from synthetic_energy.attacks import CarliniWagnerAttack
import torch


# Run the Carlini-Wagner attack from synthetic_energy against the trained model.
# NOTE(review): the attack is pinned to the CPU while original_model was moved
# to `device` earlier — confirm the two agree when CUDA is available.
cw = CarliniWagnerAttack(device="cpu")

# Features and labels are both handed over as float64 arrays.
x_text_float = X_test.values.astype("float64")
y_test_float = y_test.values.astype("float64")

# One (features, label) pair per batch, kept in the original row order.
attack_pairs = [
    (features, label) for features, label in zip(x_text_float, y_test_float)
]
y_test_dataloader = torch.utils.data.DataLoader(
    attack_pairs, batch_size=1, shuffle=False
)

adv_examples, all_examples = cw.attack(original_model, y_test_dataloader)