# SGD-Differential Privacy

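A simple case study of SGD with differential privacy: we load wind turbine data from a CSV file, train a small neural-network classifier with noisy, clipped gradients, and compare it against an XGBoost baseline.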

# Installation

In [None]:
# Install the notebook's dependencies. `%pip` (rather than `!pip`) installs into
# the environment of the kernel that is running this notebook, so the packages
# are importable by the cells below.
# NOTE(review): versions are unpinned; pin them once a known-good set is established.
%pip install xgboost matplotlib torch scikit-learn ipykernel pandas tqdm
# !pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html

# Loading Data

In [2]:
import pandas as pd
from matplotlib import pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the turbine readings from disk.
power_data = pd.read_csv("artifacts/power_cleaned_data.csv")

# Discretise the "Patv" column into fixed-width, left-closed bins
# [0, 100), [100, 200), ... and replace each value by its bin index.
BIN_WIDTH = 100

# Because the bins are half-open on the right (right=False), the top edge must
# lie strictly above the maximum value. The previous edge computation stopped
# exactly at the maximum whenever it was a multiple of the bin width, which
# turned the largest reading(s) into NaN.
n_bins = int(power_data["Patv"].max()) // BIN_WIDTH + 1
bin_edges = list(range(0, (n_bins + 1) * BIN_WIDTH, BIN_WIDTH))

# Bin the Patv column. Values below the first edge (i.e. negative values) fall
# outside every bin and become NaN.
power_data["Patv"] = pd.cut(
    power_data["Patv"], bins=bin_edges, labels=False, right=False
)

power_data.head()

In [None]:
class DataCleaner:
    """Remove incomplete and implausible rows from a turbine readings frame.

    The frame passed in must provide the columns ``Patv``, ``Wspd``, ``Wdir``,
    ``Etmp``, ``Itmp``, ``Ndir``, ``Pab1``, ``Pab2`` and ``Pab3``.

    NOTE(review): the ``Patv`` thresholds below (10, 100, 200) are compared
    against whatever ``Patv`` holds when the frame is passed in. In this
    notebook ``Patv`` has already been replaced by 100-wide bin indices before
    cleaning, so confirm the thresholds are meant for bin indices rather than
    raw values.
    """

    def __init__(self, data):
        """Store the frame to clean; it is not modified by ``clean_data``."""
        self.data = data

    def clean_data(self):
        """Return a cleaned copy of the data with ``Patv`` cast to ``int``.

        Rows containing any missing value are dropped first, so that the
        integer cast of ``Patv`` cannot fail on NaN. Rows matching any of the
        abnormal conditions are then removed. The original row index is kept.

        Returns:
            pandas.DataFrame: the cleaned frame (also stored on ``self.data``).
        """
        # dropna() returns a new frame, so the caller's frame is left untouched.
        frame = self.data.dropna()

        # Each entry is a boolean Series flagging rows considered abnormal.
        conditions = [
            frame["Patv"] < 0,
            (frame["Wspd"] < 1) & (frame["Patv"] > 10),
            (frame["Wspd"] < 2) & (frame["Patv"] > 100),
            (frame["Wspd"] < 3) & (frame["Patv"] > 200),
            (frame["Wspd"] > 2.5) & (frame["Patv"] == 0),
            (frame["Wspd"] == 0)
            & (frame["Wdir"] == 0)
            & (frame["Etmp"] == 0),
            frame["Etmp"] < -21,
            frame["Itmp"] < -21,
            frame["Etmp"] > 60,
            frame["Itmp"] > 70,
            (frame["Wdir"] > 180) | (frame["Wdir"] < -180),
            (frame["Ndir"] > 720) | (frame["Ndir"] < -720),
            (frame["Pab1"] > 89)
            | (frame["Pab2"] > 89)
            | (frame["Pab3"] > 89),
        ]

        # A row is abnormal if it matches at least one condition.
        abnormal = conditions[0]
        for condition in conditions[1:]:
            abnormal = abnormal | condition

        frame = frame[~abnormal]

        # Cast last: every remaining Patv value is guaranteed to be non-missing.
        frame = frame.assign(Patv=frame["Patv"].astype(int))

        self.data = frame
        return self.data


# Run the cleaning rules over the loaded readings.
cleaner = DataCleaner(power_data)
cleaned_data = cleaner.clean_data()

# Show the first few cleaned rows. Only the last expression of a cell is
# rendered automatically, so an explicit display() is needed mid-cell.
display(cleaned_data.head())

# Hold out half of the cleaned rows for evaluation; the fixed random_state
# keeps the split reproducible across runs.
train, test = train_test_split(cleaned_data, test_size=0.5, random_state=42)

# Take explicit copies so that later column assignments on train/test cannot
# trigger pandas' SettingWithCopyWarning.
train, test = train.copy(), test.copy()

assert len(train) + len(test) == len(cleaned_data)

# Preview only; avoid dumping the entire training frame.
train.head()

In [None]:
# Encode the Tmstamp column as whole seconds since the Unix epoch.
def _to_epoch_seconds(stamps):
    """Convert a timestamp column to integer seconds since 1970-01-01 UTC.

    A column that is already numeric is returned unchanged, which makes this
    cell safe to re-run (re-parsing integer seconds would otherwise treat them
    as nanoseconds). The conversion subtracts the epoch and floor-divides by
    one second, so it does not depend on the datetime resolution pandas picks
    or on the platform's default integer width.
    """
    if pd.api.types.is_numeric_dtype(stamps):
        return stamps
    parsed = pd.to_datetime(stamps, utc=True)  # naive values are taken as UTC
    epoch = pd.Timestamp("1970-01-01", tz="UTC")
    return (parsed - epoch) // pd.Timedelta(seconds=1)


# assign() builds new frames instead of writing into the split results in place.
train = train.assign(Tmstamp=_to_epoch_seconds(train["Tmstamp"]))
test = test.assign(Tmstamp=_to_epoch_seconds(test["Tmstamp"]))

## Train Classifier

In [None]:
import torch
import torch.nn as nn

class Net(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    The output is left as raw scores (no activation is applied after the
    second layer).

    Args:
        in_features: number of input features per example (default 12).
        hidden_features: width of the hidden layer (default 10).
        out_features: number of output scores per example (default 1).
    """

    def __init__(self, in_features=12, hidden_features=10, out_features=1):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, out_features)``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)
    
# Seed torch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

net = Net()

# Features: every column except the target.
x_train = torch.tensor(train.drop(columns=["Patv"]).values).float()
assert x_train.shape[1] == net.fc1.in_features, (
    f"Net expects {net.fc1.in_features} features but the data has {x_train.shape[1]}"
)

# Binary target: 1.0 where Patv > 0, else 0.0, shaped (n_rows, 1) so it lines
# up with the network's single output column.
y_train = torch.tensor((train["Patv"].values > 0).astype("float32")).unsqueeze(1)
print(y_train.shape)

# TensorDataset indexes the tensors directly instead of materialising one
# Python tuple per row.
train_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(x_train, y_train), batch_size=64, shuffle=True
)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)


In [None]:
# Train with differential privacy (DP-SGD style updates).
#
# Per step: compute one gradient per example, clip each example's gradient to
# an L2 norm of at most L2_NORM_CLIP, sum the clipped gradients, add Gaussian
# noise with std NOISE_MULTIPLIER * L2_NORM_CLIP, and average over the batch.
#
# Changes from the previous version of this cell:
#   * cross_entropy over a single output column is identically zero (softmax
#     over one class is always 1), so no learning happened; the loss is now
#     binary cross-entropy on the logit.
#   * an element-wise clamp of the already-noised batch gradient does not bound
#     any single example's influence; clipping is now done per example.
#
# NOTE(review): no privacy accounting (epsilon for a given delta) is done here,
# and batches come from a shuffling DataLoader; confirm the sampling scheme and
# noise multiplier against the intended privacy guarantee.
from tqdm import tqdm, auto
from torch.func import functional_call, grad, vmap

N_EPOCHS = 100
L2_NORM_CLIP = 1.0  # max L2 norm of any single example's gradient
NOISE_MULTIPLIER = 0.1  # noise std, expressed as a multiple of L2_NORM_CLIP


def _example_loss(params, x, y):
    """Binary cross-entropy of `net` (evaluated with `params`) on one example."""
    logits = functional_call(net, params, (x.unsqueeze(0),))
    return nn.functional.binary_cross_entropy_with_logits(logits, y.unsqueeze(0))


# Maps (params, x_batch, y_batch) -> {param name: tensor of shape (batch, *param.shape)}.
_per_example_grads = vmap(grad(_example_loss), in_dims=(None, 0, 0))

for epoch in auto.trange(N_EPOCHS):
    for x_batch, y_batch in train_dataloader:
        optimizer.zero_grad()

        # Detached views share storage with the live parameters, so they always
        # reflect the latest optimizer step.
        params = {name: p.detach() for name, p in net.named_parameters()}
        example_grads = _per_example_grads(params, x_batch, y_batch)

        # L2 norm of each example's gradient taken across *all* parameters.
        sq_norms = sum(
            g.flatten(start_dim=1).pow(2).sum(dim=1) for g in example_grads.values()
        )
        # Scale factor <= 1 that shrinks over-long gradients down to the clip norm.
        clip_factor = (L2_NORM_CLIP / (sq_norms.sqrt() + 1e-6)).clamp(max=1.0)

        for name, param in net.named_parameters():
            g = example_grads[name]
            # Reshape to (batch, 1, ..., 1) so it broadcasts over the parameter dims.
            weights = clip_factor.view(-1, *([1] * (g.dim() - 1)))
            clipped_sum = (g * weights).sum(dim=0)
            noise = torch.randn_like(param) * (NOISE_MULTIPLIER * L2_NORM_CLIP)
            param.grad = (clipped_sum + noise) / len(x_batch)

        optimizer.step()

        
        

In [None]:

# Evaluate the model on the held-out split.
x_test = torch.tensor(test.drop(columns=["Patv"]).values).float()
# Use the same 0/1 target definition as training (Patv > 0); the raw Patv
# values are not what the network was trained to predict.
y_test = torch.tensor((test["Patv"].values > 0).astype("float32"))

# Order does not matter for evaluation, so no shuffling.
test_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(x_test, y_test), batch_size=64, shuffle=False
)

net.eval()
total_loss, n_correct, n_seen = 0.0, 0, 0
with torch.no_grad():
    for data, target in test_dataloader:
        target = target.view(-1, 1)
        output = net(data)  # raw logits, shape (batch, 1)
        # Sum per batch and divide once at the end, so the reported loss covers
        # the whole test set rather than only the final batch.
        total_loss += nn.functional.binary_cross_entropy_with_logits(
            output, target, reduction="sum"
        ).item()
        # A logit above 0 corresponds to a predicted probability above 0.5.
        n_correct += ((output > 0).float() == target).sum().item()
        n_seen += len(target)

# max(..., 1) guards against an empty test set.
print(f"Test Loss: {total_loss / max(n_seen, 1)}")
print(f"Test Accuracy: {n_correct / max(n_seen, 1)}")

    


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Non-private baseline: fit an XGBoost classifier on the same train/test split.

# Separate features from the "Patv" target for each split.
X, y = train.drop(columns=["Patv"]), train["Patv"]
X_test, y_test = test.drop(columns=["Patv"]), test["Patv"]

# fit() returns the estimator itself, so construction and fitting can be chained.
original_model = XGBClassifier().fit(X, y)
y_pred = original_model.predict(X_test)

# Support-weighted precision, recall and F1 over all classes.
precision, recall, f1 = (
    scorer(y_test, y_pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

# Per-class breakdown first, then the three aggregate scores.
print(classification_report(y_test, y_pred))


for label, value in (("Precision", precision), ("Recall", recall), ("F1 Score", f1)):
    print(f"{label}: {value}")