<a href="https://colab.research.google.com/github/riyag283/Federated-Learning/blob/main/ntwk_threat_detection_fl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# project url: https://github.com/tuhinsharma121/federated-ml

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 10)

# Names of the 41 feature columns, in the order they appear in the file.
# They are passed via `names=` below, so pandas does not take a header from the data.
colnames = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
            'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
            'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
            'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate']

# Number of leading records to keep. `nrows` makes pandas stop parsing after
# this many rows instead of building the full frame and slicing it afterwards.
N_ROWS = 100000
df = pd.read_csv(
    'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz',
    names=colnames + ['threat_type'],  # the last column of each record is the label
    nrows=N_ROWS,
)

In [2]:
# Fixed seed so the preview shows the same three rows on every Restart & Run All.
df.sample(3, random_state=42)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,...,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,threat_type
21946,0,tcp,smtp,SF,997,...,0.0,0.0,0.0,0.0,normal.
14840,0,tcp,http,SF,289,...,0.0,0.0,0.0,0.0,normal.
39368,0,tcp,http,SF,288,...,0.0,0.0,0.0,0.0,normal.


In [3]:
import plotly.graph_objects as go
from collections import Counter

# Count how many records carry each threat label and show the counts as a bar chart.
threat_count_dict = Counter(df['threat_type'])
threat_types = list(threat_count_dict)            # labels, in first-seen order
threat_counts = list(threat_count_dict.values())  # counts, in the same order
print('Total distinct number of threat types:', len(threat_types))

threat_bar = go.Bar(
    x=threat_types,
    y=threat_counts,
    text=threat_counts,   # print the count on each bar
    textposition='auto',
)
fig = go.Figure([threat_bar])
fig.show()

Total distinct number of threat types: 20


In [4]:
# Columns treated as numeric model inputs.
numerical_colnames = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
                      'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
                      'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'count',
                      'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
                      'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
                      'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                      'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                      'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']
numerical_df = df[numerical_colnames].copy()

# Keep only columns in which at least one value differs from the first row,
# i.e. drop columns that are constant across the whole frame.
first_row = numerical_df.iloc[0]
varies = numerical_df.ne(first_row).any()
numerical_df = numerical_df.loc[:, varies]

# Scale every remaining column by its own maximum.
column_max = numerical_df.max()
final_df = numerical_df.div(column_max)
X = final_df.values
print("Shape of feature matrix : ", X.shape)

Shape of feature matrix :  (100000, 33)


In [5]:
from sklearn.preprocessing import LabelEncoder

# Map each threat-type string to an integer class id; `encoder` is kept so
# predictions can be mapped back to strings with `inverse_transform` later.
encoder = LabelEncoder()
threat_types = df.loc[:, "threat_type"].values
y = encoder.fit_transform(threat_types)
print("Shape of target vector : ", y.shape)

Shape of target vector :  (100000,)


In [6]:
from sklearn.model_selection import train_test_split

# 60/40 split, stratified on the label so both parts keep the class proportions.
split_parts = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print("Number of records in training data : ", len(X_train))
print("Number of records in test data : ", len(X_test))
print("Total distinct number of threat types in training data : ", len(set(y_train)))
print("Total distinct number of threat types in test data : ", len(set(y_test)))

Number of records in training data :  60000
Number of records in test data :  40000
Total distinct number of threat types in training data :  20
Total distinct number of threat types in test data :  20


In [7]:
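# This notebook uses the legacy PySyft 0.2.x API (sy.TorchHook, sy.VirtualWorker, sy.FederatedDataLoader).
# Uncomment to install a matching release:
#!pip install syft==0.2.9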

In [8]:
%%capture
import torch
import syft as sy

In [10]:
# Hook torch through PySyft, then fix the RNG seed and the compute device.
hook = sy.TorchHook(torch)
torch.manual_seed(1)
device = torch.device('cpu')

# Two in-process virtual workers stand in for the remote gateways.
gatway1, gatway2 = (
    sy.VirtualWorker(hook, id=worker_id) for worker_id in ("gatway1", "gatway2")
)



In [12]:
import numpy as np

# Training hyper-parameters.
BATCH_SIZE = 1000   # samples per federated batch
EPOCHS = 2          # passes over the training data
LOG_INTERVAL = 5    # print the training loss every this many batches
lr = 0.01           # SGD learning rate

# Model dimensions are taken from the training split.
n_feature = X_train.shape[-1]
n_class = len(np.unique(y_train))

print("Number of training features : ", n_feature)
print("Number of training classes : ", n_class)

Number of training features :  33
Number of training classes :  20


In [13]:
# Convert the numpy splits to tensors and attach tags to each of them.
# Every tensor shares the first two tags; the last two say what it holds
# (data / target) and which split it belongs to.
common_tags = ("#iot", "#network")

train_inputs = torch.tensor(X_train, dtype=torch.float).tag(*common_tags, "#data", "#train")
train_labels = torch.tensor(y_train).tag(*common_tags, "#target", "#train")
test_inputs = torch.tensor(X_test, dtype=torch.float).tag(*common_tags, "#data", "#test")
test_labels = torch.tensor(y_test).tag(*common_tags, "#target", "#test")

In [14]:
# Midpoints used to cut each split in two: first half -> gatway1, second half -> gatway2.
train_idx = len(train_labels) // 2
test_idx = len(test_labels) // 2

train_head, train_tail = slice(None, train_idx), slice(train_idx, None)
test_head, test_tail = slice(None, test_idx), slice(test_idx, None)

gatway1_train_dataset = sy.BaseDataset(
    train_inputs[train_head], train_labels[train_head]
).send(gatway1)
gatway2_train_dataset = sy.BaseDataset(
    train_inputs[train_tail], train_labels[train_tail]
).send(gatway2)
gatway1_test_dataset = sy.BaseDataset(
    test_inputs[test_head], test_labels[test_head]
).send(gatway1)
gatway2_test_dataset = sy.BaseDataset(
    test_inputs[test_tail], test_labels[test_tail]
).send(gatway2)

In [15]:
# Group the per-worker datasets into one federated dataset per split.
federated_train_dataset = sy.FederatedDataset(
    [gatway1_train_dataset, gatway2_train_dataset]
)
federated_test_dataset = sy.FederatedDataset(
    [gatway1_test_dataset, gatway2_test_dataset]
)

In [16]:
# Batch loaders over the federated datasets: training batches are shuffled,
# test batches are served in order.
federated_train_loader = sy.FederatedDataLoader(
    federated_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
federated_test_loader = sy.FederatedDataLoader(
    federated_test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [17]:
import torch.nn as nn
class Net(nn.Module):
    """Single fully-connected layer mapping input features to one raw score per label.

    No activation is applied in ``forward``; the output is the plain result
    of the linear layer.
    """

    def __init__(self, input_dim, output_dim):
        """
        input_dim: number of input features.
        output_dim: number of labels.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)

In [18]:
import torch.nn.functional as F

def train(model, device, federated_train_loader, optimizer, epoch):
    """Run one training pass over ``federated_train_loader``.

    For each batch the model is sent to ``data.location``, one optimizer step
    is taken on the cross-entropy loss, and the model is fetched back with
    ``model.get()``. The loss is printed every ``LOG_INTERVAL`` batches and
    on the final batch.

    model: module to train; must support ``send`` / ``get``.
    device: device the batch tensors are moved to.
    federated_train_loader: iterable of ``(data, target)`` batches with a length.
    optimizer: optimizer built over ``model``'s parameters.
    epoch: epoch number, used only in the progress message.
    """
    model.train()
    n_batches = len(federated_train_loader)

    # Batches are numbered from 1 so the progress message reads naturally.
    for batch_idx, (data, target) in enumerate(federated_train_loader, start=1):
        model.send(data.location)
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        loss = F.cross_entropy(model(data), target)
        loss.backward()
        optimizer.step()
        model.get()

        should_log = batch_idx == n_batches or batch_idx % LOG_INTERVAL == 0
        if not should_log:
            continue

        # Fetch the loss value only when it is actually going to be printed.
        loss = loss.get()
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * BATCH_SIZE, n_batches * BATCH_SIZE,
            100. * batch_idx / n_batches, loss.item()))

In [19]:
import torch.nn.functional as F

def test(model, device, federated_test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(federated_test_loader):
            model.send(data.location)
            data, target = data.to(device), target.to(device)
            output = model(data)
            model.get()
            loss = F.cross_entropy(output, target)
            pred = output.argmax(1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().get()

    loss = loss.get()
    print('Test set: Loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss.item(), correct, len(federated_test_loader.federated_dataset),
        100. * correct / len(federated_test_loader.federated_dataset)))

In [20]:
%%time
import torch.optim as optim

# Fresh linear classifier sized from the training data, optimised with plain SGD.
model = Net(input_dim=n_feature, output_dim=n_class)
optimizer = optim.SGD(model.parameters(), lr=lr)

# Train for EPOCHS passes, evaluating on the federated test set after each one.
for epoch in range(1, EPOCHS + 1):
    train(
        model=model,
        device=device,
        federated_train_loader=federated_train_loader,
        optimizer=optimizer,
        epoch=epoch,
    )
    test(
        model=model,
        device=device,
        federated_test_loader=federated_test_loader,
    )

Test set: Loss: 2.2076, Accuracy: 22493/40000 (56%)

Test set: Loss: 1.6338, Accuracy: 30687/40000 (77%)

CPU times: user 1min 35s, sys: 16.8 s, total: 1min 52s
Wall time: 1min 52s


In [21]:
# Write the trained weights (state dict only) to disk.
model_path = "binaize-threat-model.pt"
torch.save(model.state_dict(), model_path)

# Build a new Net of the same shape, load the saved weights into it and
# switch it to evaluation mode.
model_new = Net(n_feature, n_class)
saved_state = torch.load(model_path)
model_new.load_state_dict(saved_state)
model_new.eval()

Net(
  (linear): Linear(in_features=33, out_features=20, bias=True)
)

In [22]:
# Classify the test record at index 122 with the reloaded model and compare
# the prediction with its true label.
idx = 122
data = test_inputs[idx]

# Inference only: no autograd graph is needed for a single forward pass.
with torch.no_grad():
    pred = model_new(data)

# The arg-max position is the predicted class id; `.item()` yields a plain
# Python int without the `.data.cpu().numpy()` round-trip.
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)

actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)

Predicted threat type :  neptune.
Actual threat type :  neptune.


In [24]:
# Classify the test record at index 155 with the reloaded model and compare
# the prediction with its true label.
idx = 155
data = test_inputs[idx]

# Inference only: no autograd graph is needed for a single forward pass.
with torch.no_grad():
    pred = model_new(data)

# The arg-max position is the predicted class id; `.item()` yields a plain
# Python int without the `.data.cpu().numpy()` round-trip.
pred_label = pred.argmax().item()
pred_threat = encoder.inverse_transform([pred_label])[0]
print("Predicted threat type : ", pred_threat)

actual_label = test_labels[idx].item()
actual_threat = encoder.inverse_transform([actual_label])[0]
print("Actual threat type : ", actual_threat)

Predicted threat type :  normal.
Actual threat type :  normal.
