<a href="https://colab.research.google.com/github/smuzka/SSN-project/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Zespół 4:
- Jakub Smuga
- Konrad Korus
- Maksym Kazhaiev

### Podział danych

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.utils import shuffle

In [None]:
random_shuffle_state = 2024

def train_valid_test_split(features, targets, valid_p = 0.1, test_p = 0.3, random_state = None):
  """Shuffle `features` and `targets` together and cut them into three parts.

  The shuffled rows are split positionally: the first block is the training
  part, the next `valid_p` share is the validation part and the remainder is
  the test part. Part sizes are truncated with `int`, so any rounding
  leftovers end up in the test part.

  Args:
    features: indexable, sliceable collection of samples.
    targets: indexable, sliceable collection of labels, one per sample.
    valid_p: share of the rows used for validation.
    test_p: share of the rows used for testing.
    random_state: seed passed to `shuffle`; when None the module-level
      `random_shuffle_state` is used, which is the historical behaviour.

  Returns:
    Tuple `(X_train, y_train, X_valid, y_valid, X_test, y_test)`.

  Raises:
    ValueError: if a share is negative or the two shares add up to more
      than 1; `shuffle` itself also rejects inputs of different lengths.
  """
  if valid_p < 0 or test_p < 0 or valid_p + test_p > 1:
    raise ValueError(
      f"valid_p and test_p must be non-negative and sum to at most 1, got valid_p={valid_p}, test_p={test_p}"
    )
  if random_state is None:
    random_state = random_shuffle_state

  # A single call applies one permutation to both collections, so rows and
  # labels cannot drift apart, and inconsistent lengths are reported instead
  # of silently producing misaligned pairs.
  features, targets = shuffle(features, targets, random_state = random_state)

  train_size = int(len(features) * (1 - (test_p + valid_p)))
  valid_size = int(len(features) * valid_p)
  valid_end = train_size + valid_size

  X_train, X_valid, X_test = features[:train_size], features[train_size:valid_end], features[valid_end:]
  y_train, y_valid, y_test = targets[:train_size], targets[train_size:valid_end], targets[valid_end:]
  return (X_train, y_train, X_valid, y_valid, X_test, y_test)

### Model

In [None]:
import torch.nn as nn
import torch.optim as optim

In [None]:
# Define the MLP model
class MLP(nn.Module):
    """Feed-forward network with two ReLU-activated hidden layers.

    Three linear layers are stacked: ``input_dim -> hidden_dim``,
    ``hidden_dim -> hidden_dim`` and ``hidden_dim -> output_dim``. The output
    of the last layer is returned as-is (no activation is applied to it).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are registered in the order they are applied in `forward`.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run `x` through the three layers and return the raw scores."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

### Funkcja ucząca model

In [None]:
def _accuracy(model, features, labels):
    """Return the share of rows in `features` whose top-scoring class equals `labels`."""
    outputs = model(features)
    _, predicted = torch.max(outputs, 1)
    return (predicted == labels).sum().item() / len(labels)


def train(model, X_train_tensor, y_train_tensor, X_valid_tensor, y_valid_tensor,
          epochs=10, batch_size=32, lr=0.001):
    """Train `model` with Adam and cross-entropy loss, printing progress per epoch.

    Mini-batches are consecutive slices of the training tensors taken in the
    given order (they are not reshuffled between epochs). After each epoch one
    line is printed with the loss of the last mini-batch of that epoch (not an
    epoch average) and the accuracy on the full training and validation sets.

    Args:
        model: module whose output for a batch is a 2-D tensor of class scores.
        X_train_tensor: training inputs.
        y_train_tensor: training class indices.
        X_valid_tensor: validation inputs.
        y_valid_tensor: validation class indices.
        epochs: number of passes over the training set.
        batch_size: number of rows per optimisation step.
        lr: learning rate passed to Adam.

    Raises:
        ValueError: if `batch_size` is smaller than 1 or if the training or
            validation set is empty.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if len(X_train_tensor) == 0 or len(X_valid_tensor) == 0:
        # Without this guard the accuracy computation fails with an opaque ZeroDivisionError.
        raise ValueError("training and validation sets must not be empty")

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Remember the caller's mode so it can be restored once training is done.
    was_training = model.training
    try:
        for epoch in range(epochs):
            # Optimisation runs in training mode (matters for dropout/batch-norm layers).
            model.train()
            for start in range(0, len(X_train_tensor), batch_size):
                inputs = X_train_tensor[start:start + batch_size]
                labels = y_train_tensor[start:start + batch_size]

                # Zero the gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(inputs)

                # Compute loss
                loss = criterion(outputs, labels)

                # Backward pass and optimize
                loss.backward()
                optimizer.step()

            # Metrics are computed in evaluation mode and without gradient tracking.
            model.eval()
            with torch.no_grad():
                train_accuracy = _accuracy(model, X_train_tensor, y_train_tensor)
                valid_accuracy = _accuracy(model, X_valid_tensor, y_valid_tensor)

            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Training Accuracy: {train_accuracy:.4f}, Valid Accuracy: {valid_accuracy:.4f}")
    finally:
        model.train(was_training)

## Pobieranie danych

### Internet firewall data

In [None]:
import os
if not os.path.exists('firewall_data.zip'):
    # If the file doesn't exist, download it
  !pip install wget
  !wget https://archive.ics.uci.edu/static/public/542/internet+firewall+data.zip -O firewall_data.zip
  !unzip firewall_data.zip

import pandas as pd

data = pd.read_csv('log2.csv')

# Drop any missing values
data.dropna(inplace=True)

# Encode the 'Action' column to numerical values
label_encoder = LabelEncoder()
data['Action'] = label_encoder.fit_transform(data['Action'])

# Separate features and labels
X = data.drop('Action', axis=1)
y = data['Action']

In [None]:
# Split the dataset into training, validation and test sets
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(X, y)

# Standardize the features; the scaler is fitted on the training part only and
# then reused for the validation and test parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()
y_test = y_test.to_numpy()

# Initialize the model
input_dim = X_train.shape[1]
hidden_dim = 4
output_dim = len(label_encoder.classes_)  # Number of unique classes in 'Action'

# Seed torch before the layers are created so the random weight initialisation
# (and therefore the training log) is repeatable, like the seeded data shuffle.
torch.manual_seed(random_shuffle_state)
model = MLP(input_dim, hidden_dim, output_dim)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

In [None]:
# Fit the firewall classifier; `train` prints one progress line per epoch.
train(
    model=model,
    X_train_tensor=X_train_tensor,
    y_train_tensor=y_train_tensor,
    X_valid_tensor=X_valid_tensor,
    y_valid_tensor=y_valid_tensor,
)

Epoch 1/10, Loss: 0.1364, Training Accuracy: 0.9495, Valid Accuracy: 0.9475
Epoch 2/10, Loss: 0.0415, Training Accuracy: 0.9749, Valid Accuracy: 0.9744
Epoch 3/10, Loss: 0.0214, Training Accuracy: 0.9818, Valid Accuracy: 0.9818
Epoch 4/10, Loss: 0.0151, Training Accuracy: 0.9848, Valid Accuracy: 0.9847
Epoch 5/10, Loss: 0.0119, Training Accuracy: 0.9861, Valid Accuracy: 0.9866
Epoch 6/10, Loss: 0.0102, Training Accuracy: 0.9872, Valid Accuracy: 0.9877
Epoch 7/10, Loss: 0.0093, Training Accuracy: 0.9880, Valid Accuracy: 0.9882
Epoch 8/10, Loss: 0.0085, Training Accuracy: 0.9881, Valid Accuracy: 0.9882
Epoch 9/10, Loss: 0.0081, Training Accuracy: 0.9885, Valid Accuracy: 0.9886
Epoch 10/10, Loss: 0.0079, Training Accuracy: 0.9888, Valid Accuracy: 0.9888


### Sports articles for objectivity analysis

In [None]:
import os
if not os.path.exists('sports_data.zip'):
    # If the file doesn't exist, download it
  !pip install wget
  !wget https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip -O sports_data.zip

!pip install pandas lxml

import pandas as pd
if not os.path.exists('features.xls'):
    !unzip sports_data.zip

data = pd.read_xml('features.xls', parser='lxml', xpath='//ss:Cell')

# Drop any missing values
# data.dropna(inplace=True)

print(data.head())

# Encode the 'Action' column to numerical values
# label_encoder = LabelEncoder()
# data['Action'] = label_encoder.fit_transform(data['Action'])

# # Separate features and labels
# X = data.drop('Action', axis=1)
# y = data['Action']




XPathEvalError: Undefined namespace prefix