In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/kaggle/input/encryption-algos/output.csv')

In [3]:
df.head()

Unnamed: 0,KEY,IV,NONCE,CT,ALGORITHM,ORIGINAL TEXT
0,1d9650cd94d15814db78257b98b7845f,9944c5bf5d78a73e140fa86210634687,,5ae7ab9406ead8e35eeca5a95e9f1d74807b1dc49b676e...,AES,Today arrived with a crash of my car through t...
1,68b0284fd58244c86cfde403c1a27fb5bdb1d3a2c9a04e...,,40b11ab4f67f4f66,8db28f48a79a3a6c7bd70bbe5a8b6271051fdc7bd8df81...,ChaCha20,Today arrived with a crash of my car through t...
2,ec86b3d7d4c01d11477fd5a18b71d0ddf36405445641a8...,,675d38594ac0ea8e65cbdc25,ebed76e47179abbfb9920dfb57ea45b76ae77c6a3191bd...,ChaCha20_Poly1305,Today arrived with a crash of my car through t...
3,d070fe3955ecf07bc269d9f0c9cd7b6f8299c4361f3d80...,,ddb5b3e5f1148bb5,ddb5b3e5f1148bb513bfe4d3879b59f6a07c463b4a1322...,Salsa20,Today arrived with a crash of my car through t...
4,14e9580736248f7697b540ec0a8f3164facfd8a6369db2d5,5520e558b3efc1fa,,5520e558b3efc1fa18d951a35f38843f389d276273a308...,3DES,Today arrived with a crash of my car through t...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189371 entries, 0 to 189370
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   KEY            189371 non-null  object
 1   IV             108212 non-null  object
 2   NONCE          81159 non-null   object
 3   CT             189371 non-null  object
 4   ALGORITHM      189371 non-null  object
 5   ORIGINAL TEXT  189357 non-null  object
dtypes: object(6)
memory usage: 8.7+ MB


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For data preparation and visualization
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [6]:


# Convert hex to numeric values for the 'KEY', 'IV', 'NONCE', and 'CT' columns
def hex_to_numeric(hex_string):
    if pd.isna(hex_string):  # Handle NaN values
        return []
    return [int(c, 16) for c in hex_string]

# Pad sequences to the maximum length
def pad_sequences(sequences, maxlen): 
    return np.array([seq + [0] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences])

# Apply hex conversion to the 'KEY', 'IV', 'NONCE', and 'CT' columns
df['key_numeric'] = df['KEY'].apply(hex_to_numeric)
df['iv_numeric'] = df['IV'].apply(hex_to_numeric)
df['nonce_numeric'] = df['NONCE'].apply(hex_to_numeric)
df['ct_numeric'] = df['CT'].apply(hex_to_numeric)

# Concatenate the numeric values of KEY, IV, NONCE, and CT
df['combined_numeric'] = df['key_numeric'] + df['iv_numeric'] + df['nonce_numeric'] + df['ct_numeric']

# Get max sequence length from all combined numeric columns
maxlen = max(df['combined_numeric'].apply(len))

# Pad the combined sequences
X = pad_sequences(df['combined_numeric'].tolist(), maxlen)

# One-hot encode the 'ALGORITHM' column
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(df[['ALGORITHM']])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print information to check if everything works
print(f"Train data shape: {X_train_tensor.shape}, Train labels shape: {y_train_tensor.shape}")
print(f"Test data shape: {X_test_tensor.shape}, Test labels shape: {y_test_tensor.shape}")


Train data shape: torch.Size([151496, 448]), Train labels shape: torch.Size([151496, 7])
Test data shape: torch.Size([37875, 448]), Test labels shape: torch.Size([37875, 7])


In [9]:

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, embed_dim)
        positions = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(np.log(10000.0) / embed_dim))
        self.encoding[:, 0::2] = torch.sin(positions * div_term)
        self.encoding[:, 1::2] = torch.cos(positions * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.encoding[:, :seq_len, :].to(x.device)

class TransformerModel(nn.Module):
    def __init__(self, input_dim, num_classes, embed_dim=64, num_heads=4, ff_dim=128, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)  # Input dim reflects the size of hex values
        self.positional_encoding = PositionalEncoding(embed_dim)

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)

        # Fully connected layers tailored to your dataset
        self.fc1 = nn.Linear(embed_dim, 512)   # Fully connected layer 1
        self.fc2 = nn.Linear(512, 256)         # Fully connected layer 2
        self.fc3 = nn.Linear(256, 128)         # Fully connected layer 3
        self.fc4 = nn.Linear(128, 64)          # Fully connected layer 4
        self.fc5 = nn.Linear(64, num_classes)  # Output layer

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Input embedding and positional encoding
        x = self.embedding(x)
        x = self.positional_encoding(x)
        
        # Pass through Transformer encoder
        x = self.transformer_encoder(x)
        
        # Global average pooling
        x = torch.mean(x, dim=1)  

        # Pass through fully connected layers
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.dropout(torch.relu(self.fc2(x)))
        x = self.dropout(torch.relu(self.fc3(x)))
        x = self.dropout(torch.relu(self.fc4(x)))
        x = self.fc5(x)  # No activation here, handled by loss function

        return x

# Define input/output dimensions
# Input dimension is 16 because hex digits range from 0-9 and a-f (16 possible characters)
input_dim = 16  # For hexadecimal characters (0-9, a-f)
num_classes = len(df['ALGORITHM'].unique())  # Number of unique algorithms as target classes

# Create the model
model = TransformerModel(input_dim=input_dim, num_classes=num_classes)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [None]:
import torch.optim as optim
def train(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    
    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0
        
        # Loop over batches
        for X_batch, y_batch in train_loader:
            # Move data to the correct device (GPU or CPU)
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            # Forward pass: compute the model output
            outputs = model(X_batch)
            
            # Convert one-hot encoded labels to class indices for accuracy calculation
            _, labels = torch.max(y_batch, 1)
            
            # Compute the loss (CrossEntropyLoss expects class indices)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Accumulate loss
            total_loss += loss.item()
            
            # Accuracy computation
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        
        # Compute average loss and accuracy for the epoch
        avg_loss = total_loss / len(train_loader)
        accuracy = correct / total * 100
        
        # Print epoch details
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

# Train the model

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, train_loader, criterion, optimizer, num_epochs=250)


Epoch [1/250], Loss: 1.4253, Accuracy: 39.70%
Epoch [2/250], Loss: 1.1386, Accuracy: 50.70%
Epoch [3/250], Loss: 1.0564, Accuracy: 53.79%
Epoch [4/250], Loss: 1.0305, Accuracy: 54.86%
Epoch [5/250], Loss: 1.0059, Accuracy: 55.96%
Epoch [6/250], Loss: 0.9939, Accuracy: 56.78%
Epoch [7/250], Loss: 0.9781, Accuracy: 57.96%
Epoch [8/250], Loss: 1.0522, Accuracy: 55.63%
Epoch [9/250], Loss: 1.0412, Accuracy: 55.59%
Epoch [10/250], Loss: 0.9757, Accuracy: 58.30%
Epoch [11/250], Loss: 0.9583, Accuracy: 58.96%
Epoch [12/250], Loss: 0.9491, Accuracy: 59.21%
Epoch [13/250], Loss: 0.9482, Accuracy: 59.34%
Epoch [14/250], Loss: 0.9409, Accuracy: 59.65%
Epoch [15/250], Loss: 0.9821, Accuracy: 58.00%
Epoch [16/250], Loss: 0.9366, Accuracy: 59.62%
Epoch [17/250], Loss: 0.9370, Accuracy: 59.73%
Epoch [18/250], Loss: 0.9203, Accuracy: 60.42%
Epoch [19/250], Loss: 0.9136, Accuracy: 60.66%
Epoch [20/250], Loss: 0.9101, Accuracy: 60.97%
Epoch [21/250], Loss: 0.8989, Accuracy: 61.33%
Epoch [22/250], Loss: 

In [None]:
# Evaluation function
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(y_batch, 1)
            total += y_batch.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model
evaluate(model, test_loader)

In [None]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output from the last time step
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Hyperparameters
input_dim = 16  # Based on hex digits (0-9, a-f)
hidden_dim = 128
output_dim = y_train.shape[1]
num_layers = 2
dropout = 0.2

# Model
lstm_model = LSTMModel(input_dim, hidden_dim, output_dim, num_layers, dropout)
lstm_model.to(device)


In [None]:
# Training loop
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * X_batch.size(0)
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(y_batch, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        train_accuracy = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/total:.4f}, Accuracy: {train_accuracy:.2f}%')

# Loss, optimizer, and training params
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
num_epochs = 20

# Train LSTM model
train_model(lstm_model, train_loader, test_loader, criterion, optimizer, num_epochs=20)


In [None]:
# Evaluation function
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(y_batch, 1)
            total += y_batch.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model
evaluate(lstm_model, test_loader)

In [None]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.bilstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bidirectional LSTM has 2x hidden size
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bilstm(x)
        x = x[:, -1, :]  # Take the output from the last time step
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Model
bilstm_model = BiLSTMModel(input_dim, hidden_dim, output_dim, num_layers, dropout)
bilstm_model.to(device)


In [None]:
# Train BILSTM model
train_model(bilstm_model, train_loader, test_loader, criterion, optimizer, num_epochs=20)

In [None]:
# Evaluation function
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(y_batch, 1)
            total += y_batch.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model
evaluate(bilstm_model, test_loader)

In [None]:
class ConvLSTMModel(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim, num_layers, dropout):
        super(ConvLSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.conv = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)  # x shape: [batch_size, seq_len, embed_dim]
        x = x.permute(0, 2, 1)  # Switch to [batch_size, embed_dim, seq_len] for Conv1D
        x = self.conv(x)  # Apply 1D convolution
        x = x.permute(0, 2, 1)  # Back to [batch_size, seq_len, hidden_dim]
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Take the output from the last time step
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Model
conv_lstm_model = ConvLSTMModel(input_dim, embed_dim=64, hidden_dim=128, output_dim=output_dim, num_layers=2, dropout=0.2)
conv_lstm_model.to(device)


In [None]:
# Train CONVLSTM model
train_model(conv_lstm_model, train_loader, test_loader, criterion, optimizer, num_epochs=20)

In [None]:
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            _, labels = torch.max(y_batch, 1)
            total += y_batch.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model
evaluate(conv_lstm_model, test_loader)

In [None]:
class BiGRUModel(nn.Module):
    def __init__(self, input_dim, num_classes, embed_dim=128, hidden_dim=256, num_layers=2, dropout=0.2):
        super(BiGRUModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)

        # Bidirectional GRU
        self.bigru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, 
                            batch_first=True, bidirectional=True, dropout=dropout)
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dim * 2, 128)  # Multiplied by 2 for bidirectional GRU
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Embedding
        x = self.embedding(x)

        # BiGRU
        gru_out, _ = self.bigru(x)

        # Use the last hidden state from both directions (concatenated)
        x = gru_out[:, -1, :]

        # Fully connected layers with ReLU and dropout
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Define input/output dimensions
input_dim = 16  # 16 possible hex digits (0-9, a-f)
num_classes = len(df['algo'].unique())

# Create the BiGRU model
bigru_model = BiGRUModel(input_dim=input_dim, num_classes=num_classes)

# Use GPU if available
bigru_model = bigru_model.to(device)

In [None]:
# Train BIGRU model
train_model(bigru_model, train_loader, test_loader, criterion, optimizer, num_epochs=20)

In [None]:
# Evaluate the model
evaluate(bigru_model, test_loader)

In [None]:
import torch
import numpy as np

def hex_to_numeric(hex_string):
    return [int(c, 16) for c in hex_string]

def predict_algorithm(encoded_text, model, encoder, device):
    # Step 1: Convert hex to numeric values
    numeric_sequence = hex_to_numeric(encoded_text)
    
    # Step 2: Pad the sequence to the maximum length
    maxlen = model.embedding.num_embeddings
    padded_sequence = pad_sequences([numeric_sequence], maxlen)
    
    # Step 3: Convert the sequence to a PyTorch tensor
    sequence_tensor = torch.tensor(padded_sequence, dtype=torch.long).to(device)
    
    # Step 4: Feed the tensor into the model
    model.eval()
    with torch.no_grad():
        output = model(sequence_tensor)
        _, predicted = torch.max(output, 1)
    
    # Step 5: Decode the prediction
    algo_classes = encoder.categories_[0]
    predicted_algorithm = algo_classes[predicted.item()]
    
    return predicted_algorithm

# Example usage:
encoded_text = "7bd65467969434bb72cc0a85ce7ea5186ff66dd381a168fb9c401bbe542e9e23"
print(f"The predicted encryption algorithm is: {predicted_algorithm}")
