In [22]:
# name of the combination you want to try 
combination = '11B,3B'

# base directory where the extracted features stored  in models named directory
models_base_directory = "features_timing" 

# targeting the individual features (we want to target)
features_base_directory = f"features/features({combination})"

# Define the base directories for features and combined features
base_combined_directory = f"combined_features/combined_features({combination})"

# Select the model's name according to the combination you're trying for
model_names = [
    "Llama-3.2-3B", 
    # "Llama-3.2-3B-Instruct", 
    "Llama-3.2-11B-Vision", 
    # "Llama-3.2-11B-Vision-Instruct"
]

# define directory where you want to store the checkpoints
checkpoint_dir = f'checkpoints/checkpoints({combination})'

# defined directory to store the evaluation (csv) file
results_directory = 'evaluation'

# evaluation file name
evaluation_file = f'checkpoints({combination})_100'

# combined_dir = base_combined_directory

In [23]:
# Define paths to your feature data
train_machine_path = f'{base_combined_directory}/train/machine/combined_train_machine_features.npy'
train_human_path = f'{base_combined_directory}/train/human/combined_train_human_features.npy'
dev_machine_path = f'{base_combined_directory}/dev/machine/combined_dev_machine_features.npy'
dev_human_path = f'{base_combined_directory}/dev/human/combined_dev_human_features.npy'


### Copying feature's files from timing model's f`iles and create combinations of individual features in featured directory

In [24]:
import os
import shutil


# combination = '3B+11B'
# # Define the base directories
# models_base_directory = "timing"  # Base directory where model folders are located
# features_base_directory = f"features/features({combination})"  # Directory where you want to store features

# # Define the base directories for features and combined features
# base_combined_directory = f"combined_features/combined_features({combination})"

# # List of models to process
# model_names = [
#     "Llama-3.2-3B", 
#     "Llama-3.2-3B-Instruct", 
#     "Llama-3.2-11B-Vision", 
#     "Llama-3.2-11B-Vision-Instruct"
# ]

# Scenarios to loop through (dev/train and human/machine)
scenarios = [
    ("train", "human"),
    ("train", "machine"),
    ("dev", "human"),
    ("dev", "machine")
]

# Iterate through each scenario
for scenario in scenarios:
    split, category = scenario
    
    # Create scenario-specific directories in the features_base_directory
    destination_scenario_directory = os.path.join(features_base_directory, split, category)
    os.makedirs(destination_scenario_directory, exist_ok=True)
    
    # Iterate through each model in the list
    for model_name in model_names:
        print(f"Processing model: {model_name} for scenario: {split}/{category}")
        
        # Define the source directory for the current model and scenario
        source_directory = os.path.join(models_base_directory, model_name, split, category)
        
        # List all .npy files in the source directory and move them to the destination
        for file_name in os.listdir(source_directory):
            if file_name.endswith('.npy'):
                source_file_path = os.path.join(source_directory, file_name)
                
                # Define the new file name in the format: model_name_scenario.npy
                new_file_name = f"{model_name}_{split}_{category}.npy"
                destination_file_path = os.path.join(destination_scenario_directory, new_file_name)
                
                # Copy the feature file to the destination and rename it
                shutil.copy(source_file_path, destination_file_path)
                print(f"Copied {file_name} from {model_name} to {destination_file_path}")

print("Feature extraction and copying completed.")


Processing model: Llama-3.2-3B for scenario: train/human
Copied Llama-3.2-3B_features(label0_train2).npy from Llama-3.2-3B to features/features(11B,3B)/train/human/Llama-3.2-3B_train_human.npy
Processing model: Llama-3.2-11B-Vision for scenario: train/human
Copied Llama-3.2-11B-Vision_features(label0_train2).npy from Llama-3.2-11B-Vision to features/features(11B,3B)/train/human/Llama-3.2-11B-Vision_train_human.npy
Processing model: Llama-3.2-3B for scenario: train/machine
Copied Llama-3.2-3B_features(label1_train3).npy from Llama-3.2-3B to features/features(11B,3B)/train/machine/Llama-3.2-3B_train_machine.npy
Processing model: Llama-3.2-11B-Vision for scenario: train/machine
Copied Llama-3.2-11B-Vision_features(label1_train3).npy from Llama-3.2-11B-Vision to features/features(11B,3B)/train/machine/Llama-3.2-11B-Vision_train_machine.npy
Processing model: Llama-3.2-3B for scenario: dev/human
Copied Llama-3.2-3B_features(label0_dev2).npy from Llama-3.2-3B to features/features(11B,3B)/dev/

### Combining the features stored in the features directory for specified combinations

In [25]:
import os
import numpy as np

# Define the base directories for features and combined features
# features_base_directory = "features/features(3B+11B)"
# base_combined_directory = "combined_features/combined_features(3B+11B)"

# Scenarios to loop through
scenarios = [
    ("train", "human"),
    ("train", "machine"),
    ("dev", "human"),
    ("dev", "machine")
]

# Loop through each scenario and process the files
for scenario in scenarios:
    split, category = scenario

    # Define the directories based on the scenario
    features_directory = os.path.join(features_base_directory, split, category)
    print('---features_directory -----', features_directory)
    combined_features_directory = os.path.join(base_combined_directory, split, category)
    os.makedirs(combined_features_directory, exist_ok=True)
    # List all the .npy files in the directory
    features_list = []
    for file_name in os.listdir(features_directory):
        if file_name.endswith('.npy'):
            # Load each .npy file and append it to the features_list
            print(f"Processing file: {file_name} for {split}/{category}")
            file_path = os.path.join(features_directory, file_name)
            print('-------file_path-------', file_path)
            features_list.append(np.load(file_path))
            for i, feature in enumerate(features_list):
                print(f"Shape of array {i}: {feature.shape}")


    # Combine features along a new axis (e.g., axis=2)
    combined_features = np.concatenate(features_list, axis=2)

    # Create the directory to save the combined features if it doesn't exist
    os.makedirs(combined_features_directory, exist_ok=True)

    # Define the combined file name based on the scenario
    combined_file_name = f"combined_{split}_{category}_features.npy"
    combined_file_path = os.path.join(combined_features_directory, combined_file_name)

    # Save the combined features
    np.save(combined_file_path, combined_features)

    print(f"Combined features saved to '{combined_file_path}'.")


---features_directory ----- features/features(11B,3B)/train/human
Processing file: Llama-3.2-11B-Vision_train_human.npy for train/human
-------file_path------- features/features(11B,3B)/train/human/Llama-3.2-11B-Vision_train_human.npy
Shape of array 0: (228922, 127, 3)
Processing file: Llama-3.2-3B_train_human.npy for train/human
-------file_path------- features/features(11B,3B)/train/human/Llama-3.2-3B_train_human.npy
Shape of array 0: (228922, 127, 3)
Shape of array 1: (228922, 127, 3)
Combined features saved to 'combined_features/combined_features(11B,3B)/train/human/combined_train_human_features.npy'.
---features_directory ----- features/features(11B,3B)/train/machine
Processing file: Llama-3.2-3B_train_machine.npy for train/machine
-------file_path------- features/features(11B,3B)/train/machine/Llama-3.2-3B_train_machine.npy
Shape of array 0: (381845, 127, 3)
Processing file: Llama-3.2-11B-Vision_train_machine.npy for train/machine
-------file_path------- features/features(11B,3B)

In [26]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


PyTorch version: 2.4.1+cu121
CUDA available: True


##  Training part

# Model Definition

In [27]:
# Define the LLMIXTIC model using PyTorch. This model consists of a Transformer encoder.
import torch
import torch.nn as nn

class LLMIXTIC(nn.Module):
    def __init__(self, input_dim, num_classes=2):
        super(LLMIXTIC, self).__init__()
        self.fc = nn.Linear(input_dim, 128)  # Project to 128 dimensions
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=128, nhead=4), num_layers=1
        )
        self.classifier = nn.Linear(128, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc(x))  # Linear layer followed by ReLU
        x = self.transformer_encoder(x)  # Transformer Encodertrain_texts
        x = x.mean(dim=1)  # Average pooling
        return self.classifier(x)  # Classifier layer

# Training

In [28]:
import numpy as np

# combined_dir = f'combined_features/combined_features({combination})'
# checkpoint_dir = f'checkpoints/checkpoints({combination})_test'
# evaluation_file = f'checkpoints({combination})_test_vishnu'

# dev_combined_dir = f'combined_features/combined_features({combination})'

# # Define paths to your feature data
# train_machine_path = f'{combined_dir}/train/machine/combined_train_machine_features.npy'
# train_human_path = f'{combined_dir}/train/human/combined_train_human_features.npy'
# dev_machine_path = f'{combined_dir}/dev/machine/combined_dev_machine_features.npy'
# dev_human_path = f'{combined_dir}/dev/human/combined_dev_human_features.npy'

# Load training and dev features
train_machine_features = np.load(train_machine_path)
train_human_features = np.load(train_human_path)

dev_machine_features = np.load(dev_machine_path)      # Label 1 for machine-generated
dev_human_features = np.load(dev_human_path )  

In [29]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
# Create labels for the training and dev sets
train_machine_labels = np.ones(train_machine_features.shape[0])  # Label 1 for machine-generated
train_human_labels = np.zeros(train_human_features.shape[0])     # Label 0 for human-generated

dev_machine_labels = np.ones(dev_machine_features.shape[0])      # Label 1 for machine-generated
dev_human_labels = np.zeros(dev_human_features.shape[0])         # Label 0 for human-generated

# Combine the features and labels
train_features = np.concatenate([train_machine_features, train_human_features], axis=0)
train_labels = np.concatenate([train_machine_labels, train_human_labels], axis=0)

dev_features = np.concatenate([dev_machine_features, dev_human_features], axis=0)
dev_labels = np.concatenate([dev_machine_labels, dev_human_labels], axis=0)

# Convert to torch tensors
train_features_tensor = torch.tensor(train_features, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)
dev_labels_tensor = torch.tensor(dev_labels, dtype=torch.long)


In [30]:
# Create TensorDatasets
train_dataset = TensorDataset(train_features_tensor, train_labels_tensor)
dev_dataset = TensorDataset(dev_features_tensor, dev_labels_tensor)

# Create DataLoaders
batch_size = 32  # Define your batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import os

# Assume train_features is defined
input_dim = train_features.shape[-1]  # Input dimension from the feature size
model = LLMIXTIC(input_dim)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Create checkpoint directory if it doesn't exist
checkpoint_dir = f'{checkpoint_dir}'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# Training loop with checkpoints and progress bar
num_epochs = 5
checkpoint_interval = 1  # Save a checkpoint every epoch

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Initialize tqdm progress bar
    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")

        for inputs, labels in tepoch:
            inputs, labels = inputs.to(device), labels.to(device)  # Move to device
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            tepoch.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}")

    # Save model checkpoint after every epoch
    checkpoint_path = os.path.join(checkpoint_dir, f'epoch_{epoch+1}.pth')
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch
    }, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")

# Final model saving
model_save_path = 'llmixtic_model_final.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch [1/5]: 100%|██████████| 19087/19087 [00:34<00:00, 548.77batch/s, loss=0.0136] 


Epoch [1/5], Average Loss: 0.0925
Checkpoint saved: checkpoints/checkpoints(11B,3B)/epoch_1.pth


Epoch [2/5]: 100%|██████████| 19087/19087 [00:34<00:00, 551.89batch/s, loss=0.000762]


Epoch [2/5], Average Loss: 0.0358
Checkpoint saved: checkpoints/checkpoints(11B,3B)/epoch_2.pth


Epoch [3/5]: 100%|██████████| 19087/19087 [00:34<00:00, 550.98batch/s, loss=0.00502] 


Epoch [3/5], Average Loss: 0.0249
Checkpoint saved: checkpoints/checkpoints(11B,3B)/epoch_3.pth


Epoch [4/5]: 100%|██████████| 19087/19087 [00:34<00:00, 558.16batch/s, loss=0.0385]  


Epoch [4/5], Average Loss: 0.0186
Checkpoint saved: checkpoints/checkpoints(11B,3B)/epoch_4.pth


Epoch [5/5]: 100%|██████████| 19087/19087 [00:34<00:00, 551.52batch/s, loss=0.00119] 


Epoch [5/5], Average Loss: 0.0149
Checkpoint saved: checkpoints/checkpoints(11B,3B)/epoch_5.pth
Model saved to llmixtic_model_final.pth


# Evaluation

### for GPU 

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm  # Progress bar
import os
import pandas as pd  # Import pandas for handling CSV files

In [33]:
# # Selecting a specific checkpoint
# checkpoint_file = f'{checkpoint_dir}/epoch_5.pth'
# # Load the checkpoint 
# checkpoint = torch.load(checkpoint_file, weights_only=True)

# # Print the keys in the checkpoint
# print(checkpoint.keys())

In [34]:
# Recreate the model and load the state_dict from checkpoint
input_dim = train_features.shape[-1]  # Input dimension should be same as during training
model = LLMIXTIC(input_dim)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to the specified device



LLMIXTIC(
  (fc): Linear(in_features=6, out_features=128, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=128, out_features=2, bias=True)
)

###  Uncomment when using a stored checkpoint

In [35]:

# model.load_state_dict(checkpoint['model_state_dict'])  # Uncomment when using a stored checkpoint

In [36]:
# Define the evaluation function
def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    correct = 0  # Track total number of correct predictions
    total = 0    # Track total number of samples

    # Training loop with checkpoints and progress bar
   
    with torch.no_grad():
        for inputs, labels in dataloader:
            # Move the inputs and labels to the GPU
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            
            # Calculate accuracy and track correct outputs
            total += labels.size(0)
            batch_correct = (predicted == labels).sum().item()
            correct += batch_correct  # Accumulate correct predictions

    accuracy = 100 * correct / total if total > 0 else 0
    return accuracy, correct, total


In [37]:
# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=['epoch_no', 'num_inputs_processed', 'num_correct_outputs', 'accuracy'])

num_epochs = 5
checkpoint_interval = 1  # Save a checkpoint every epoch

# List to store results for each epoch
results = []

# Evaluate the model loaded from checkpoint on the dev dataset
for epoch in range(1, num_epochs + 1):  # Assuming num_epochs is defined
    # Load the checkpoint for the current epoch
    checkpoint_file = f'{checkpoint_dir}/epoch_{epoch}.pth'
    checkpoint = torch.load(checkpoint_file, weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])  # Load the model state_dict

    # Evaluate the model
    dev_accuracy, correct_outputs, total_samples = evaluate_model(model, dev_loader)

    # Store the results in the list
    results.append({
        'epoch_no': epoch,
        'num_inputs_processed': total_samples,
        'num_correct_outputs': correct_outputs,
        'accuracy': dev_accuracy
    })

    # Print results for the current epoch
    print(f"Epoch {epoch} - Dev Accuracy: {dev_accuracy:.2f}%, Total Correct Outputs: {correct_outputs}/{total_samples}")

Epoch 1 - Dev Accuracy: 98.89%, Total Correct Outputs: 258846/261758
Epoch 2 - Dev Accuracy: 99.24%, Total Correct Outputs: 259779/261758
Epoch 3 - Dev Accuracy: 99.35%, Total Correct Outputs: 260047/261758
Epoch 4 - Dev Accuracy: 99.26%, Total Correct Outputs: 259811/261758
Epoch 5 - Dev Accuracy: 99.67%, Total Correct Outputs: 260903/261758


In [38]:
# Convert the list of results into a DataFrame
results_df = pd.DataFrame(results)

# Create the evaluation directory if it doesn't exist
# results_directory = 'evaluation'
os.makedirs(results_directory, exist_ok=True)

# Save the results to a CSV file
results_csv_path = f'{results_directory}/{evaluation_file}.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"Evaluation results saved to {results_csv_path}")

Evaluation results saved to evaluation/checkpoints(11B,3B)_100.csv
