In [1]:
import pandas as pd
import numpy as np
df = pd.read_excel("techniques.xlsx")
texts = df["text"].to_list()
labels = df["technique_id"].to_list()

In [2]:
# list(sorted(set(labels)))
df

Unnamed: 0,text,technique_id
0,Adversaries may circumvent mechanisms designed...,T1548
1,An adversary may abuse configurations where an...,T1548
2,Adversaries may bypass UAC mechanisms to eleva...,T1548
3,Adversaries may perform sudo caching and/or us...,T1548
4,Adversaries may leverage the AuthorizationExec...,T1548
...,...,...
12001,Adversaries may bypass application control and...,T1220
12002,Astaroth executes embedded JScript or VBScript...,T1220
12003,Cobalt Group used msxsl.exe to bypass AppLocke...,T1220
12004,Higaisa used an XSL file to run VBScript code.,T1220


In [4]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from torch import nn
from sklearn.metrics import accuracy_score

# 讀取數據
df = pd.read_excel("techniques.xlsx")
texts = df['text'].tolist()
labels = df['technique_id'].tolist()
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,technique_id
0,Adversaries may circumvent mechanisms designed...,T1548
1,An adversary may abuse configurations where an...,T1548
2,Adversaries may bypass UAC mechanisms to eleva...,T1548
3,Adversaries may perform sudo caching and/or us...,T1548
4,Adversaries may leverage the AuthorizationExec...,T1548


In [5]:
# 統計每個標籤的出現次數
label_counts = df['technique_id'].value_counts()

# 過濾出至少有 50 筆資料的標籤
labels_to_use = label_counts[label_counts >= 50].index

# 只保留這些標籤的資料
filtered_df = df[df['technique_id'].isin(labels_to_use)]

# 提取文本和標籤
texts = filtered_df['text'].tolist()
labels = filtered_df['technique_id'].tolist()
len(labels_to_use)

63

In [6]:

# 將文本標籤映射到整數
label_to_id = {label: id for id, label in enumerate(set(labels))}
id_to_label = {id: label for label, id in label_to_id.items()}
labels = [label_to_id[label] for label in labels]

# 現在 labels 是整數列表，可以轉換為張量


# 定義自定義模型
class CustomModel(nn.Module):
    def __init__(self, bert_model, sentence_embedding_dim, num_classes):
        super(CustomModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(self.bert.config.hidden_size + sentence_embedding_dim, num_classes)

    def forward(self, input_ids, attention_mask, sentence_embeddings):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        combined = torch.cat((pooled_output, sentence_embeddings), dim=1)
        return self.fc(combined)

# 設置模型和 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')


# 設置數據集
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
        self.labels = labels
        self.sentence_embeddings = sentence_model.encode(texts, convert_to_tensor=True)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # 確保是長整型
        item['sentence_embeddings'] = self.sentence_embeddings[idx]
        return item

    def __len__(self):
        return len(self.labels)


# 分割數據
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1)
train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

# 訓練參數
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomModel(bert_model, sentence_model.get_sentence_embedding_dimension(), len(set(labels)))
model.to(device)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
optim = AdamW(model.parameters(), lr=5e-5)

print(model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)




In [9]:
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch.nn as nn
import numpy as np

epochs = 50
best_f1 = 0.0  # 初始化最佳 F1-Score
model_path = 'best_model.pth'  # 模型保存路徑

for epoch in range(epochs):
    model.train()
    train_loop = tqdm(train_loader, leave=True)
    for batch in train_loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentence_embeddings = batch['sentence_embeddings'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask, sentence_embeddings)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optim.step()

        train_loop.set_description(f'Training Epoch {epoch+1}')
        train_loop.set_postfix(loss=loss.item())

    # 評估模型
    model.eval()
    val_loop = tqdm(val_loader, leave=True)
    all_predictions = []
    all_true_labels = []
    for batch in val_loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentence_embeddings = batch['sentence_embeddings'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask, sentence_embeddings)
        logits = outputs
        predictions = torch.argmax(logits, dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

    # 計算指標
    precision, recall, f1, _ = precision_recall_fscore_support(all_true_labels, all_predictions, average='weighted')
    accuracy = accuracy_score(all_true_labels, all_predictions)

    # 更新進度條
    val_loop.set_description(f'Validation Epoch {epoch+1}')
    val_loop.set_postfix(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

    # 如果這是迄今為止最好的 F1-Score，則保存模型
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), model_path)
        print(f"New best F1-Score: {best_f1}. Model saved at '{model_path}'.")

print(f"Training completed. Best F1-Score: {best_f1}.")


Training Epoch 1: 100%|██████████| 71/71 [01:16<00:00,  1.07s/it, loss=0.888]
100%|██████████| 8/8 [00:02<00:00,  2.79it/s]
  _warn_prf(average, modifier, msg_start, len(result))


New best F1-Score: 0.8864521794613219. Model saved at 'best_model.pth'.


Training Epoch 2: 100%|██████████| 71/71 [01:17<00:00,  1.10s/it, loss=0.449]
100%|██████████| 8/8 [00:02<00:00,  2.77it/s]
  _warn_prf(average, modifier, msg_start, len(result))


New best F1-Score: 0.9274006251853063. Model saved at 'best_model.pth'.


Training Epoch 3: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.304]
100%|██████████| 8/8 [00:02<00:00,  2.77it/s]


New best F1-Score: 0.9523730250982124. Model saved at 'best_model.pth'.


Training Epoch 4: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.141] 
100%|██████████| 8/8 [00:02<00:00,  2.77it/s]


New best F1-Score: 0.9570674801671032. Model saved at 'best_model.pth'.


Training Epoch 5: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.144] 
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 6: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.0401]
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 7: 100%|██████████| 71/71 [01:18<00:00,  1.11s/it, loss=0.26]  
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 8: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.0262]
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 9: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.0262]
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 10: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.0148]
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 11: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.0181]
100%|██████████| 8/8 [00:02<00:00,  2.76it/s]
Training Epoch 12: 100%|██████████| 71/71 [01:18<00:00,  1.10s/it, loss=0.0256]
100%|██████████| 8/8 [00:02<00:00,  2.76it/

Training completed. Best F1-Score: 0.9570674801671032.





In [10]:
from sklearn.metrics import classification_report
model_path = 'best_model.pth'
model.load_state_dict(torch.load(model_path))
# 評估模型
model.eval()
all_predictions = []
all_true_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentence_embeddings = batch['sentence_embeddings'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask, sentence_embeddings)
        logits = outputs
        predictions = torch.argmax(logits, dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

label_names = [id_to_label[id] for id in sorted(id_to_label.keys())]
# 計算並列印每個標籤的 F1-score、precision 和 recall
class_report = classification_report(all_true_labels, all_predictions, target_names=label_names, zero_division=0)
print(class_report)


              precision    recall  f1-score   support

       T1204       1.00      1.00      1.00        20
       T1018       1.00      1.00      1.00         9
       T1087       0.88      1.00      0.93         7
       T1021       0.94      0.89      0.91        18
       T1070       1.00      0.98      0.99        44
       T1082       1.00      1.00      1.00        38
       T1569       0.60      1.00      0.75         3
       T1518       1.00      1.00      1.00        11
       T1218       1.00      1.00      1.00        19
       T1573       0.82      0.88      0.85        16
       T1560       0.90      0.95      0.92        19
       T1203       1.00      1.00      1.00         5
       T1102       0.60      0.60      0.60         5
       T1562       0.92      1.00      0.96        12
       T1016       0.94      1.00      0.97        32
       T1090       1.00      1.00      1.00        16
       T1552       0.67      0.29      0.40         7
       T1041       1.00    

In [11]:
label_names = [id_to_label[id] for id in sorted(id_to_label.keys())]
label_names

['T1204',
 'T1018',
 'T1087',
 'T1021',
 'T1070',
 'T1082',
 'T1569',
 'T1518',
 'T1218',
 'T1573',
 'T1560',
 'T1203',
 'T1102',
 'T1562',
 'T1016',
 'T1090',
 'T1552',
 'T1041',
 'T1106',
 'T1049',
 'T1134',
 'T1036',
 'T1583',
 'T1140',
 'T1574',
 'T1074',
 'T1135',
 'T1564',
 'T1110',
 'T1113',
 'T1546',
 'T1012',
 'T1033',
 'T1057',
 'T1555',
 'T1112',
 'T1007',
 'T1047',
 'T1548',
 'T1055',
 'T1124',
 'T1119',
 'T1543',
 'T1588',
 'T1046',
 'T1003',
 'T1553',
 'T1083',
 'T1069',
 'T1497',
 'T1059',
 'T1132',
 'T1547',
 'T1005',
 'T1056',
 'T1027',
 'T1008',
 'T1071',
 'T1105',
 'T1078',
 'T1095',
 'T1053',
 'T1566']

In [17]:
import pandas as pd

file_path = '../dataset/ground_truth/llama2_finetuned_dataset.xlsx'

# 讀取 Excel 檔案
df = pd.read_excel(file_path)

# 顯示 DataFrame
print(df)


                                                INPUT  \
0   All observed attacks start with an email messa...   
1   Benign activity ran for most of the morning wh...   
2   The attack started by browsing to http://128.5...   
3   First attacked ta51-pivot-2 and deployed OC2, ...   
4   The attacker first tried to attack from an out...   
5   Copied files via SCP and connected via SSH fro...   
6   The threat actors sent the trojanized Microsof...   
7   The Adobe_Flash_install.rar archive that was r...   
8   Here is the email content, masquerading as an ...   
9   The attackers would usually send a malicious e...   
10  This spyware arrives on a system as a file dro...   
11  As we might expect, if the user decides to dow...   
12  The first task of the malware is to install th...   
13  The first task of the malware is to install th...   
14  The group used tactics that have become extrem...   
15  On February 11, FireEye identified a zero-day ...   
16  Jaguar Tooth is non-persist

In [18]:
input_list = df["INPUT"].tolist()
input_list

["All observed attacks start with an email message, containing either a malicious attachment or a URL which leads to the first stage of the attack. The text of the emails is likely taken from legitimate email, such as mailing lists that targeted organizations may be subscribed to. Below are three examples, with the first one purporting to be sent by the European Banking Federation and is using a newly registered domain for the spoofed sender email address. The attachment is a malicious PDF file that entices the user to click on a URL to download and open a weaponized RTF file containing exploits for CVE-2017-11882, CVE-2017-8570 and CVE-2018-8174. The final payload is a JScript backdoor also known as More_eggs that allows the attacker to control the affected system remotely.\nNotable applications used in these attacks are cmstp and msxsl. The Microsoft Connection Manager Profile Installer (cmstp.exe) is a command-line program used to install Connection Manager service profiles. Cmstp a

In [29]:
import nltk
from nltk.tokenize import sent_tokenize

input_new_list = []

for text in input_list:
    sentences = sent_tokenize(text)
    input_new_list.append(sentences)


input_new_list

[['All observed attacks start with an email message, containing either a malicious attachment or a URL which leads to the first stage of the attack.',
  'The text of the emails is likely taken from legitimate email, such as mailing lists that targeted organizations may be subscribed to.',
  'Below are three examples, with the first one purporting to be sent by the European Banking Federation and is using a newly registered domain for the spoofed sender email address.',
  'The attachment is a malicious PDF file that entices the user to click on a URL to download and open a weaponized RTF file containing exploits for CVE-2017-11882, CVE-2017-8570 and CVE-2018-8174.',
  'The final payload is a JScript backdoor also known as More_eggs that allows the attacker to control the affected system remotely.',
  'Notable applications used in these attacks are cmstp and msxsl.',
  'The Microsoft Connection Manager Profile Installer (cmstp.exe) is a command-line program used to install Connection Man

In [30]:
with open('sample.txt', 'r', encoding='utf-8') as file:
    text = file.read()

sentences = text.split('.')

sentences = [sentence.strip() for sentence in sentences]


sentences[:5]

['Unveiling NKAbuse: a new multiplatform threat abusing the NKN protocol\n\nDuring an incident response performed by Kaspersky’s Global Emergency Response Team (GERT) and GReAT, we uncovered a novel multiplatform threat named “NKAbuse”',
 'The malware utilizes\xa0NKN technology\xa0for data exchange between peers, functioning as a potent implant, and equipped with both flooder and backdoor capabilities',
 'Written in Go, it is flexible enough to generate binaries compatible with various architectures',
 'Our analysis suggests that the primary target of NKAbuse is Linux desktops',
 'However, in view of its ability to infect MISP and ARM systems, it also poses a threat to IoT devices']

In [32]:
from sklearn.metrics import classification_report

val_texts = sentences
val_labels = [0] * len(val_texts)
model_path = 'best_model.pth'  # 模型保存路徑
model.load_state_dict(torch.load(model_path))
# 評估模型
model.eval()

# 設置數據集
val_dataset = TextDataset(val_texts, val_labels)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

import torch.nn.functional as F

all_predictions = []
all_true_labels = []
prediction_probabilities = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        sentence_embeddings = batch['sentence_embeddings'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask, sentence_embeddings)
        logits = outputs

        # 計算 softmax 機率
        probabilities = F.softmax(logits, dim=-1)

        # 獲取最高機率的預測
        predictions = torch.argmax(probabilities, dim=-1)
        
        # 提取每個預測的最高機率值
        max_probabilities = probabilities.max(dim=-1)[0]

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())
        prediction_probabilities.extend(max_probabilities.cpu().numpy())


label_names = [id_to_label[id] for id in sorted(id_to_label.keys())]

label_mapping = {id: name for id, name in enumerate(label_names)}

predicted_labels = [label_mapping[prediction] for prediction in all_predictions]



# # 計算並列印每個標籤的 F1-score、precision 和 recall
# class_report = classification_report(all_true_labels, all_predictions, target_names=label_names, zero_division=0)
# print(class_report)

print(predicted_labels)

['T1008', 'T1041', 'T1027', 'T1008', 'T1562', 'T1105', 'T1547', 'T1055', 'T1095', 'T1583', 'T1008', 'T1095', 'T1071', 'T1203', 'T1036', 'T1059', 'T1105', 'T1059', 'T1059', 'T1082', 'T1105', 'T1074', 'T1027', 'T1497', 'T1547', 'T1082', 'T1074', 'T1027', 'T1016', 'T1083', 'T1074', 'T1074', 'T1053', 'T1573', 'T1053', 'T1095', 'T1078', 'T1553', 'T1041', 'T1497', 'T1090', 'T1059', 'T1008', 'T1041', 'T1090', 'T1102', 'T1027', 'T1027', 'T1059', 'T1497', 'T1082', 'T1113', 'T1113', 'T1083', 'T1057', 'T1027', 'T1041', 'T1573', 'T1102', 'T1090', 'T1553', 'T1005', 'T1497', 'T1059', 'T1027', 'T1583', 'T1588', 'T1105', 'T1018', 'T1588']


In [None]:
import pandas as pd

# 將這兩個列表合併到一個 DataFrame 中
df = pd.DataFrame({
    'Text': val_texts,
    'Predicted Label': predicted_labels,
    'Prediction Probabilities':prediction_probabilities
})

# 將 DataFrame 存儲為 Excel 文件
excel_path = 'predicted_labels.xlsx'
df.to_excel(excel_path, index=False)

excel_path



'predicted_labels.xlsx'

: 

In [37]:
#----  用來跑 ground truth 的程式碼 ----

from sklearn.metrics import classification_report

def predict(input_text):
    sentences = sent_tokenize(input_text)
    val_labels = [0] * len(val_texts)
    model_path = 'best_model.pth'  # 模型保存路徑
    model.load_state_dict(torch.load(model_path))
    # 評估模型
    model.eval()

    # 設置數據集
    val_dataset = TextDataset(val_texts, val_labels)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

    import torch.nn.functional as F

    all_predictions = []
    all_true_labels = []
    prediction_probabilities = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            sentence_embeddings = batch['sentence_embeddings'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, sentence_embeddings)
            logits = outputs

            # 計算 softmax 機率
            probabilities = F.softmax(logits, dim=-1)

            # 獲取最高機率的預測
            predictions = torch.argmax(probabilities, dim=-1)
            
            # 提取每個預測的最高機率值
            max_probabilities = probabilities.max(dim=-1)[0]

            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(labels.cpu().numpy())
            prediction_probabilities.extend(max_probabilities.cpu().numpy())


    label_names = [id_to_label[id] for id in sorted(id_to_label.keys())]

    label_mapping = {id: name for id, name in enumerate(label_names)}

    predicted_labels = [label_mapping[prediction] for prediction in all_predictions]
    return list(zip(sentences,predicted_labels,prediction_probabilities))

df["TTP"] = df["INPUT"].apply(predict)

In [39]:
df

Unnamed: 0,INPUT,IoC,TTP,TA,OUTPUT,GRAPH,URL
0,All observed attacks start with an email messa...,"{'urls': [], 'xmpp_addresses': [], 'email_addr...",[(All observed attacks start with an email mes...,,,,https://github.com/li-zhenyuan/Knowledge-enhan...
1,Benign activity ran for most of the morning wh...,{'urls': ['http://215.237.119.171/config.html....,[(Benign activity ran for most of the morning ...,,,,
2,The attack started by browsing to http://128.5...,{'urls': ['http://128.55.12.167:8641/config.ht...,[(The attack started by browsing to http://128...,,,,
3,"First attacked ta51-pivot-2 and deployed OC2, ...","{'urls': ['http://128.55.12.233'],\n 'xmpp_add...",[(First attacked ta51-pivot-2 and deployed OC2...,,,,
4,The attacker first tried to attack from an out...,"{'urls': [],\n 'xmpp_addresses': [],\n 'email_...",[(The attacker first tried to attack from an o...,,,,
5,Copied files via SCP and connected via SSH fro...,"{'urls': [],\n 'xmpp_addresses': [],\n 'email_...",[(Copied files via SCP and connected via SSH f...,,,,
6,The threat actors sent the trojanized Microsof...,{'urls': ['http://droobox.online:80/luncher.do...,[(The threat actors sent the trojanized Micros...,,,,
7,The Adobe_Flash_install.rar archive that was r...,"{'urls': ['summerevent.webhop.net/QuUA'],\n 'x...",[(The Adobe_Flash_install.rar archive that was...,,,,
8,"Here is the email content, masquerading as an ...","{'urls': ['http://bot.whatismyipaddress.com'],...","[(Here is the email content, masquerading as a...",,,,
9,The attackers would usually send a malicious e...,"{'urls': [],\n 'xmpp_addresses': [],\n 'email_...",[(The attackers would usually send a malicious...,,,,


In [40]:
df.to_excel("llama2_finetuned_dataset_new.xlsx",index=False)

: 