In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# %%capture (which has to stay on the first line of the cell) hides the pip output.
# NOTE(review): Unsloth is installed straight from the GitHub repository with no
# version or commit pin, so re-running this notebook later may pull different code.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# Torch older than 2.4.0 gets the pinned xformers 0.0.27; anything newer gets an
# unpinned "xformers" requirement.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# {xformers} is filled in from the Python variable above. --no-deps installs only
# the listed packages, without resolving their own dependencies.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel
import torch
# Settings forwarded to FastLanguageModel.from_pretrained at the bottom of this cell.
# NOTE(review): the trainer cell further down passes max_seq_length=512 to SFTTrainer
# rather than this value — confirm which limit is intended.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this cell reads `fourbit_models`; the checkpoint
# that is actually loaded is the hard-coded `model_name` in the call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the base model together with its tokenizer; both names are reused by the
# LoRA, training and saving cells later in the notebook.
# NOTE(review): `token` is an empty string. If an access token is ever required,
# read it from an environment variable or a secrets store instead of pasting it
# into the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = ""
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Projection layers that receive LoRA adapters, grouped by where they sit in a block.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# All adapter settings in one mapping so they can be inspected before being applied.
lora_settings = dict(
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=attention_projections + mlp_projections,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
import json

# Read the tab-separated question sheet and trim stray whitespace from its headers.
df = pd.read_csv('/content/output_questions (2).tsv', sep='\t')
df.columns = df.columns.str.strip()

# Option letter -> spreadsheet column holding that option's text.
option_columns = {
    'A': 'Option A',
    'B': 'Option B',
    'C': 'Option C',
    'D': 'Option D',
}

# One JSON-ready record per spreadsheet row, in the original row order.
data = [
    {
        'question': row['Question'],
        'options': {letter: row[column] for letter, column in option_columns.items()},
        'correct_answer': row['Correct Answer'],
    }
    for _, row in df.iterrows()
]

# Write the records out as JSON Lines (one object per line).
with open('/content/dataJson.jsonl', 'w') as f:
    f.writelines(json.dumps(item) + '\n' for item in data)


In [None]:
import json

input_file = "/content/dataJson.jsonl"
output_file = "/content/cleaned_dataJson.jsonl"


def sanitize_options(options):
    """Coerce every option value to ``str``.

    The mapping is updated in place and the very same object is returned, so
    callers may either use the return value or keep using their own reference.
    Keys and their order are left untouched.
    """
    options.update({label: str(choice) for label, choice in options.items()})
    return options

# Stream the raw JSONL through sanitize_options into a new file, line by line.
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for line in infile:
        try:
            example = json.loads(line)
        except json.JSONDecodeError as e:
            # Unparseable line: report it and write nothing for it.
            print(f"Error decoding JSON: {e}")
        else:
            # Only records that carry an 'options' field need their values coerced.
            if 'options' in example:
                example['options'] = sanitize_options(example['options'])
            outfile.write(json.dumps(example) + "\n")

print("Dataset has been cleaned and saved to", output_file)


Dataset has been cleaned and saved to /content/cleaned_dataJson.jsonl


In [None]:
import json

input_file_path = '/content/cleaned_dataJson.jsonl'
output_file_path = '/content/cleaned_dataJson_fixed.jsonl'

# Top-level fields that must be plain strings in the output file.
string_fields = ('question', 'correct_answer')

with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        try:
            record = json.loads(line)
            # Coerce each field that is present; absent fields are left absent.
            for field in string_fields:
                if field in record:
                    record[field] = str(record[field])
            # One JSON object per output line.
            json.dump(record, outfile)
            outfile.write('\n')
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
        except Exception as e:
            print(f"Error processing record: {e}")

print(f"Preprocessing complete. Fixed file saved as {output_file_path}")


Preprocessing complete. Fixed file saved as /content/cleaned_dataJson_fixed.jsonl


In [None]:
# Use the fixed JSONL file
file_path = "/content/cleaned_dataJson_fixed.jsonl"

# Load and format the dataset
# NOTE(review): in the notebook as shown, `load_dataset` is imported and
# `formatting_prompts_func` is defined only in the NEXT cell. On a fresh kernel this
# cell therefore hits a NameError, which the broad `except Exception` below reduces
# to a printed "Dataset generation error" message. The next cell repeats these same
# load-and-map steps after defining both names, so this cell looks redundant —
# confirm and consider removing it or moving it below the definitions.
try:
    dataset = load_dataset("json", data_files={"train": file_path}, split="train")
    # Apply the formatting function to the dataset with batching for efficiency
    formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

    # Print the first formatted example to verify
    print(f"Formatted text: {formatted_dataset[0]['text']}")
except Exception as e:
    # Any failure is only printed; `dataset` / `formatted_dataset` may be left unset.
    print(f"Dataset generation error: {e}")


Formatted text: Answer the following multiple-choice question.
Which MITRE ATT&CK technique involves an adversary directly interacting with a victim to manipulate them into divulging confidential information?
Options:
A: Spearphishing Attachment (T1193)
B: Social Engineering (T1140)
C: Remote Access Tools (T1219)
D: Man-in-the-Middle Attack (T1557)
The correct answer is B.


In [None]:
import re
from typing import Dict, Any, List
from datasets import load_dataset

def formatting_prompts_func(
    examples: Dict[str, Any],
    eos_token: str = "",
    verbose: bool = False,
) -> Dict[str, Any]:
    """Turn a batch of multiple-choice records into training prompts.

    Args:
        examples: Batch in column form with the keys ``"question"``,
            ``"options"`` and ``"correct_answer"``, each holding one entry per
            row. Every ``options`` entry is expected to be a dict with the
            keys ``A``, ``B``, ``C`` and ``D``.
        eos_token: Text appended to every generated prompt. Defaults to the
            empty string, which reproduces the prompts exactly as before. Pass
            the tokenizer's end-of-sequence token (for example through
            ``Dataset.map(..., fn_kwargs={"eos_token": ...})``) if the model
            should learn where an answer ends.
        verbose: When true, print each generated prompt. Off by default
            because printing every row floods the notebook output.

    Returns:
        ``{"text": [...]}`` with exactly one entry per input row, in input
        order. Rows whose ``options`` entry is not a dict containing all four
        keys yield ``None`` instead of a prompt, so the column stays the same
        length as the batch (a batched ``Dataset.map`` that keeps the input
        columns needs that). Filter out the ``None`` rows before training.
    """
    instruction = "Answer the following multiple-choice question."
    required_keys = ('A', 'B', 'C', 'D')
    texts: List[Any] = []

    rows = zip(examples["question"], examples["options"], examples["correct_answer"])
    for question, option, correct_answer in rows:
        has_all_options = isinstance(option, dict) and all(
            key in option for key in required_keys
        )
        if not has_all_options:
            # Keep the row's slot (as None) instead of dropping it, so the
            # output column stays aligned with the input batch.
            print(f"Skipping example with missing or invalid options: {option}")
            texts.append(None)
            continue

        input_text = (
            f"{question}\nOptions:\n"
            f"A: {option['A']}\nB: {option['B']}\nC: {option['C']}\nD: {option['D']}"
        )
        # f-string formatting already stringifies a non-string answer.
        output = f"The correct answer is {correct_answer}."
        text = f"{instruction}\n{input_text}\n{output}{eos_token}"
        texts.append(text)
        if verbose:
            print(f"Generated text: {text}")

    return {"text": texts}

# Check if file exists and is readable
import os
file_path = "/content/cleaned_dataJson_fixed.jsonl"
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist or is not accessible.")

# Load and format the dataset
try:
    dataset = load_dataset("json", data_files={"train": file_path}, split="train")
    # Apply the formatting function to the dataset with batching for efficiency
    formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

    # Print the first formatted example to verify
    print(f"Formatted text: {formatted_dataset[0]['text']}")
except Exception as e:
    print(f"Dataset generation error: {e}")
    # Re-raise so the notebook stops here. Carrying on would let later cells run
    # with `formatted_dataset` undefined, or still bound to a previous run's data.
    raise


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2518 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Correct answer: A
Generated text: Answer the following multiple-choice question.
Which MITRE ATT&CK technique involves an adversary utilizing software that is built into a system to take advantage of a trust relationship, bypassing the normal security mechanisms?
Options:
A:  A. Valid Accounts
B:  B. Exploit Public-Facing Application
C:  C. System Owner/User Discovery
D:  D. Use Alternate Authentication Material
The correct answer is A.
Processing question: Which MITRE ATT&CK technique involves an adversary collecting sensitive information from the victim's machine using a keylogger?
Options: {'A': 'Process Discovery', 'B': 'Clipboard Data', 'C': 'Audio Capture', 'D': 'Credential Dump'}
Correct answer: B
Generated text: Answer the following multiple-choice question.
Which MITRE ATT&CK technique involves an adversary collecting sensitive information from the victim's machine using a keylogger?
Options:
A: Process Discovery

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the formatted examples for evaluation, using a fixed seed.
dataset_split = formatted_dataset.train_test_split(test_size=0.2, seed=42)

# Unpack both subsets in one step.
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from sklearn.metrics import accuracy_score

# Define a metrics calculation function
def compute_metrics(eval_pred):
    """Compute accuracy from ``(predictions, labels)``.

    Args:
        eval_pred: Pair (or pair-like object) of ``predictions`` and
            ``labels``. ``predictions`` are scores whose last axis ranges over
            the candidate classes/tokens; if a tuple is given, its first
            element is used.

    Returns:
        ``{"accuracy": float}``.

    Two label layouts are handled:

    * 1-D labels (one class per example): plain accuracy of the argmax.
    * Labels with 2+ dimensions (one token id per position): assumed to be
      causal language-model labels, where the score at position ``t``
      predicts the label at ``t + 1`` and ``-100`` marks positions to ignore
      (the usual Hugging Face convention — confirm for other model types).
      Predictions are shifted accordingly and ignored positions are excluded.
      Token-classification labels, which need no shift, are not supported.

    If no position is left to score, the accuracy is reported as ``0.0``.
    """
    import numpy as np  # local import: the rest of the notebook does not import numpy

    ignore_index = -100

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the scores; keep the scores.
        predictions = predictions[0]

    predicted_ids = np.asarray(predictions).argmax(axis=-1)
    labels = np.asarray(labels)

    if labels.ndim >= 2:
        # Align "prediction made at t" with "token observed at t + 1".
        predicted_ids = predicted_ids[..., :-1]
        labels = labels[..., 1:]

    scored = labels != ignore_index
    if not scored.any():
        return {"accuracy": 0.0}

    accuracy = float((predicted_ids[scored] == labels[scored]).mean())
    return {"accuracy": accuracy}

# Define training arguments
# NOTE(review): no evaluation strategy is passed here, and the logged table below
# shows only training loss — confirm whether `eval_dataset` / `compute_metrics`
# are ever actually used during this run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 per device x 4 accumulation steps = 8 examples per optimizer step
    warmup_steps=5,
    num_train_epochs=5,  # Train for 5 epochs
    learning_rate=1e-4,
    # Exactly one of fp16 / bf16 is enabled, chosen by is_bfloat16_supported().
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,  # log the training loss at every step
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Initialize SFTTrainer with compute_metrics
# The trainer reads the "text" column of the train/eval datasets.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    # NOTE(review): 512 here differs from the max_seq_length = 2048 used when the
    # model was loaded — confirm which value is intended.
    max_seq_length=512,  # Adjust as needed
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    compute_metrics=compute_metrics,  # metric callback defined above
)

# Train the model
trainer.train()


Map (num_proc=2):   0%|          | 0/2014 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/504 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,014 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,255
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.041
2,1.9407
3,1.9363
4,1.9687
5,1.9188
6,1.7581
7,1.758
8,1.3649
9,1.3075
10,1.2694


Step,Training Loss
1,2.041
2,1.9407
3,1.9363
4,1.9687
5,1.9188
6,1.7581
7,1.758
8,1.3649
9,1.3075
10,1.2694


TrainOutput(global_step=1255, training_loss=0.5240351760767371, metrics={'train_runtime': 5445.5015, 'train_samples_per_second': 1.849, 'train_steps_per_second': 0.23, 'total_flos': 4.336572068428186e+16, 'train_loss': 0.5240351760767371, 'epoch': 4.985104270109235})

In [None]:
# Mount Google Drive at /content/drive; the save cell below writes under
# /content/drive/MyDrive. Only works inside Google Colab (google.colab module).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Save the model and the tokenizer side by side in one folder on the mounted Drive.
save_dir = "/content/drive/MyDrive/modeliiiiiiy"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('/content/drive/MyDrive/modeliiiiiiy/tokenizer_config.json',
 '/content/drive/MyDrive/modeliiiiiiy/special_tokens_map.json',
 '/content/drive/MyDrive/modeliiiiiiy/tokenizer.json')