In [None]:
!pip install peft



In [None]:
# Copy the contract-nli archive from /content/drive/MyDrive to /home/.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook.
!cp /content/drive/MyDrive/contract-nli.zip /home/

In [None]:
!unzip /home/contract-nli.zip -d /home/

Archive:  /home/contract-nli.zip
replace /home/contract-nli/dev.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
#Load libraries
import torch
from transformers import get_scheduler, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, AutoModel, AutoProcessor
from peft import LoraConfig, get_peft_model
import accelerate
from PIL import Image
import time
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
#from datasets import Dataset
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from IPython.display import Image as Im
from IPython.display import display
from torch.hub import load
from torch.optim import AdamW
from pathlib import Path
import os
import requests
import gc
from peft import PeftModel

In [None]:
# Checkpoint identifier shared by the causal-LM weights and the tokenizer.
base_checkpoint = "mistralai/Mistral-7B-v0.3"

# Load the base model weights in half precision.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, torch_dtype=torch.float16)

# The end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Optimizer over the parameters the model has at this point (no scheduler is
# created in this cell).
# NOTE(review): this is built before any adapter is attached, so parameters
# added to the model later are not tracked by this optimizer instance.
optimizer = AdamW(model.parameters(), lr=5e-6)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Prefer the GPU, but fall back to the CPU so that later `.to(device)` calls do
# not fail outright on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Define LoRA config
lora_config = LoraConfig(
    r=8,  # The rank of the decomposition
    lora_alpha=32,  # Controls the scaling of the LoRA weights
    target_modules=["q_proj", "v_proj"],  # Target projection matrices in transformer layers
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # Don't add bias
)

# Apply LoRA to the model and move it to the training device.
model = get_peft_model(model, lora_config)
model.to(device)

# Rebuild the optimizer now that the adapter matrices exist. An optimizer
# created before get_peft_model() only references the base weights, which the
# wrapper freezes; the newly injected LoRA parameters would never receive an
# update and training would leave the model unchanged.
optimizer = AdamW([param for param in model.parameters() if param.requires_grad], lr=5e-6)

# Report how many parameters are actually trainable as a sanity check.
model.print_trainable_parameters()

PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_feat

In [None]:

# Define the custom Dataset class
class TextDataset(Dataset):
    """Fixed-length, overlapping token windows built from the ``.txt`` files of a directory.

    Every text file is tokenized in full (no truncation, no special tokens) and
    cut into windows of ``max_length`` tokens whose start positions are
    ``stride`` tokens apart. A window shorter than ``max_length`` is
    right-padded and its attention mask is zero on the padded positions.

    Args:
        directory: Folder that is scanned (non-recursively) for ``*.txt`` files.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=False)``
            and expected to return a mapping with an ``'input_ids'`` tensor
            whose first dimension is a batch of one.
        max_length: Number of tokens in every emitted window.
        stride: Distance in tokens between the starts of consecutive windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not a positive integer.
    """

    def __init__(self, directory, tokenizer, max_length=8192, stride=4096):
        # A zero stride would make range() fail and a negative one would
        # silently yield no windows, so both are rejected up front.
        if max_length <= 0:
            raise ValueError("max_length must be a positive integer")
        if stride <= 0:
            raise ValueError("stride must be a positive integer")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.directory = directory
        self.stride = stride
        self.token_chunks = self._load_and_tokenize()

    def _load_and_tokenize(self):
        """
        Read every ``.txt`` file in the directory and collect its sliding-window chunks.

        Returns:
            A list of ``(input_ids, attention_mask)`` tensor pairs.
        """
        token_chunks = []
        # os.listdir() returns names in arbitrary order; sorting keeps the
        # chunk order (and therefore dataset indices) stable between runs.
        for filename in sorted(os.listdir(self.directory)):
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(self.directory, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            token_chunks.extend(self._sliding_window_tokenization(text))
        return token_chunks

    def _sliding_window_tokenization(self, text):
        """
        Tokenize ``text`` and split the token sequence into sliding-window chunks.

        Returns:
            A list of ``(input_ids, attention_mask)`` pairs, each of length
            ``max_length``. An empty token sequence yields an empty list.
        """
        # Tokenize the text without truncation, no special tokens added
        tokenized = self.tokenizer(text, return_tensors="pt", truncation=False, add_special_tokens=False)
        tokens = tokenized['input_ids'].squeeze(0)
        total_tokens = tokens.size(0)

        # Pad with the tokenizer's own pad id when it defines one; fall back to
        # 0 (the previous hard-coded value) otherwise.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        chunks = []
        for start in range(0, total_tokens, self.stride):
            window = tokens[start:start + self.max_length]
            attention_mask = torch.ones_like(window)

            pad_len = self.max_length - window.size(0)
            if pad_len > 0:
                # Pad the window and mark the padded positions as masked out.
                window = torch.cat([window, window.new_full((pad_len,), pad_id)], dim=0)
                attention_mask = torch.cat([attention_mask, attention_mask.new_zeros(pad_len)], dim=0)

            chunks.append((window, attention_mask))

            # This window already reaches the end of the text. Any further
            # window would only repeat tokens covered here, followed by padding.
            if start + self.max_length >= total_tokens:
                break

        return chunks

    def __len__(self):
        """Return the total number of windows across all files."""
        return len(self.token_chunks)

    def __getitem__(self, idx):
        """Return window ``idx`` as a dict with ``'input_ids'`` and ``'attention_mask'``."""
        input_ids, attention_mask = self.token_chunks[idx]
        return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Function to load text files into a dataset and prepare them for Mistral v0.3
def load_texts_to_dataset(directory, tokenizer, batch_size=8, max_length=8192):
    """Build a shuffled DataLoader over sliding-window chunks of the text files in ``directory``.

    Consecutive windows start half a window apart (``max_length // 2`` tokens),
    i.e. they overlap by 50%.

    Args:
        directory: Folder handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.

    Returns:
        A ``DataLoader`` with ``shuffle=True`` over the resulting ``TextDataset``.
    """
    half_window = max_length // 2
    text_dataset = TextDataset(directory, tokenizer, max_length=max_length, stride=half_window)
    return DataLoader(text_dataset, batch_size=batch_size, shuffle=True)


# Iterate through the dataloader
#for batch in dataloader:
#    input_ids = batch['input_ids']
#    attention_mask = batch['attention_mask']
#    print("Input IDs shape:", input_ids.shape)
#    print("Attention mask shape:", attention_mask.shape)


In [None]:

# Setting up config
# Folder holding the raw text files of the extracted archive.
directory = '/home/contract-nli/raw'

# One 2048-token window per batch; windows overlap by half their length.
dataloader = load_texts_to_dataset(
    directory=directory,
    tokenizer=tokenizer,
    batch_size=1,
    max_length=2048,
)


In [None]:
# Window length in tokens; same value as the max_length used for the dataloader.
# NOTE(review): not referenced by any other cell visible in this notebook.
my_max_length = 2048

In [None]:
num_epochs = 100

# Create the checkpoint folder before training starts, so a missing directory
# is detected immediately instead of at the first torch.save() after an epoch.
checkpoint_dir = "/content/drive/MyDrive/mistral-lora-finetuned/"
os.makedirs(checkpoint_dir, exist_ok=True)

if len(dataloader) == 0:
    raise ValueError("dataloader is empty: no training chunks were produced")

# Linear decay of the learning rate over every optimizer step of the run.
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.train()
for epoch in range(num_epochs):

    start_epoch_time = time.time()  # Start timer for the epoch
    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        optimizer.zero_grad()

        # Move input tensors to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Exclude padded positions from the language-modelling loss: label
        # -100 is ignored by the loss, so the model is not trained to predict
        # padding tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The loss is computed on labels shifted by one position. A batch with
        # no supervised target after that shift would produce a NaN loss, so
        # it is skipped.
        if not (labels[:, 1:] != -100).any():
            continue

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()
        lr_scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    end_epoch_time = time.time()

    # Average over the epoch; the loss of the last (randomly drawn) batch alone
    # is too noisy to show whether training is progressing.
    mean_loss = running_loss / max(num_batches, 1)

    allocated_memory = torch.cuda.memory_allocated() / (1024 ** 2)  # Convert to MB
    reserved_memory = torch.cuda.memory_reserved() / (1024 ** 2)  # Convert to MB
    print(f"GPU memory allocated: {allocated_memory:.2f} MB")
    print(f"GPU memory reserved: {reserved_memory:.2f} MB")
    print(f"Epoch {epoch + 1}/{num_epochs} completed with mean loss: {mean_loss}")

    epoch_time = end_epoch_time - start_epoch_time
    print(f"Epoch {epoch+1}/{num_epochs} took {epoch_time:.4f} seconds")
    if epoch % 10 == 0:
        # Checkpoint after epochs 1, 11, 21, ... (epoch is zero-based).
        # NOTE(review): model.state_dict() of the wrapped model presumably also
        # contains the frozen base weights, making each file roughly as large
        # as the model itself — confirm Drive has room for all checkpoints.
        savepath = checkpoint_dir + str(epoch) + ".pth"
        torch.save({
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),  # If you have an optimizer
          'epoch': epoch  # You can save other metadata like the current epoch
        }, savepath)

GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 1/100 completed with loss: 8.931796073913574
Epoch 1/100 took 703.2538 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 2/100 completed with loss: 1.2718234062194824
Epoch 2/100 took 704.3287 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 3/100 completed with loss: 10.918999671936035
Epoch 3/100 took 702.0434 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 4/100 completed with loss: 14.241212844848633
Epoch 4/100 took 701.8674 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 5/100 completed with loss: 2.7161953449249268
Epoch 5/100 took 701.8207 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 6/100 completed with loss: 16.24726676940918
Epoch 6/100 took 701.9738 seconds
GPU memory allocated: 14122.81 MB
GPU memory reserved: 27876.00 MB
Epoch 7/100 compl

In [None]:
# Show the loop variable left over from the training cell (zero-based index of
# the last epoch that ran).
print(epoch)

99


In [None]:

# Save a checkpoint for the last completed epoch (`epoch` is the loop variable
# left over from the training cell).
checkpoint_dir = "/content/drive/MyDrive/mistral-lora-finetuned/"
# torch.save() does not create missing folders, so make sure the target exists.
os.makedirs(checkpoint_dir, exist_ok=True)
savepath = os.path.join(checkpoint_dir, str(epoch) + ".pth")
torch.save({
  'model_state_dict': model.state_dict(),
  'optimizer_state_dict': optimizer.state_dict(),  # If you have an optimizer
  'epoch': epoch  # You can save other metadata like the current epoch
}, savepath)

In [None]:
# Write the PEFT-wrapped model and the tokenizer files into the same Drive
# folder; the reload cell further down reads both back from this one path.
# The tokenizer call is kept last so its list of written files is the cell output.
model.save_pretrained("/content/drive/MyDrive/mistralora")
tokenizer.save_pretrained("/content/drive/MyDrive/mistralora")


('/content/drive/MyDrive/mistralora/tokenizer_config.json',
 '/content/drive/MyDrive/mistralora/special_tokens_map.json',
 '/content/drive/MyDrive/mistralora/tokenizer.model',
 '/content/drive/MyDrive/mistralora/added_tokens.json',
 '/content/drive/MyDrive/mistralora/tokenizer.json')

In [None]:
# Tokenizer files were saved next to the adapter, so read them from that folder.
my_saved_model_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/mistralora")

# Fresh half-precision copy of the base checkpoint ...
my_saved_model_base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.3",
    torch_dtype=torch.float16,
)

# ... wrapped with the adapter that was written by save_pretrained().
my_saved_model = PeftModel.from_pretrained(
    my_saved_model_base,
    "/content/drive/MyDrive/mistralora",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Print the module tree of the reloaded model to check that the LoRA layers
# are attached to the targeted projections.
print(my_saved_model)

PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_feat

In [None]:
#Run Inference
# Step 1: Load the fine-tuned model and tokenizer
#model_dir = "/content/drive/MyDrive/mistral-lora-finetuned"  # Path to your fine-tuned model
#tokenizer = AutoTokenizer.from_pretrained(model_dir)
#model = AutoModelForCausalLM.from_pretrained(model_dir)


# Step 3: Define a function for inference
def generate_text(prompt, max_length=128, temperature=0.1, top_p=0.1, do_sample=False):
    """Generate a continuation of ``prompt`` with the reloaded fine-tuned model.

    Args:
        prompt: Text to continue.
        max_length: Upper bound on the total sequence length (prompt tokens
            plus generated tokens) passed to ``generate``.
        temperature: Sampling temperature; only used when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is True.
        do_sample: When False (default) decoding is deterministic and
            ``temperature`` / ``top_p`` have no effect, which matches the
            previous behaviour where both were never forwarded. Set to True to
            sample with the given ``temperature`` and ``top_p``.

    Returns:
        The decoded output sequence with special tokens removed. The decoded
        sequence starts with the prompt itself.
    """
    # Place the inputs on whatever device the model currently lives on, rather
    # than relying on a separate global that may not match the model.
    model_device = next(my_saved_model.parameters()).device

    # Use the tokenizer that was saved and reloaded together with the adapter.
    inputs = my_saved_model_tokenizer(prompt, return_tensors="pt").to(model_device)

    generation_kwargs = {
        "max_length": max_length,
        "pad_token_id": my_saved_model_tokenizer.eos_token_id,
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling parameters are only meaningful (and only accepted without
        # warnings) when sampling is enabled.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    # Generate text with the fine-tuned model
    with torch.no_grad():
        output = my_saved_model.generate(**inputs, **generation_kwargs)

    # Decode the generated tokens back into text
    return my_saved_model_tokenizer.decode(output[0], skip_special_tokens=True)



In [None]:
# Move the reloaded model (base weights plus adapter) to the selected device;
# the returned module is displayed as the cell output.
my_saved_model.to(device)

PeftModel(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_feat

In [None]:
# Free-form question about one of the agreements in the training corpus.
prompt = "In the non-disclosure agreement between Creative Labs and Pacific Magtron how long have the parties have agreed to not share information"

# Allow up to 256 tokens in total (prompt included).
generated_text = generate_text(prompt=prompt, max_length=256)

In [None]:
# Display the text returned by generate_text for the prompt.
print(generated_text)

In the non-disclosure agreement between Creative Labs and Pacific Magtron how long have the parties have agreed to not share information with each other?

The agreement is for 10 years.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the penalty for breaching the agreement?

The penalty is $100,000.

What is the


In [None]:
#Qualitative Model Testing
def magic_formula_for_accuracy(correct_response, testing_outcome):
    """Score a generated answer against a reference with token-overlap F1.

    Both strings are lower-cased and split on whitespace; the score is the
    harmonic mean of token precision and recall, counting repeated tokens at
    most as often as they occur in the reference. This is a simple lexical
    proxy, not a measure of factual correctness.

    Args:
        correct_response: Reference answer text.
        testing_outcome: Model-generated answer text.

    Returns:
        A float in [0.0, 1.0]; 0.0 when either text is empty or no token matches.
    """
    reference_tokens = correct_response.lower().split()
    candidate_tokens = testing_outcome.lower().split()
    if not reference_tokens or not candidate_tokens:
        return 0.0

    # Multiset of reference tokens still available for matching.
    remaining = {}
    for token in reference_tokens:
        remaining[token] = remaining.get(token, 0) + 1

    overlap = 0
    for token in candidate_tokens:
        if remaining.get(token, 0) > 0:
            overlap += 1
            remaining[token] -= 1

    if overlap == 0:
        return 0.0
    precision = overlap / len(candidate_tokens)
    recall = overlap / len(reference_tokens)
    return 2 * precision * recall / (precision + recall)


#Question 1
question1_prompt = "In a typical Hitachi Corporation Non-disclosure agreement, what is the length of time the exiting employee cannot disclose trade secrets?"
Correct_response = "Exiting employee is bound to NDA for 6 months"

# The model object cannot be called on a raw string; go through generate_text,
# which tokenizes the prompt, generates and decodes.
Testing_outcome = generate_text(question1_prompt)

# The decoded output starts with the prompt; score only the continuation.
if Testing_outcome.startswith(question1_prompt):
    generated_answer = Testing_outcome[len(question1_prompt):]
else:
    generated_answer = Testing_outcome

my_question1_metric = magic_formula_for_accuracy(Correct_response, generated_answer)
my_question1_metric