In [None]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# Load dataset
csv_path = "cleaned_combined_output.csv"  # Update with your file path
df = pd.read_csv(csv_path)

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Load tokenizer and model
model_name = "microsoft/Phi-3-mini-128k-instruct"  # Update if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    device_map="auto"
)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["judgment"], truncation=True, padding="max_length", max_length=1024)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# LoRA Config
lora_config = LoraConfig(
    r=8,  # Low-rank adaptation dimension
    lora_alpha=16,
    target_modules=["qkv_proj", "o_proj"],  # Fine-tune attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training Arguments
training_args = TrainingArguments(
    output_dir="./phi3_legal_finetune1",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False
)


# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    args=training_args,
)


# Start Training
trainer.train()

# Save Model
trainer.save_model("./phi3_legal_finetuned1")
tokenizer.save_pretrained("./phi3_legal_finetuned1")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Replace with your base model (e.g., "microsoft/phi-3-mini-4k-instruct")
base_model = "microsoft/phi-3-mini-128k-instruct"

# Load model from last checkpoint
model = AutoModelForCausalLM.from_pretrained("phi3_legal_finetune/checkpoint-8127")

# Load tokenizer from base model
tokenizer = AutoTokenizer.from_pretrained(base_model)


In [None]:
save_dir = "NyayaMitra"

# Save model and tokenizer
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model saved successfully in {save_dir}!")


In [None]:
from huggingface_hub import notebook_login

notebook_login()  


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
save_dir = "NyayaMitra"
# Load saved model
model = AutoModelForCausalLM.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Push to Hugging Face Hub
model.push_to_hub("Srinivastl/NyayaMit")
tokenizer.push_to_hub("srinivastl/NyayaMit")

print("Model successfully uploaded to Hugging Face!")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/394M [00:00<?, ?B/s]

RuntimeError: Error while uploading 'model-00002-of-00004.safetensors' to the Hub.

In [None]:
from transformers import AutoModelForCausalLM

model_name = "microsoft/Phi-3-mini-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)

# Print layer names
for name, module in model.named_modules():
    print(name)

In [None]:
import pandas as pd
df=pd.read_csv("cleaned_combined_output.csv")
df

In [None]:
import pandas as pd
df=pd.read_csv("cleaned_combined_output.csv")
df1=df[:10000]
df1.to_csv("Sample1.csv",index=False)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset

# Load the previously fine-tuned model
model_checkpoint = "./phi3_legal_finetuned"  # Path to saved model
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load new dataset from CSV
new_data_path = "cleaned_combined_output.csv"  # Update with your actual CSV file path
new_dataset = load_dataset("csv", data_files=new_data_path)

# Ensure the column name is correct (adjust based on your CSV structure)
TEXT_COLUMN = "judgment"  # Update this if your text is stored under a different column name

# Tokenize the new dataset
def tokenize_function(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True, padding="max_length", max_length=512)

tokenized_new_dataset = new_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./phi3_legal_finetune1",  # Continue saving in the same directory
    per_device_train_batch_size=4,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    num_train_epochs=3,  # Continue training for more epochs
    push_to_hub=False
)

# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_new_dataset["train"],
    tokenizer=tokenizer,
    args=training_args,
)

# Resume Training
trainer.train()

# Save Model
trainer.save_model("./phi3_legal_finetuned2")
tokenizer.save_pretrained("./phi3_legal_finetuned2")


In [None]:
api="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"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from trl import SFTTrainer
from datasets import load_dataset
import numpy as np
import torch
import evaluate

# Load the previously fine-tuned model
model_checkpoint = "./phi3_legal_finetune/checkpoint-8127"  # Path to saved model
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load new dataset from CSV
new_data_path = "cleaned_combined_output.csv"  # Update with your actual CSV file path
new_dataset = load_dataset("csv", data_files=new_data_path)

# Ensure the column name is correct (adjust based on your CSV structure)
TEXT_COLUMN = "judgment"  # Update this if your text is stored under a different column name

# ✅ Split dataset into train (90%) and validation (10%)
dataset_split = new_dataset["train"].train_test_split(test_size=0.1)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset_split.map(tokenize_function, batched=True)

# ✅ Load evaluation metrics (Perplexity & Accuracy)
perplexity_metric = evaluate.load("perplexity")
accuracy_metric = evaluate.load("accuracy")

# ✅ Function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Convert logits to loss (Perplexity calculation)
    logits_tensor = torch.tensor(logits)
    labels_tensor = torch.tensor(labels)

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # Ignore padding tokens
    loss = loss_fct(logits_tensor.view(-1, logits_tensor.size(-1)), labels_tensor.view(-1)).item()
    perplexity = np.exp(loss)

    # Accuracy: Compare predicted tokens vs actual tokens (ignoring padding)
    non_pad_mask = labels_tensor != tokenizer.pad_token_id
    correct = (predictions == labels_tensor) & non_pad_mask
    accuracy = correct.sum().item() / non_pad_mask.sum().item()

    return {"perplexity": perplexity, "accuracy": accuracy}

# ✅ Training arguments with evaluation every 10,000 or 20,000 steps
training_args = TrainingArguments(
    output_dir="./phi3_legal_finetune1",  
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",  # ✅ Evaluate every X steps
    eval_steps=10000,  # Change to 20000 if needed
    save_strategy="steps",
    save_steps=10000,  # Save model every 10,000 steps
    logging_dir="./logs",
    logging_steps=500,  # Log progress every 500 steps
    save_total_limit=2,
    num_train_epochs=3,  
    push_to_hub=False,
    eval_accumulation_steps=10
)

# ✅ Trainer with evaluation enabled
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # Include validation set for evaluation
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,  # Add evaluation function
)

# ✅ Resume Training with Evaluation
trainer.train()

# ✅ Save Modelz
trainer.save_model("./phi3_legal_finetuned2")
tokenizer.save_pretrained("./phi3_legal_finetuned2")


In [2]:
import pandas as pd

df=pd.read_csv("cleaned_combined_output.csv")

df1=df[:50000]

df1.to_csv("sample.csv",index=False)

In [None]:
import zipfile
import os

def compress_directory(directory_path, zip_name):
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, directory_path))  # Preserve directory structure
    print(f"Compressed directory {directory_path} into {zip_name}")

# Example usage
compress_directory("NyayaMitra", "compressed.zip")
