In [2]:
from google.colab import drive
# Mount the user's Google Drive at /content/drive; every project path used in
# later cells lives under this mount point
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

# Project root on the mounted Drive and its three working sub-folders
PROJECT_ROOT = "/content/drive/MyDrive/TrustVault"
MODEL_DIR, DATA_DIR, OUTPUT_DIR = (
    os.path.join(PROJECT_ROOT, sub_folder) for sub_folder in ("models", "data", "outputs")
)

# exist_ok=True makes the cell safe to re-run
for folder in (MODEL_DIR, DATA_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)

print("Project folders set up successfully!")


Project folders set up successfully!


In [4]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install transformers accelerate --quiet


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hub id of the checkpoint used in this notebook
model_name = "microsoft/phi-2"

# Tokenizer first, then the weights in half precision; device_map="auto"
# leaves the device placement of the weights to the library
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

print("✅ Phi-2 model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Phi-2 model loaded successfully!


In [6]:
prompt = "In the future, GLOBAL WARMING will lead to"
# Move the inputs to wherever the model's weights live instead of assuming
# "cuda": the model was loaded with device_map="auto", so the device is not fixed
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate output. pad_token_id is passed explicitly so generate() does not have
# to fall back to eos_token_id on its own (and print a notice about it).
outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("📝 Output:\n", result)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


📝 Output:
 In the future, GLOBAL WARMING will lead to more frequent and severe droughts, floods, and storms. These extreme weather events will have devastating effects on ecosystems, agriculture, and human settlements. Rising sea levels will also threaten coastal areas, leading to the displacement of millions of people.




In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Drive folder that receives the local copy of the checkpoint
model_dir = "/content/drive/MyDrive/TrustVault/models/phi-2"
hub_checkpoint = "microsoft/phi-2"

# Tokenizer: fetch from the Hub, then write its files into model_dir
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
tokenizer.save_pretrained(model_dir)

# Weights: fetch in float16, then write them next to the tokenizer files
model = AutoModelForCausalLM.from_pretrained(hub_checkpoint, torch_dtype=torch.float16)
model.save_pretrained(model_dir)

print("✅ Model and tokenizer saved to Google Drive successfully!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model and tokenizer saved to Google Drive successfully!


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_dir = "/content/drive/MyDrive/TrustVault/models/phi-2"

# Reload tokenizer and weights from the copy saved on Drive
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.float16, device_map="auto")

# Try a quick prompt. Inputs follow the model's device rather than a hard-coded
# "cuda", and pad_token_id is passed explicitly so generate() need not guess it.
inputs = tokenizer("Grok AI assistant  is designed to", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Grok AI assistant  is designed to help you with your daily tasks. It can help you with everything from scheduling appointments to ordering groceries.

<url_end>


Rewritten Paragraph:

Grok AI is a helpful assistant that can make your daily tasks easier


In [9]:
# Create the data directory first (exist_ok=True keeps this cell safe to re-run)
import os

# Folder on the mounted Drive that receives trustvault_dataset.jsonl below
data_path = "/content/drive/MyDrive/TrustVault/data"
os.makedirs(data_path, exist_ok=True)

# Your dataset as a long string: one JSON object per line with a "prompt" and a
# "response" field. "\\n" puts a literal backslash-n into the text, which JSON
# then decodes as a newline inside the prompt.
dataset_jsonl = """{"prompt": "Q: What is separation of powers in political science?\\nA:", "response": "It's the division of government responsibilities into distinct branches to limit any one branch from exercising the core functions of another."}
{"prompt": "Q: What is concept #13 in political science?\\nA:", "response": "Concept #13 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: What are tip #21 for better productivity?\\nAssistant:", "response": "Tip #21: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "Q: What is concept #17 in political science?\\nA:", "response": "Concept #17 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: How can I stay focused during online classes?\\nAssistant:", "response": "Create a distraction-free space, set specific goals for each session, and take regular breaks to stay refreshed."}
{"prompt": "User: What's a quick way to relax during study breaks?\\nAssistant:", "response": "Try deep breathing exercises, a short walk, or listening to calming music for 5–10 minutes."}
{"prompt": "User: What are tip #11 for better productivity?\\nAssistant:", "response": "Tip #11: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "Q: What does the term 'sovereignty' mean?\\nA:", "response": "Sovereignty is the supreme authority within a territory; the power of a state to govern itself without external interference."}
{"prompt": "User: What are tip #18 for better productivity?\\nAssistant:", "response": "Tip #18: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "Q: What is concept #14 in political science?\\nA:", "response": "Concept #14 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "Q: What is a Constitution?\\nA:", "response": "A Constitution is the supreme law of a country that lays down the framework of government, rights, and duties."}
{"prompt": "User: Recommend a healthy study snack.\\nAssistant:", "response": "Try nuts, fruits, or dark chocolate — they boost focus without causing a sugar crash."}
{"prompt": "Q: What is concept #12 in political science?\\nA:", "response": "Concept #12 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: Suggest a daily planning app.\\nAssistant:", "response": "You can try Todoist, Notion, or Google Calendar — all great for managing tasks and schedules."}
{"prompt": "Q: Define 'bureaucracy'.\\nA:", "response": "Bureaucracy is a system of government where state officials make decisions instead of elected representatives."}
{"prompt": "User: What are tip #23 for better productivity?\\nAssistant:", "response": "Tip #23: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "User: What's a productive morning routine for students?\\nAssistant:", "response": "Wake up early, hydrate, review goals for the day, and do a short physical activity like stretching or a walk."}
{"prompt": "User: What are tip #15 for better productivity?\\nAssistant:", "response": "Tip #15: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "User: What are tip #13 for better productivity?\\nAssistant:", "response": "Tip #13: Always start with the most important task first and eliminate distractions early in the day."}
{"prompt": "Q: What is concept #15 in political science?\\nA:", "response": "Concept #15 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
"""
dataset_jsonl += """{"prompt": "Q: What are civil liberties?\\nA:", "response": "Civil liberties are personal freedoms protected by law from government interference, such as freedom of speech and religion."}
{"prompt": "Q: What is concept #19 in political science?\\nA:", "response": "Concept #19 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: How to beat procrastination?\\nAssistant:", "response": "Break tasks into smaller steps, set short deadlines, and reward yourself after each milestone."}
{"prompt": "Q: What does 'federalism' mean?\\nA:", "response": "Federalism is a system of governance where power is divided between a central authority and constituent units (like states)."}
{"prompt": "User: Suggest tips for exam anxiety.\\nAssistant:", "response": "Prepare early, practice relaxation techniques, get proper sleep, and avoid last-minute cramming."}
{"prompt": "Q: What is concept #18 in political science?\\nA:", "response": "Concept #18 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: Recommend a podcast for productivity.\\nAssistant:", "response": "Try 'Deep Work' by Cal Newport or 'The Tim Ferriss Show' — both have great insights on productivity."}
{"prompt": "Q: What is the meaning of political socialization?\\nA:", "response": "Political socialization is the process through which individuals acquire their political beliefs, values, and behaviors."}
{"prompt": "User: How to create a realistic study plan?\\nAssistant:", "response": "List your syllabus, estimate time for each topic, and schedule breaks to avoid burnout."}
{"prompt": "Q: What is concept #20 in political science?\\nA:", "response": "Concept #20 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: Suggest a technique to memorize concepts.\\nAssistant:", "response": "Use active recall, spaced repetition (like Anki), and teach it to someone else — this helps retain info better."}
{"prompt": "Q: What are fundamental rights?\\nA:", "response": "They are rights guaranteed by the Constitution to protect individual freedom and dignity, such as the right to equality and freedom of speech."}
{"prompt": "User: How do I manage study and personal time effectively?\\nAssistant:", "response": "Use time-blocking, set daily priorities, and keep some buffer time to manage unexpected tasks."}
{"prompt": "Q: What is concept #16 in political science?\\nA:", "response": "Concept #16 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "User: How to boost motivation during a study slump?\\nAssistant:", "response": "Remind yourself of your goals, change your environment, and take a short break or talk to someone supportive."}
{"prompt": "Q: What is the rule of law?\\nA:", "response": "The rule of law means all individuals and institutions are subject to and accountable to law that is fairly applied and enforced."}
{"prompt": "User: Recommend a journaling method for reflection.\\nAssistant:", "response": "Try the 3-2-1 method: list 3 things you’re grateful for, 2 things you learned, and 1 goal for tomorrow."}
{"prompt": "Q: Define 'democracy'.\\nA:", "response": "Democracy is a form of government where power lies with the people, who exercise it directly or through elected representatives."}
{"prompt": "User: What are tip #25 for better productivity?\\nAssistant:", "response": "Tip #25: Minimize multitasking and stay focused on one task at a time for better results."}
{"prompt": "User: Recommend a motivational quote.\\nAssistant:", "response": "“Success is the sum of small efforts, repeated day in and day out.” – Robert Collier"}
{"prompt": "Q: What is judicial review?\\nA:", "response": "Judicial review is the power of courts to assess whether laws and government actions comply with the Constitution."}
{"prompt": "User: How to beat afternoon energy crashes?\\nAssistant:", "response": "Take a 10-minute walk, drink water, eat a light protein snack, or try power naps of 15–20 minutes."}
{"prompt": "Q: What is concept #21 in political science?\\nA:", "response": "Concept #21 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism."}
{"prompt": "Q: What are Directive Principles of State Policy?\\nA:", "response": "They are guidelines in the Indian Constitution to assist the government in ensuring social and economic welfare."}
{"prompt": "User: Suggest a weekend study strategy.\\nAssistant:", "response": "Use weekends to revise, practice previous papers, and plan for the week ahead — don’t forget to rest too!"}
"""


# Save it to a file. The records contain non-ASCII punctuation (en/em dashes,
# curly quotes), so the encoding is pinned to UTF-8 instead of depending on the
# platform default.
with open(os.path.join(data_path, "trustvault_dataset.jsonl"), "w", encoding="utf-8") as f:
    f.write(dataset_jsonl)

print("✅ Dataset saved to Google Drive at /TrustVault/data/trustvault_dataset.jsonl")


✅ Dataset saved to Google Drive at /TrustVault/data/trustvault_dataset.jsonl


In [10]:
import json

dataset_path = "/content/drive/MyDrive/TrustVault/data/trustvault_dataset.jsonl"

# One JSON object per line. Blank lines are skipped because json.loads("")
# raises JSONDecodeError; the file is read as UTF-8 since it holds non-ASCII text.
parsed_dataset = []
with open(dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            parsed_dataset.append(json.loads(line))

# Preview the first 5 entries
for i, item in enumerate(parsed_dataset[:5]):
    print(f"\n🧠 Entry #{i+1}")
    print("Prompt:", item["prompt"])
    print("Response:", item["response"])



🧠 Entry #1
Prompt: Q: What is separation of powers in political science?
A:
Response: It's the division of government responsibilities into distinct branches to limit any one branch from exercising the core functions of another.

🧠 Entry #2
Prompt: Q: What is concept #13 in political science?
A:
Response: Concept #13 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism.

🧠 Entry #3
Prompt: User: What are tip #21 for better productivity?
Assistant:
Response: Tip #21: Always start with the most important task first and eliminate distractions early in the day.

🧠 Entry #4
Prompt: Q: What is concept #17 in political science?
A:
Response: Concept #17 refers to a key theory or idea often discussed in the study of political systems, such as pluralism or civic nationalism.

🧠 Entry #5
Prompt: User: How can I stay focused during online classes?
Assistant:
Response: Create a distraction-free space, set specific goals for each s

In [12]:
# Explicit %pip magic: the bare "pip install" form only works while IPython's
# automagic is enabled
%pip install flwr




In [14]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install "cryptography<42"




In [15]:
import flwr as fl


In [16]:
import os

# Root folder of the federated part of the project
base_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated"

# Creating <base>/data also creates the base folder itself
data_path = os.path.join(base_path, "data")
os.makedirs(data_path, exist_ok=True)

# Touch the three source files: append mode creates a missing file and leaves
# the content of an existing one untouched
for script_name in ("shared_model.py", "client.py", "server.py"):
    with open(os.path.join(base_path, script_name), "a"):
        pass

print("✅ Folder structure created with empty files.")


✅ Folder structure created with empty files.


In [17]:
import json
from pathlib import Path

# Load original dataset
dataset_path = Path("/content/drive/MyDrive/TrustVault/data/trustvault_dataset.jsonl")
output_dir = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/data")
output_dir.mkdir(parents=True, exist_ok=True)

NUM_CLIENTS = 3

with open(dataset_path, 'r', encoding="utf-8") as f:
    # Drop blank lines so no client file ends up with an unparsable empty record
    lines = [line for line in f if line.strip()]

# Split into NUM_CLIENTS contiguous chunks whose sizes differ by at most one.
# The previous "len // 3 + 1" chunk size could yield fewer than three chunks
# (6 lines -> two chunks of 3), leaving client3.jsonl unwritten or stale.
base_size, remainder = divmod(len(lines), NUM_CLIENTS)
clients = []
start = 0
for client_index in range(NUM_CLIENTS):
    end = start + base_size + (1 if client_index < remainder else 0)
    clients.append(lines[start:end])
    start = end

# Write to files (always exactly NUM_CLIENTS of them, possibly empty)
for i, client_data in enumerate(clients, start=1):
    with open(output_dir / f"client{i}.jsonl", "w", encoding="utf-8") as f:
        f.writelines(client_data)

print("✅ Dataset split into client1.jsonl, client2.jsonl, client3.jsonl")


✅ Dataset split into client1.jsonl, client2.jsonl, client3.jsonl


In [18]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install datasets




In [19]:
base_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated"

code_files = {
    "shared_model.py": '''
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
import json
from pathlib import Path

MODEL_NAME = "microsoft/phi-2"

def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    return model, tokenizer

def save_model(model, tokenizer, path=Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/saved_model")):
    path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

def train_on_data(model, tokenizer, file_path, lr=1e-5):
    """Take one full-batch gradient step of `model` on a JSONL file of examples.

    Every non-blank line of `file_path` is a JSON object holding a "prompt"
    and a "response" string (the field names used by the TrustVault dataset);
    "completion" is accepted as an alternative name for the answer text.

    Args:
        model: causal LM called as model(**inputs, labels=...) whose result
            exposes `.loss`.
        tokenizer: tokenizer matching `model`.
        file_path: pathlib.Path of the JSONL file to train on.
        lr: step size of the single plain-SGD update.

    Returns:
        The same `model` object, updated in place.

    Raises:
        ValueError: if the file contains no examples.
        KeyError: if an example has neither "response" nor "completion".
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        lines = [json.loads(line.strip()) for line in f if line.strip()]
    if not lines:
        raise ValueError(f"No training examples found in {file_path}")

    texts = [
        ex["prompt"] + (ex["response"] if "response" in ex else ex["completion"])
        for ex in lines
    ]

    # padding=True needs a pad token; for tokenizers that ship without one,
    # reuse the end-of-sequence token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Padding positions must not contribute to the loss (-100 is the ignore index)
    labels = inputs["input_ids"].clone()
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        labels[attention_mask == 0] = -100

    model.train()
    model.zero_grad()
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()

    # Apply the gradients; without this step the returned weights are unchanged
    with torch.no_grad():
        for param in model.parameters():
            if param.grad is not None:
                param.add_(param.grad, alpha=-lr)
    model.zero_grad()

    print(f"✅ Trained on {file_path.name}, Loss: {loss.item():.4f}")
    return model
''',

    "client.py": '''
from shared_model import load_model, train_on_data
from pathlib import Path

def load_client_data(file_path):
    model, tokenizer = load_model()
    trained_model = train_on_data(model, tokenizer, file_path)
    return trained_model

if __name__ == "__main__":
    file_path = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/data/client1.jsonl")
    load_client_data(file_path)
''',

    "server.py": '''
import sys
from pathlib import Path

sys.path.append("/content/drive/MyDrive/TrustVault/TrustVault_Federated")

from shared_model import load_model, save_model, train_on_data

base_dir = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated")
data_dir = base_dir / "data"
client_files = ["client1.jsonl", "client2.jsonl", "client3.jsonl"]

import copy

global_model, tokenizer = load_model()

local_models = []

for client_file in client_files:
    print(f"📡 Training on {client_file}...")
    client_data_path = data_dir / client_file
    # Every client trains its own copy of the global weights. Passing
    # global_model itself would make all list entries alias one object.
    # NOTE: this keeps one full model copy per client in memory.
    client_model = copy.deepcopy(global_model)
    local_model = train_on_data(client_model, tokenizer, client_data_path)
    local_models.append(local_model)

def average_models(models):
    """Average the parameters of `models` element-wise (FedAvg, equal weights).

    The mean of each parameter is built in a fresh tensor and only then copied
    into models[0], so the result is correct even when several entries of
    `models` are the same object.

    Returns:
        models[0], now holding the averaged parameters.

    Raises:
        ValueError: if `models` is empty.
    """
    if not models:
        raise ValueError("average_models() needs at least one model")
    # Build each state dict once instead of once per parameter
    states = [m.state_dict() for m in models]
    avg_model = models[0]
    for name, param in avg_model.named_parameters():
        mean = sum(state[name] for state in states) / len(states)
        param.data.copy_(mean)
    return avg_model

print("🧠 Aggregating models (FedAvg)...")
updated_model = average_models(local_models)
save_model(updated_model, tokenizer)
print("✅ Global model saved.")
'''
}

# Save the code to files. The sources contain emoji inside their print() calls,
# so they are written as UTF-8 explicitly rather than with the platform default.
for file_name, code in code_files.items():
    with open(os.path.join(base_path, file_name), "w", encoding="utf-8") as f:
        f.write(code.strip())

print("✅ All code files created or replaced properly.")


✅ All code files created or replaced properly.


In [20]:
import os

# Root folder of the federated sub-project; with exist_ok=True the call is a
# no-op when the folder is already there
base_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated"
os.makedirs(base_path, exist_ok=True)
print("✅ Base folder ready:", base_path)


✅ Base folder ready: /content/drive/MyDrive/TrustVault/TrustVault_Federated


In [21]:
shared_model_code = '''
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from pathlib import Path

MODEL_NAME = "microsoft/phi-2"

def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    return model, tokenizer

def save_model(model, tokenizer, path=Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/saved_model")):
    path.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

def train_on_data(model, tokenizer, file_path, lr=1e-5):
    """Take one full-batch gradient step of `model` on a JSONL file of examples.

    Every non-blank line of `file_path` is a JSON object holding a "prompt"
    and a "response" string (the field names used by the TrustVault dataset);
    "completion" is accepted as an alternative name for the answer text.

    Args:
        model: causal LM called as model(**inputs, labels=...) whose result
            exposes `.loss`.
        tokenizer: tokenizer matching `model`.
        file_path: pathlib.Path of the JSONL file to train on.
        lr: step size of the single plain-SGD update.

    Returns:
        The same `model` object, updated in place.

    Raises:
        ValueError: if the file contains no examples.
        KeyError: if an example has neither "response" nor "completion".
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        lines = [json.loads(line.strip()) for line in f if line.strip()]
    if not lines:
        raise ValueError(f"No training examples found in {file_path}")

    texts = [
        ex["prompt"] + (ex["response"] if "response" in ex else ex["completion"])
        for ex in lines
    ]

    # padding=True needs a pad token; for tokenizers that ship without one,
    # reuse the end-of-sequence token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Padding positions must not contribute to the loss (-100 is the ignore index)
    labels = inputs["input_ids"].clone()
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        labels[attention_mask == 0] = -100

    model.train()
    model.zero_grad()
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()

    # Apply the gradients; without this step the returned weights are unchanged
    with torch.no_grad():
        for param in model.parameters():
            if param.grad is not None:
                param.add_(param.grad, alpha=-lr)
    model.zero_grad()

    print(f"✅ Trained on {file_path.name}, Loss: {loss.item():.4f}")
    return model
'''

# The source contains an emoji in its print() call, so write it as UTF-8
# explicitly rather than with the platform default encoding
with open(os.path.join(base_path, "shared_model.py"), "w", encoding="utf-8") as f:
    f.write(shared_model_code.strip())

print("✅ shared_model.py created.")


✅ shared_model.py created.


In [22]:
client_code = '''
from shared_model import load_model, train_on_data
from pathlib import Path

def load_client_data(file_path):
    model, tokenizer = load_model()
    trained_model = train_on_data(model, tokenizer, file_path)
    return trained_model

if __name__ == "__main__":
    file_path = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/data/client1.jsonl")
    load_client_data(file_path)
'''

# Overwrite client.py in the federated folder with the source defined above
client_script_path = os.path.join(base_path, "client.py")
with open(client_script_path, "w") as out_file:
    out_file.write(client_code.strip())

print("✅ client.py created.")


✅ client.py created.


In [23]:
server_code = '''
import sys
from pathlib import Path

sys.path.append("/content/drive/MyDrive/TrustVault/TrustVault_Federated")

from shared_model import load_model, save_model, train_on_data

base_dir = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated")
data_dir = base_dir / "data"
client_files = ["client1.jsonl", "client2.jsonl", "client3.jsonl"]

import copy

global_model, tokenizer = load_model()

local_models = []

for client_file in client_files:
    print(f"📡 Training on {client_file}...")
    client_data_path = data_dir / client_file
    # Every client trains its own copy of the global weights. Passing
    # global_model itself would make all list entries alias one object.
    # NOTE: this keeps one full model copy per client in memory.
    client_model = copy.deepcopy(global_model)
    local_model = train_on_data(client_model, tokenizer, client_data_path)
    local_models.append(local_model)

def average_models(models):
    """Average the parameters of `models` element-wise (FedAvg, equal weights).

    The mean of each parameter is built in a fresh tensor and only then copied
    into models[0], so the result is correct even when several entries of
    `models` are the same object.

    Returns:
        models[0], now holding the averaged parameters.

    Raises:
        ValueError: if `models` is empty.
    """
    if not models:
        raise ValueError("average_models() needs at least one model")
    # Build each state dict once instead of once per parameter
    states = [m.state_dict() for m in models]
    avg_model = models[0]
    for name, param in avg_model.named_parameters():
        mean = sum(state[name] for state in states) / len(states)
        param.data.copy_(mean)
    return avg_model

print("🧠 Aggregating models (FedAvg)...")
updated_model = average_models(local_models)
save_model(updated_model, tokenizer)
print("✅ Global model saved.")
'''

# The source contains emoji in its print() calls, so write it as UTF-8
# explicitly rather than with the platform default encoding
with open(os.path.join(base_path, "server.py"), "w", encoding="utf-8") as f:
    f.write(server_code.strip())

print("✅ server.py created.")


✅ server.py created.


In [24]:
# Run the client.py stored in the Drive project folder as a separate Python process
!python /content/drive/MyDrive/TrustVault/TrustVault_Federated/client.py


2025-04-07 14:26:08.645799: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744035968.953562   29054 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744035969.033568   29054 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards:   0% 0/2 [00:00<?, ?it/s]^C


In [25]:
# %pip (instead of !pip) installs into the environment of the running kernel
%pip install opacus



In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import DataLoader
from opacus import PrivacyEngine

def load_model(model_name="microsoft/phi-2"):
    """Fetch the tokenizer and the causal-LM weights for `model_name`.

    Returns:
        A `(tokenizer, model)` tuple, tokenizer first.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    lm = AutoModelForCausalLM.from_pretrained(model_name)
    return tok, lm

def train_with_dp(model, train_dataset, lr=1e-4, noise_multiplier=1.0, max_grad_norm=1.0):
    """Make one pass over `train_dataset`, training `model` through Opacus' PrivacyEngine.

    Args:
        model: module called as model(input_ids=..., labels=...) whose result
            exposes `.loss`.
        train_dataset: dataset whose collated batches provide "input_ids" and "labels".
        lr: learning rate of the Adam optimizer.
        noise_multiplier: forwarded to `PrivacyEngine.make_private`.
        max_grad_norm: forwarded to `PrivacyEngine.make_private`.

    Returns:
        The module returned by `make_private`, after the training pass.
    """
    base_optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    engine = PrivacyEngine()
    base_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

    # make_private hands back wrapped versions of all three objects
    dp_model, dp_optimizer, dp_loader = engine.make_private(
        module=model,
        optimizer=base_optimizer,
        data_loader=base_loader,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
    )

    dp_model.train()
    for batch in dp_loader:
        dp_optimizer.zero_grad()
        step_loss = dp_model(input_ids=batch["input_ids"], labels=batch["labels"]).loss
        step_loss.backward()
        dp_optimizer.step()
    return dp_model


In [27]:
# shared_model.py

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from opacus import PrivacyEngine

# Dummy Encryption / Decryption functions
def encrypt(data):
    """Placeholder "encryption": tag `data` with an ENCRYPTED:: prefix (no real cryptography)."""
    print("🔐 Simulating encryption")
    tagged = "ENCRYPTED::{}".format(data)
    return tagged

def decrypt(data):
    """Undo `encrypt`: remove one leading ENCRYPTED:: tag from `data`.

    Only the prefix is stripped, so a payload that itself contains the text
    "ENCRYPTED::" survives the round trip. Untagged input is returned unchanged.
    """
    print("🔓 Simulating decryption")
    marker = "ENCRYPTED::"
    if data.startswith(marker):
        return data[len(marker):]
    return data

def load_model():
    """Load distilbert-base-uncased with a 2-label classification head.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    checkpoint = "distilbert-base-uncased"
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    tok = AutoTokenizer.from_pretrained(checkpoint)
    return classifier, tok

def train_on_data(model, tokenizer, data, enable_dp=False):
    """Run one pass of mini-batch training of a sequence classifier over `data`.

    Args:
        model: module called as model(**encoded_batch) whose result has `.logits`.
        tokenizer: callable turning a list of strings into a mapping of padded tensors.
        data: sequence of dicts, each with a 'text' string and an integer 'label'.
        enable_dp: when True, wrap model, optimizer and loader with Opacus'
            PrivacyEngine (noise_multiplier=1.0, max_grad_norm=1.0) first.

    Returns:
        The trained model (the module returned by `make_private` when
        enable_dp is True).

    Raises:
        ValueError: if `data` is empty.
    """
    import torch.optim as optim
    from torch.utils.data import DataLoader

    if len(data) == 0:
        raise ValueError("train_on_data() needs at least one example")

    texts = [d['text'] for d in data]
    labels = torch.tensor([d['label'] for d in data])
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

    class SimpleDataset(torch.utils.data.Dataset):
        """Pairs row `idx` of every encoded tensor with its label."""

        def __init__(self, inputs, labels):
            self.inputs = inputs
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            return {k: v[idx] for k, v in self.inputs.items()}, self.labels[idx]

    dataset = SimpleDataset(inputs, labels)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    model.train()
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)

    if enable_dp:
        privacy_engine = PrivacyEngine()
        model, optimizer, dataloader = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=dataloader,
            noise_multiplier=1.0,
            max_grad_norm=1.0,
        )
        print("✅ Differential Privacy Enabled")

    # Built once; the loop variables get their own names so they do not shadow
    # the full `inputs` / `labels` tensors above
    loss_fn = nn.CrossEntropyLoss()
    for batch_inputs, batch_labels in dataloader:
        outputs = model(**batch_inputs)
        loss = loss_fn(outputs.logits, batch_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return model

In [28]:
%%writefile shared_model.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import opacus

def load_model():
    """Load the Phi-2 checkpoint from the Hub.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    checkpoint = "microsoft/phi-2"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    lm = AutoModelForCausalLM.from_pretrained(checkpoint)
    return lm, tok

def train_on_data(model, tokenizer, data, enable_dp=True):
    """Stand-in for training: prints status lines and returns `model` unchanged.

    `tokenizer` and `data` are accepted but not used. `enable_dp` only selects
    which messages are printed.
    """
    if enable_dp:
        print("📚 Simulating training... (DP Enabled ✅)")
        print("🔐 Applying Differential Privacy simulation (no actual gradients updated)...")
    else:
        print("📚 Simulating training...")

    # Simulation only: the model is handed back as received
    return model

def encrypt(text):
    """Mock encryption: return `text` reversed."""
    reversed_text = text[::-1]
    return reversed_text

def decrypt(text):
    """Mock decryption: reverse `text` again, undoing the mock encryption."""
    restored_text = text[::-1]
    return restored_text


Overwriting shared_model.py


In [29]:
%%writefile client.py

import json
from pathlib import Path
import sys
import os

# shared_model.py is written to the same working directory as this script, so it
# can be imported without touching sys.path
from shared_model import load_model, train_on_data, encrypt

client_id = 1
data_dir = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/data")
client_file = data_dir / f"client{client_id}.jsonl"

# Load data: one JSON object per line. Blank lines are skipped because
# json.loads("") raises JSONDecodeError.
with open(client_file, 'r', encoding="utf-8") as f:
    data = [json.loads(line.strip()) for line in f if line.strip()]

# Load model
model, tokenizer = load_model()

# Train with DP
model = train_on_data(model, tokenizer, data, enable_dp=True)

# Simulate model update as a dummy string
update = f"Client{client_id}_model_weights"
encrypted_update = encrypt(update)

# Save encrypted model update next to the client's data file
with open(data_dir / f"client{client_id}_update.txt", "w", encoding="utf-8") as f:
    f.write(encrypted_update)

print(f"✅ Client {client_id} training done. Encrypted update sent.")


Overwriting client.py


In [30]:
# Run the client.py written by the %%writefile cell above as a separate Python process
!python client.py


2025-04-07 14:27:28.773530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744036049.056499   29442 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744036049.130821   29442 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading checkpoint shards:   0% 0/2 [00:00<?, ?it/s]^C


In [31]:
%%writefile server.py

import os
from pathlib import Path
from shared_model import decrypt

data_dir = Path("/content/drive/MyDrive/TrustVault/TrustVault_Federated/data")

# Collect the decrypted update of every client (1..3) whose update file exists
client_updates = []
for i in range(1, 4):
    update_file = data_dir / f"client{i}_update.txt"
    if not update_file.exists():
        print(f"⚠️ Update from Client {i} not found.")
        continue
    encrypted_update = update_file.read_text().strip()
    decrypted = decrypt(encrypted_update)
    print(f"🔓 Received update from Client {i}: {decrypted}")
    client_updates.append(decrypted)

# Simulate aggregation: the "aggregate" is the updates joined with underscores
if not client_updates:
    print("❌ No updates found from any client.")
else:
    aggregated = "_".join(client_updates)
    print(f"\n✅ Aggregated Model Update: {aggregated}")

    # Keep the aggregated update next to the client files
    (data_dir / "aggregated_model.txt").write_text(aggregated)

    print("📦 Aggregated model saved as aggregated_model.txt")


Overwriting server.py


In [32]:
# Run the server.py written by the %%writefile cell above as a separate Python process
!python server.py


⚠️ Update from Client 1 not found.
⚠️ Update from Client 2 not found.
⚠️ Update from Client 3 not found.
❌ No updates found from any client.


In [33]:
# Delete pycache if it exists to avoid import bugs.
# rm -rf does not complain when the folder is absent, so the message below is
# printed whether or not anything was actually removed.
!rm -rf "/content/drive/MyDrive/TrustVault/TrustVault_Federated/__pycache__"
print("✅ __pycache__ removed")


✅ __pycache__ removed


In [34]:
# Write shared_model.py with summarization and text generation logic
shared_code = '''
# shared_model.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration

# Text Generation
def load_model():
    model_name = "microsoft/phi-2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    return model, tokenizer

def train_on_data(model, tokenizer, data, enable_dp=False):
    print("✅ Simulated training done (DP enabled:", enable_dp, ")")
    return model

def encrypt(update):
    return f"encrypted({update})"

def decrypt(encrypted_update):
    """Inverse of encrypt(): unwrap exactly one encrypted(...) layer.

    Only the leading "encrypted(" and the final ")" are removed, so
    parentheses inside the payload survive the round trip. Input that is not
    wrapped this way is returned unchanged.
    """
    prefix, suffix = "encrypted(", ")"
    if encrypted_update.startswith(prefix) and encrypted_update.endswith(suffix):
        return encrypted_update[len(prefix):len(encrypted_update) - len(suffix)]
    return encrypted_update

# Summarizer
def load_summarizer():
    model_name = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return model, tokenizer

def summarize_text(text, model, tokenizer, max_length=100):
    input_text = "summarize: " + text.strip().replace("\\n", " ")
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
'''

# Save to Drive. The source contains an emoji in a print() call, so it is
# written as UTF-8 explicitly rather than with the platform default encoding.
shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"
with open(shared_path, "w", encoding="utf-8") as f:
    f.write(shared_code)

print("✅ shared_model.py written at:", shared_path)


✅ shared_model.py written at: /content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py


In [35]:
# Load shared_model.py directly from its Drive location instead of relying on
# sys.path lookup.
import importlib.util
import sys

shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"

# Steps: describe the file as a module spec, create the (still empty) module
# object, register it under the spec's name, then run the file's code in it.
spec = importlib.util.spec_from_file_location("shared_model", shared_path)
shared_model = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = shared_model
spec.loader.exec_module(shared_model)

print("✅ shared_model module loaded manually")


✅ shared_model module loaded manually


In [36]:
# Demo: run the T5 summarizer exposed by shared_model on a short passage.
input_text = """
Artificial intelligence is rapidly transforming industries by automating tasks,
improving decision-making, and enabling new capabilities. From healthcare to finance,
AI applications are enhancing productivity and delivering innovative solutions that were
previously unimaginable.
"""

# NOTE(review): this rebinds the notebook-level `model` / `tokenizer` names that
# an earlier cell bound to Phi-2.
model, tokenizer = shared_model.load_summarizer()
summary = shared_model.summarize_text(input_text, model, tokenizer)

# Show the passage and its summary, each stripped and under its own heading.
for heading, body in (
    ("\n📄 Original Text:\n", input_text),
    ("\n📝 Generated Summary:\n", summary),
):
    print(heading, body.strip())


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



📄 Original Text:
 Artificial intelligence is rapidly transforming industries by automating tasks,
improving decision-making, and enabling new capabilities. From healthcare to finance,
AI applications are enhancing productivity and delivering innovative solutions that were
previously unimaginable.

📝 Generated Summary:
 artificial intelligence is rapidly transforming industries by automating tasks, improving decision-making, and enabling new capabilities. AI applications are enhancing productivity and delivering innovative solutions that were previously unimaginable.


In [37]:
# Recursively delete the __pycache__ directory inside TrustVault_Federated.
# `rm -rf` does not fail when the directory is absent, so the confirmation
# below prints either way.
!rm -rf "/content/drive/MyDrive/TrustVault/TrustVault_Federated/__pycache__"
print("✅ __pycache__ removed (clean slate)")


✅ __pycache__ removed (clean slate)


In [40]:
shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"

# Source of shared_model.py (text generation, summarization, Q&A).
# This is a regular (non-raw) string: a doubled backslash below is written to
# the file as a single backslash.
shared_code = '''"""Model helpers for the TrustVault app: text generation, summarization and Q&A."""
import torch
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    T5Tokenizer, T5ForConditionalGeneration,
    AutoModelForQuestionAnswering,
    pipeline
)

# Text Generation
def load_text_generator():
    """Load the EleutherAI/gpt-neo-125M causal LM and its tokenizer.

    Returns:
        A (model, tokenizer) tuple.
    """
    model_name = "EleutherAI/gpt-neo-125M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer

def generate_text(prompt, model, tokenizer, max_length=100):
    """Sample a continuation of prompt (top-k sampling, k=50).

    Args:
        prompt: Text to continue; must contain non-whitespace characters.
        model: Causal LM, e.g. from load_text_generator().
        tokenizer: Tokenizer matching the model.
        max_length: Forwarded to model.generate as the cap on total sequence
            length (prompt tokens plus generated tokens).

    Returns:
        The decoded sequence (prompt plus continuation) without special tokens.

    Raises:
        ValueError: If prompt is empty or whitespace only.
    """
    # Reject empty prompts up front, in the same style as answer_question,
    # rather than handing generate() nothing to condition on.
    if not prompt or not prompt.strip():
        raise ValueError("❌ Prompt cannot be empty.")
    # Keep the inputs on the same device as the model weights.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        # State the padding token explicitly instead of relying on the
        # fallback to eos_token_id that generate() applies with a warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated

# Summarization
def load_summarizer():
    """Load the t5-small model and tokenizer.

    Returns:
        A (model, tokenizer) tuple.
    """
    model_name = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return model, tokenizer

def summarize_text(text, model, tokenizer, max_length=100):
    """Summarize text with a T5 model using 4-beam search.

    Newlines in the input are flattened to spaces and the T5 task prefix is
    prepended before tokenization. max_length is forwarded to model.generate.

    Returns:
        The decoded summary string with special tokens removed.
    """
    input_text = "summarize: " + text.strip().replace("\\n", " ")
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True)
    # Keep the inputs on the same device as the model weights.
    input_ids = input_ids.to(model.device)
    summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Q&A
def load_qa_pipeline():
    """Build a question-answering pipeline on distilbert-base-cased-distilled-squad."""
    return pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

def answer_question(context, question, qa_pipeline):
    """Answer question from context using the given pipeline.

    Returns:
        The value stored under the answer key of the pipeline result.

    Raises:
        ValueError: If context or question is empty or whitespace only.
    """
    if not context or not context.strip():
        raise ValueError("❌ Context cannot be empty.")
    if not question or not question.strip():
        raise ValueError("❌ Question cannot be empty.")
    result = qa_pipeline(question=question, context=context)
    return result["answer"]
'''

# Write to file (explicit UTF-8 because the source contains non-ASCII characters)
with open(shared_path, "w", encoding="utf-8") as f:
    f.write(shared_code)

print("✅ shared_model.py written successfully")


✅ shared_model.py written successfully


In [41]:
import importlib.util
import sys

shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"

# Re-create the module object from the file on Drive: build the spec,
# instantiate an empty module from it, publish it in sys.modules under its own
# name, then execute the file's source inside it.
spec = importlib.util.spec_from_file_location("shared_model", shared_path)
shared_model = importlib.util.module_from_spec(spec)
sys.modules[shared_model.__name__] = shared_model
spec.loader.exec_module(shared_model)

print("✅ shared_model loaded manually (no pycache issue)")


✅ shared_model loaded manually (no pycache issue)


In [42]:
# Smoke-test the Q&A helpers from shared_model on a fixed passage.
context = """
Machine learning is a subset of artificial intelligence that provides systems the ability
to automatically learn and improve from experience without being explicitly programmed.
It focuses on the development of computer programs that can access data and use it to learn for themselves.
"""

question = "What does machine learning allow systems to do?"

# Build the pipeline once, then ask the question against the passage.
qa_pipeline = shared_model.load_qa_pipeline()
answer = shared_model.answer_question(context, question, qa_pipeline)

# Echo the inputs (context stripped of its surrounding newlines) and the answer.
for label, value in (
    ("❓ Question:", question),
    ("📜 Context:", context.strip()),
    ("✅ Answer:", answer),
):
    print(label, value)


Device set to use cuda:0


❓ Question: What does machine learning allow systems to do?
📜 Context: Machine learning is a subset of artificial intelligence that provides systems the ability
to automatically learn and improve from experience without being explicitly programmed.
It focuses on the development of computer programs that can access data and use it to learn for themselves.
✅ Answer: automatically learn and improve from experience without being explicitly programmed


In [97]:
# Recursively delete the __pycache__ directory inside TrustVault_Federated.
# `rm -rf` does not fail when the directory is absent, so the confirmation
# below prints either way.
!rm -rf "/content/drive/MyDrive/TrustVault/TrustVault_Federated/__pycache__"
print("✅ __pycache__ removed again for clean interface setup.")


✅ __pycache__ removed again for clean interface setup.


In [98]:
# Create Streamlit app for Summarization + Q&A + Text Generation + File Upload.
#
# Notes for maintainers:
# * The app source lives in a regular (non-raw) string, so a doubled backslash
#   below is written to trustvault_app.py as a single backslash.
# * Login credentials are NOT stored in this notebook. Set TRUSTVAULT_USERNAME
#   and TRUSTVAULT_PASSWORD in os.environ before launching Streamlit; the app
#   refuses all logins until both are set.
# * The Q&A result is kept in st.session_state because Streamlit reruns the
#   whole script on every widget interaction; a plain local variable would be
#   empty again by the time the download control is used.
streamlit_code = '''import streamlit as st
import importlib.util
import sys
import datetime
import hmac
import os

# Load shared_model.py manually
shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"
spec = importlib.util.spec_from_file_location("shared_model", shared_path)
shared_model = importlib.util.module_from_spec(spec)
sys.modules["shared_model"] = shared_model
spec.loader.exec_module(shared_model)

st.set_page_config(page_title="TrustVault LLM", layout="wide")
st.title("🤖 TrustVault Secure LLM")


# The script is rerun on every interaction, so cache the loaded models and
# build each of them once per server process instead of once per click.
@st.cache_resource
def get_summarizer():
    return shared_model.load_summarizer()


@st.cache_resource
def get_qa_pipeline():
    return shared_model.load_qa_pipeline()


@st.cache_resource
def get_text_generator():
    return shared_model.load_text_generator()


def _same(left, right):
    # Constant-time comparison; encode first so non-ASCII input is accepted.
    return hmac.compare_digest(left.encode("utf-8"), right.encode("utf-8"))


# --- Authentication ---
EXPECTED_USERNAME = os.environ.get("TRUSTVAULT_USERNAME", "")
EXPECTED_PASSWORD = os.environ.get("TRUSTVAULT_PASSWORD", "")

st.sidebar.header("🔐 Authentication")
username = st.sidebar.text_input("Username")
password = st.sidebar.text_input("Password", type="password")
# The click itself only triggers a rerun; the check below runs on every rerun.
st.sidebar.button("Login")

if not EXPECTED_USERNAME or not EXPECTED_PASSWORD:
    st.error("Login is not configured. Set TRUSTVAULT_USERNAME and TRUSTVAULT_PASSWORD before starting the app.")
    st.stop()

if not (_same(username, EXPECTED_USERNAME) and _same(password, EXPECTED_PASSWORD)):
    st.warning("Please enter valid credentials to access the app.")
    st.stop()

# --- Privacy Toggle ---
privacy_training = st.sidebar.toggle("🛡️ Private Training Mode", value=True)
st.sidebar.markdown(f"Privacy Mode is {'ON 🔒' if privacy_training else 'OFF 🌐'}")

# --- Mode selection ---
mode = st.sidebar.selectbox("Select Feature", ["Summarization", "Q&A", "Text Generation"])

# --- Summarization ---
if mode == "Summarization":
    st.header("📄 Text Summarization")
    input_text = st.text_area("Enter the text you want to summarize:", height=250)
    if st.button("Summarize"):
        if not input_text.strip():
            st.warning("⚠️ Please enter some text to summarize.")
        else:
            with st.spinner("Loading summarizer..."):
                model, tokenizer = get_summarizer()
                summary = shared_model.summarize_text(input_text, model, tokenizer)
                st.subheader("📝 Summary:")
                st.success(summary)

# --- Q&A ---
elif mode == "Q&A":
    st.header("❓ Question Answering")

    st.markdown("### 📂 Upload a .txt file for context")
    uploaded_file = st.file_uploader("Upload context file (.txt)", type=["txt"])
    context = ""

    if uploaded_file is not None:
        # getvalue() returns the whole upload regardless of the read position;
        # undecodable bytes are replaced instead of raising on non-UTF-8 files.
        context = uploaded_file.getvalue().decode("utf-8", errors="replace")
        st.text_area("📘 Context from uploaded file:", value=context, height=200)

    context_input = st.text_area("Or manually enter context:", value=context, height=200, key="manual_context")
    question = st.text_input("Enter your question:")
    # Prefer the manual box; fall back to the uploaded text when it is empty.
    effective_context = context_input if context_input.strip() else context

    if st.button("Get Answer"):
        with st.spinner("Finding answer..."):
            try:
                answer = shared_model.answer_question(effective_context, question, get_qa_pipeline())
            except Exception as e:
                # Drop any stale result so the download cannot offer an old answer.
                st.session_state.pop("qa_result", None)
                st.error(str(e))
            else:
                st.session_state["qa_result"] = {
                    "question": question,
                    "answer": answer,
                    "file_name": f"QA_Result_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                }

    # Read the result back from session state so it survives reruns.
    qa_result = st.session_state.get("qa_result")
    if qa_result:
        st.subheader("✅ Answer:")
        st.info(qa_result["answer"])

    st.markdown("### 💾 Download Q&A Result")
    if qa_result and qa_result["answer"].strip():
        result_text = f"Question: {qa_result['question']}\\n\\nAnswer: {qa_result['answer']}"
        st.download_button("📥 Download", result_text, file_name=qa_result["file_name"], mime="text/plain")
    else:
        st.warning("⚠️ No answer found. Run Q&A first.")

# --- Text Generation ---
elif mode == "Text Generation":
    st.header("🧠 Text Generation")
    prompt = st.text_area("Enter your prompt:", height=200)
    max_len = st.slider("Max output length", min_value=50, max_value=300, value=100, step=10)
    if st.button("Generate Text"):
        if not prompt.strip():
            st.warning("⚠️ Please enter a prompt first.")
        else:
            with st.spinner("Generating text..."):
                model, tokenizer = get_text_generator()
                output = shared_model.generate_text(prompt, model, tokenizer, max_length=max_len)
                st.subheader("📘 Generated Text:")
                st.success(output)
'''

# Save to Drive (explicit UTF-8 because the source contains non-ASCII characters)
interface_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/trustvault_app.py"
with open(interface_path, "w", encoding="utf-8") as f:
    f.write(streamlit_code)

print("✅ trustvault_app.py created at:", interface_path)


✅ trustvault_app.py created at: /content/drive/MyDrive/TrustVault/TrustVault_Federated/trustvault_app.py


In [99]:
!pip install streamlit pyngrok --quiet


In [100]:
# Find every directory named __pycache__ under TrustVault_Federated (at any
# depth) and delete it recursively; `-exec ... +` batches the matches into as
# few `rm -r` invocations as possible.
!find /content/drive/MyDrive/TrustVault/TrustVault_Federated -type d -name "__pycache__" -exec rm -r {} +


In [105]:
import streamlit as st
import importlib.util
import sys
import datetime
import hmac
import os

# Load shared_model.py manually from Drive
shared_path = "/content/drive/MyDrive/TrustVault/TrustVault_Federated/shared_model.py"
spec = importlib.util.spec_from_file_location("shared_model", shared_path)
shared_model = importlib.util.module_from_spec(spec)
sys.modules["shared_model"] = shared_model
spec.loader.exec_module(shared_model)

# --- User Authentication ---
st.set_page_config(page_title="TrustVault LLM", layout="wide")
st.title("🤖 TrustVault Secure LLM")

# Credentials are read from the environment rather than stored in the notebook.
# Set TRUSTVAULT_USERNAME and TRUSTVAULT_PASSWORD before launching; with either
# one missing, USERS is empty and every login attempt is rejected.
_env_username = os.environ.get("TRUSTVAULT_USERNAME")
_env_password = os.environ.get("TRUSTVAULT_PASSWORD")
USERS = {_env_username: _env_password} if _env_username and _env_password else {}


def login():
    """Render the sidebar login form and record a successful login.

    On a click of the Login button with a matching username/password pair,
    sets ``st.session_state.authenticated`` to True and stores the username;
    otherwise shows an error in the sidebar. Returns nothing.
    """
    st.sidebar.markdown("### 🔐 Login")
    username = st.sidebar.text_input("Username")
    password = st.sidebar.text_input("Password", type="password")
    if st.sidebar.button("Login"):
        expected = USERS.get(username)
        # Constant-time comparison; bytes so non-ASCII input is accepted.
        if expected is not None and hmac.compare_digest(
            password.encode("utf-8"), expected.encode("utf-8")
        ):
            st.session_state.authenticated = True
            st.session_state.username = username
        else:
            st.sidebar.error("❌ Invalid credentials")


if "authenticated" not in st.session_state:
    st.session_state.authenticated = False

# Gate the app on a real login: show the form until it succeeds and stop the
# script run for unauthenticated sessions.
if not st.session_state.authenticated:
    login()
if not st.session_state.authenticated:
    st.warning("Please log in to access the app.")
    st.stop()

# --- Sidebar ---
mode = st.sidebar.selectbox("Select Feature", ["Summarization", "Q&A", "Text Generation"])
privacy_mode = st.sidebar.toggle("🔒 Privacy Mode (Do not store user data)", value=True)

# --- Summarization ---
if mode == "Summarization":
    st.header("📄 Text Summarization")
    input_text = st.text_area("Enter the text you want to summarize:", height=250)
    if st.button("Summarize"):
        with st.spinner("Loading summarizer..."):
            model, tokenizer = shared_model.load_summarizer()
            summary = shared_model.summarize_text(input_text, model, tokenizer)
            st.subheader("📝 Summary:")
            st.success(summary)

# --- Q&A ---
elif mode == "Q&A":
    st.header("❓ Question Answering")

    st.markdown("### 📂 Upload a .txt file for context")
    uploaded_file = st.file_uploader("Upload context file (.txt)", type=["txt"])
    context = ""

    if uploaded_file is not None:
        context = uploaded_file.read().decode("utf-8")
        st.text_area("📘 Context from uploaded file:", value=context, height=200)

    context_input = st.text_area("Or manually enter context:", value=context, height=200, key="manual_context")
    question = st.text_input("Enter your question:")

    if st.button("Get Answer"):
        with st.spinner("Finding answer..."):
            qa_pipeline = shared_model.load_qa_pipeline()
            try:
                answer = shared_model.answer_question(context_input, question, qa_pipeline)
            except Exception as e:
                # Drop any stale result so the download cannot offer an old answer.
                st.session_state.pop("qa_result", None)
                st.error(str(e))
            else:
                # Streamlit reruns the script on every interaction, so the
                # result is kept in session state instead of a local variable.
                st.session_state["qa_result"] = {
                    "question": question,
                    "answer": answer,
                    "file_name": f"QA_Result_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
                }

    qa_result = st.session_state.get("qa_result")
    if qa_result:
        st.subheader("✅ Answer:")
        st.info(qa_result["answer"])

    st.markdown("### 💾 Download Q&A Result")
    if qa_result and qa_result["answer"].strip():
        # Real newlines between the two sections of the downloaded file.
        result_text = f"Question: {qa_result['question']}\n\nAnswer: {qa_result['answer']}"
        st.download_button("📥 Download", result_text, file_name=qa_result["file_name"], mime="text/plain")
    else:
        st.warning("⚠️ No answer found. Run Q&A first.")

# --- Text Generation ---
elif mode == "Text Generation":
    st.header("🧠 Text Generation")
    prompt = st.text_area("Enter your prompt:", height=200)
    max_len = st.slider("Max output length", min_value=50, max_value=300, value=100, step=10)
    if st.button("Generate Text"):
        with st.spinner("Generating text..."):
            model, tokenizer = shared_model.load_text_generator()
            output = shared_model.generate_text(prompt, model, tokenizer, max_length=max_len)
            st.subheader("📘 Generated Text:")
            st.success(output)

# --- Logging/Training Data (Optional) ---
if not privacy_mode:
    # Only a notice is shown here; this cell does not itself write any data.
    st.markdown("📝 Note: Logging enabled. Your inputs/outputs may be stored for training.")




In [106]:
# Register the ngrok authtoken without storing it in the notebook.
# The token is taken from the NGROK_AUTHTOKEN environment variable, or prompted
# for (hidden input) when that is unset. Any token that was previously
# committed in this notebook should be revoked in the ngrok dashboard.
import os
import subprocess
from getpass import getpass

ngrok_token = (os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")).strip()
if not ngrok_token:
    raise ValueError("An ngrok authtoken is required.")
# Remember it for the rest of the session so later cells do not prompt again.
os.environ["NGROK_AUTHTOKEN"] = ngrok_token

# Argument-list form: no shell is involved, so the token is never interpolated
# into a command line string.
completed = subprocess.run(
    ["ngrok", "config", "add-authtoken", ngrok_token],
    capture_output=True,
    text=True,
)
if completed.returncode != 0:
    # Report the failure without echoing the command (it contains the token).
    raise RuntimeError(
        f"ngrok config add-authtoken failed (exit {completed.returncode}): {completed.stderr.strip()}"
    )
print(completed.stdout.strip())

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [107]:
# Find every directory named __pycache__ under TrustVault_Federated (at any
# depth) and delete it recursively; `-exec ... +` batches the matches into as
# few `rm -r` invocations as possible.
!find /content/drive/MyDrive/TrustVault/TrustVault_Federated -type d -name "__pycache__" -exec rm -r {} +


In [108]:
import os
from pyngrok import ngrok, conf

# Clean up: kill previous streamlit/ngrok
!pkill streamlit
!pkill ngrok

# Disconnect ngrok sessions
try:
    ngrok.kill()
except:
    pass

print("✅ All previous ngrok & streamlit processes terminated.")




✅ All previous ngrok & streamlit processes terminated.


In [109]:
from pyngrok import ngrok
import time

# Set your authtoken only ONCE per session
ngrok.set_auth_token("2vPAruBIXPpRUDUe1f0G9Qd2weJ_5rp9nZkGPXhj9ZiPasao6")  # Replace with your actual token

# Start tunnel
public_url = ngrok.connect(addr=8501, proto="http")
print("🔗 Public Streamlit URL:", public_url)

# Wait a bit to let it settle
time.sleep(2)

# Start Streamlit
!streamlit run /content/drive/MyDrive/TrustVault/TrustVault_Federated/trustvault_app.py &>/dev/null &


🔗 Public Streamlit URL: NgrokTunnel: "https://e2bc-34-91-248-18.ngrok-free.app" -> "http://localhost:8501"
