In [None]:
%pip install transformers datasets scikit-learn torch

In [None]:
import json

# Read the synthetic samples from disk. JSON text is UTF-8, so the encoding is
# pinned explicitly rather than inherited from the platform default (which is
# not UTF-8 on every OS and would garble or reject non-ASCII sentences).
with open('data/synthetic.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(len(data), "samples loaded")

# Every record is indexed by its 'sentence' and 'risk' keys; the two lists
# stay aligned by position.
texts = [item['sentence'] for item in data]
risk_scores = [item['risk'] for item in data]

In [4]:
from transformers import AutoTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode the whole corpus in a single call and get PyTorch tensors back.
# Sequences are cut at 512 tokens and padded so all rows share one length.
encodings = tokenizer(
    texts,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

In [5]:
import torch
from torch.utils.data import Dataset

class riskDataset(Dataset):
    """Dataset pairing tokenized sentences with their risk targets.

    Args:
        encodings: Mapping holding 'input_ids' and 'attention_mask' entries,
            each indexable by sample position.
        risk_scores: Per-sample risk values, aligned by position with
            ``encodings``.
    """

    def __init__(self, encodings, risk_scores):
        self.risks = risk_scores
        self.encodings = encodings

    def __len__(self):
        # The target list defines how many samples there are.
        return len(self.risks)

    def __getitem__(self, idx):
        # Pull row `idx` out of each encoding field, then attach the target
        # as a float tensor.
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['risk'] = torch.tensor(self.risks[idx], dtype=torch.float)
        return sample

# Wrap the tokenized sentences and their risk targets in one indexable dataset
dataset = riskDataset(encodings=encodings, risk_scores=risk_scores)


In [6]:
from torch.utils.data import DataLoader

# Split data into train, validation, and test sets (80% / 10% / remainder)
total_size = len(dataset)
train_size = int(total_size * 0.8)
val_size = int(total_size * 0.1)
test_size = total_size - train_size - val_size

# Seed the split so the same samples land in each partition on every run;
# without a generator the held-out sets change between executions and
# reported metrics are not comparable. The seed value itself is arbitrary.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

# Create DataLoaders (only the training batches are shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [7]:
from transformers import AutoModel
import torch.nn as nn
import torch

class riskBert(nn.Module):
    """'bert-base-uncased' encoder topped with a one-unit regression head.

    The head's output is passed through a sigmoid before being returned.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(p=0.3)
        # Single output unit: one scalar risk score per input row
        self.risk_head = nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=1,
        )

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid-activated risk prediction for a batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Dropout is applied to the pooled representation before the head.
        head_output = self.risk_head(self.dropout(encoded.pooler_output))
        return torch.sigmoid(head_output)

In [8]:
from torch.optim import AdamW
from tqdm import tqdm
import torch.nn as nn

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = riskBert()
model = model.to(device)

# Mean squared error between predicted and target risk scores.
risk_loss_fn = nn.MSELoss()

optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
import copy

epochs = 10
train_risk_losses = []
val_risk_losses = []
test_risk_losses = []

best_val_loss = float('inf')
best_model_state = None


def _batch_risk_loss(batch):
    """Move one batch to `device`, run the model, and return the loss tensor."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # unsqueeze(1) adds a trailing dimension to the targets before they are
    # compared with the model output.
    risk = batch['risk'].float().unsqueeze(1).to(device)

    risk_pred = model(input_ids, attention_mask)
    return risk_loss_fn(risk_pred, risk)


def _mean_eval_loss(loader):
    """Return the average per-batch loss over `loader` in eval mode, without gradients."""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in loader:
            total_loss += _batch_risk_loss(batch).item()

    return total_loss / len(loader)


for epoch in range(epochs):
    model.train()
    total_risk_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        optimizer.zero_grad()

        risk_loss = _batch_risk_loss(batch)

        risk_loss.backward()
        optimizer.step()

        total_risk_train_loss += risk_loss.item()

    avg_train_loss = total_risk_train_loss / len(train_loader)
    train_risk_losses.append(avg_train_loss)

    # --- Validation ---
    avg_val_loss = _mean_eval_loss(val_loader)
    val_risk_losses.append(avg_val_loss)

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # state_dict() hands back references to the live parameter tensors, so
        # it must be deep-copied; otherwise later optimizer steps overwrite the
        # snapshot and the "best" state silently becomes the final-epoch state.
        best_model_state = copy.deepcopy(model.state_dict())

    print(f"Epoch {epoch+1} | "
          f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # --- Test Evaluation ---
    # Runs once per epoch so that test_risk_losses has one entry per epoch,
    # matching the train and validation histories.
    avg_test_loss = _mean_eval_loss(test_loader)
    test_risk_losses.append(avg_test_loss)

    print(f"Epoch {epoch+1}: Test Loss = {avg_test_loss:.4f}")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model.eval()

risk_true_test = []
risk_pred_test = []

# Walk the test set once with gradients disabled, gathering the targets and
# the matching predictions as plain Python floats.
with torch.no_grad():
    for batch in test_loader:
        risk = batch['risk'].to(device)

        risk_output = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        )
        risk_pred = risk_output.squeeze(1)  # drop dimension 1 of the model output

        risk_true_test += risk.cpu().tolist()
        risk_pred_test += risk_pred.cpu().tolist()

# Metrics
risk_mse = mean_squared_error(risk_true_test, risk_pred_test)
risk_mae = mean_absolute_error(risk_true_test, risk_pred_test)
r2 = r2_score(risk_true_test, risk_pred_test)

print(f"✅ Test risk MSE: {risk_mse:.4f}")
print(f"✅ Test risk MAE: {risk_mae:.4f}")
print(f"✅ Test risk R2: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt

# One curve per split, all drawn against the 1-based epoch number.
epoch_axis = range(1, epochs + 1)
for losses, label, marker in (
    (train_risk_losses, 'Train MSE', 'o'),
    (val_risk_losses, 'Val MSE', 's'),
    (test_risk_losses, 'Test MSE', '^'),
):
    plt.plot(epoch_axis, losses, label=label, marker=marker)

plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.title('📉 risk Prediction Loss Over Epochs')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [12]:
# Persist the state dict selected during training to disk.
torch.save(obj=best_model_state, f="best_model.pt")

In [None]:
import torch

# Resolve the target device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model architecture
model = riskBert()  # Your model class
# map_location remaps the stored tensors onto `device`; without it a checkpoint
# written from CUDA tensors cannot be loaded on a machine that has no GPU.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()  # Set to evaluation mode

model.to(device)

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def prepare_input(text):
    """Tokenize `text` and return its tensors on `device`.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output and moved to the module-level ``device``.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    encoding = tokenizer(text, **tokenizer_options)

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    return input_ids, attention_mask


In [15]:
def predict_risk(text):
    """Return the model's risk prediction for `text` as a Python float."""
    model_inputs = prepare_input(text)

    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        prediction = model(*model_inputs)

    # squeeze() strips the size-1 dimensions so item() can extract the scalar.
    return prediction.squeeze().item()


In [17]:
# Hand-written sample inputs for the prediction loop: short, mundane
# first-person statements about everyday life.
# NOTE(review): presumably these are all expected to score as low risk —
# confirm against the intended meaning of the 'risk' label.
test_sentences = [
    "The sky is blue today.",
    "I had cereal for breakfast.",
    "My window faces the street.",
    "There’s a new cafe near my office.",
    "My laptop battery died during a meeting.",
    "I watered the plants this morning.",
    "There’s a pigeon on the balcony again.",
    "I updated my apps last night.",
    "My desk is made of wood.",
    "I used a blue pen instead of black.",
    "My headphones were tangled again.",
    "The fridge light is oddly bright.",
    "I passed three traffic lights on green.",
    "My socks don’t match today.",
    "I folded laundry for an hour.",
    "The elevator was slow as usual.",
    "My shoes squeaked on the floor.",
    "The air conditioner is a bit loud.",
    "I sneezed three times in a row.",
    "I picked up a parcel from the post office.",
    "I sharpened two pencils this morning.",
    "The chair at my desk is slightly wobbly.",
    "I spilled some water on the counter.",
    "The calendar still says May.",
    "I forgot to close a tab on my browser.",
    "My coffee cup is chipped on the rim.",
    "I used the wrong remote for the TV.",
    "I heard a bird chirping outside.",
    "The light switch makes a clicking noise.",
    "I cleaned my keyboard with a tissue."
]

In [None]:
# Score the sample sentences one at a time (map is lazy, so each prediction is
# computed right before it is printed) and show each next to its input.
for text, risk in zip(test_sentences, map(predict_risk, test_sentences)):
    print(f"Input: {text}\nPredicted risk: {risk:.4f}\n")
