In [7]:
# Secure ML Environment
class SecureMLEnvironment:
    """Environment to securely handle data within isolated sandboxes."""
    def __init__(self):
        self.sandbox = {}

    def isolate(self, data):
        """Isolate data to sandbox for secure handling."""
        data_id = f"data_{len(self.sandbox)}"
        self.sandbox[data_id] = data
        return data_id, self.sandbox[data_id]

secure_env = SecureMLEnvironment()

# Privacy Safeguard Functions
class PrivacySafeguard:
    """Static methods for data anonymization and encryption."""
    key = Fernet.generate_key()
    cipher = Fernet(key)  # Create cipher outside the class for security

    @staticmethod
    def anonymize(data):
        return np.array(['ANONYMIZED' for _ in data])

    @staticmethod
    def encrypt_data(data):
        return [PrivacySafeguard.cipher.encrypt(str(item).encode()).decode() for item in data]

# Learning Dataset Class
class LearningDataset(Dataset):
    """Custom dataset class for secure and private data handling in machine learning."""
    def __init__(self, data, targets, tokenizer, max_length=512):
        data = PrivacySafeguard.anonymize(PrivacySafeguard.encrypt_data(data))
        self.data = data
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length  # Consistent max_length across instances

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        label = self.targets[idx]
        tokens = self.tokenizer(
            ' '.join(map(str, item)),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.float32)
        }

# Model Setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Loading and preparing data

csv_file_path = './sample_data/learner_behavior_data7.csv'

def load_data(file_path):
    """Load data from a CSV file and isolate it in a secure environment."""
    df = pd.read_csv(file_path, chunksize=1000)  # Efficient data loading
    data = []
    targets = []
    for chunk in df:
        data.extend(chunk.iloc[:, :-4].values)
        targets.extend(chunk.iloc[:, -4:].values)
    data_id, isolated_data = secure_env.isolate(data)
    return isolated_data, targets

training_data, target_scores = load_data(csv_file_path)

# Data preparation
dataset = LearningDataset(training_data, target_scores, tokenizer)
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# Optimizer and device setup
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training and validation loop
def train_and_validate(model, train_loader, val_loader, device, num_epochs=1):
    """Train and validate the model."""
    personalized_val_loss = float('inf')
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save the personalized model
        if avg_val_loss < personalized_val_loss:
            personalized_val_loss = avg_val_loss
            model.save_pretrained('personalized-learning-model')
            tokenizer.save_pretrained('personalized-learning-model')

train_and_validate(model, train_loader, val_loader, device)

# Hyperparameter Tuning Function
def hyperparameter_tuning(data, targets, tokenizer, max_length=512):
    """
    Performs hyperparameter tuning for the machine learning model.

    Args:
        data: The training data.
        targets: The target labels.
        tokenizer: The tokenizer for text processing.
        max_length: The maximum sequence length for input data.

    Returns:
        The best hyperparameter configuration and the corresponding model.
    """

    def objective(params):
        """
        Objective function for hyperparameter optimization.

        Args:
            params: A dictionary of hyperparameter values.

        Returns:
            The validation loss of the model trained with the given hyperparameters.
        """

        # Create dataset and dataloaders
        dataset = LearningDataset(data, targets, tokenizer, max_length)
        train_size = int(0.75 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])

        # Create model and optimizer
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
        optimizer = torch.optim.AdamW(model.parameters(), lr=params['learning_rate'])

        # Train and validate the model
        best_val_loss = float('inf')
        for epoch in range(params['epochs']):
            model.train()
            total_train_loss = 0

            for batch in train_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                optimizer.zero_grad()
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                total_train_loss += loss.item()

            avg_train_loss = total_train_loss / len(train_loader)

            # Validation
            model.eval()
            total_val_loss = 0
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)

                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    total_val_loss += loss.item()

            avg_val_loss = total_val_loss / len(val_loader)

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss

        return {'loss': best_val_loss, 'status': STATUS_OK}

    # Define hyperparameter search space
    search_space = {
        'learning_rate': hp.loguniform('learning_rate', -5, -3),
        'batch_size': hp.choice('batch_size', [2, 4, 8]),
        'epochs': hp.choice('epochs', [3, 5, 10])
    }

    # Run hyperparameter tuning
    trials = Trials()
    best_params = fmin(objective, search_space, algo=tpe.suggest, max_evals=1, trials=trials)

    # Print best hyperparameters and validation loss
    print('Best hyperparameters:', best_params)
    print('Best validation loss:', trials.best_trial['result']['loss'])

    # Create and train the model with the best hyperparameters
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
    optimizer = torch.optim.AdamW(model.parameters(), lr=best_params['learning_rate'])
    train_and_validate(model, train_loader, val_loader, device, num_epochs=best_params['epochs'])

    return model, best_params  # Return both model and hyperparameters

# Call the hyperparameter tuning function
model, best_params = hyperparameter_tuning(training_data, target_scores, tokenizer)

# Note: Additional functions for averaging models or further uses could be added similarly.
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# Inference Function
def calculate_learning_measures(logins, time_spent, page_visits, search_queries, activity_completion, quiz_score, reactions_pos, reactions_neg, feedback):
    input_text = f"{logins} {time_spent} {page_visits} {search_queries} {activity_completion} {quiz_score} {reactions_pos} {reactions_neg} {feedback}"
    tokens = tokenizer(input_text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
    tokens = {key: value.to(device) for key, value in tokens.items()}
    with torch.no_grad():
        output = model(**tokens)
        scores = torch.sigmoid(output.logits).squeeze(0).cpu().numpy()

    return {
        'Conscientiousness': round(scores[0] * 10, 2),
        'Motivation': round(scores[1] * 10, 2),
        'Understanding': round(scores[2] * 10, 2),
        'Engagement': round(scores[3] * 10, 2)
    }

# Example call to the function to demonstrate its use
learning_scores = calculate_learning_measures(4, 9, 11, 5, 80.0, 84.0, 3, 2, 6)
print("Learning Measures Scores:", learning_scores)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1, Train Loss: -13.4922, Val Loss: -20.3699
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


100%|██████████| 1/1 [05:39<00:00, 339.59s/trial, best loss: -4435.16845703125]
Best hyperparameters: {'batch_size': 0, 'epochs': 2, 'learning_rate': 0.01851008248976466}
Best validation loss: -4435.16845703125


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2, Train Loss: -4.9585, Val Loss: -2.7928
Epoch 2/2, Train Loss: -4.6514, Val Loss: -2.7928
Learning Measures Scores: {'Conscientiousness': 5.16, 'Motivation': 6.17, 'Understanding': 5.98, 'Engagement': 6.17}
