In [None]:
import os
import random
import functools
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from peft import PeftModel, PeftConfig
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import wandb
# Authenticate this session with Weights & Biases.
wandb.login()

# Optionally log every trained model to W&B.
# The setting below is currently disabled (commented out).
# %env WANDB_LOG_MODEL=true

from huggingface_hub import notebook_login

# Authenticate this session with the Hugging Face Hub.
notebook_login()

In [None]:
# Method and Model Configuration
# These module-level settings are read by the cells below.
# ---------------------------------------------------------
entity = "rstern"  # presumably the W&B entity (user/team) that runs are logged under — confirm
debugg = False  # debug flag (note the spelling); not read by any cell visible here
model_name = 'mistralai/Mistral-7B-v0.1'  # Hugging Face model id of the base model
# ---------------------------------------------------------
model_config = "embeddings" # values noted so far: org_model, frozen; this run uses "embeddings"
augmented_data = True # True: load the balanced (augmented) CSVs, False: load the original CSVs
author_label_only = True # True: single author label ("a"), False: author + dataset labels ("b")
experiment_name = "emb_aug_a" # naming scheme: <model_config>_<augmented_data>_<author_label_only>
# model name (orphaned note — TODO: complete or remove)

In [None]:
# Detect whether we are running inside Google Colab.
# Only a failed import means "not in Colab"; any other error should surface
# instead of being silently swallowed by a bare `except`.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  from google.colab import drive
  import sys
  drive.mount('/content/drive')
  # sys.path.append('/content/drive/MyDrive/ucph/LP Project') # If working in collab change this path
  path = '/content/drive/MyDrive/ucph/LP Project/'
else:
  # Outside Colab the CSV files are read from the current working directory.
  path = ''

# Choose the augmented (balanced) or original split files once, so the
# Colab and local code paths cannot drift apart.
if augmented_data:
  train_file, val_file = 'balanced_train.csv', 'balanced_val.csv'
else:
  train_file, val_file = 'df_train.csv', 'df_validation.csv'

train_df = pd.read_csv(f'{path}{train_file}')
val_df = pd.read_csv(f'{path}{val_file}')

# shuffle dataset (fixed seed keeps the row order reproducible)
train_df = train_df.sample(frac=1, random_state=42)
val_df = val_df.sample(frac=1, random_state=42)
print(train_df.sample(5))

In [None]:
def create_multilabel(row):
  """Build the two-element label vector ``[author, dataset]`` for one row.

  The ``label_dataset`` value is clamped into the range [0, 1] before both
  entries are cast to ``int``.

  Args:
    row: Mapping-like row exposing ``"label_author"`` and ``"label_dataset"``.

  Returns:
    A NumPy array holding the author label followed by the clamped dataset label.
  """
  clamped_dataset = max(min(1, row["label_dataset"]), 0)
  author_flag = int(row["label_author"])
  dataset_flag = int(clamped_dataset)
  return np.array([author_flag, dataset_flag])

def create_singlelabel(row):
  """Build the one-element label vector ``[author]`` for one row.

  Args:
    row: Mapping-like row exposing ``"label_author"``.

  Returns:
    A NumPy array with the author label cast to ``int`` as its only entry.
  """
  author_flag = int(row["label_author"])
  return np.array([author_flag])

# Pick the per-row label builder together with the matching per-label weights.
if author_label_only:
  _build_label, label_weights = create_singlelabel, [1]
else:
  # weight author label heavier than topic change label
  _build_label, label_weights = create_multilabel, [2, 1]

# Attach a "label" column (one NumPy array per row) to both splits.
train_df["label"] = train_df.apply(_build_label, axis=1)
val_df["label"] = val_df.apply(_build_label, axis=1)

# Stack the per-row label arrays into one 2-D array per split.
y_train = np.stack(train_df["label"].values)
y_val = np.stack(val_df["label"].values)

# Paragraph texts of each pair, per split.
x_train_par1, x_train_par2 = (train_df[col].values for col in ("paragraph1", "paragraph2"))
x_val_par1, x_val_par2 = (val_df[col].values for col in ("paragraph1", "paragraph2"))


In [None]:

# Bundle both splits; each split pairs its own paragraphs with its own labels.
ds = DatasetDict({
    'train': Dataset.from_dict({'paragraph1': x_train_par1, 'paragraph2': x_train_par2, 'labels': y_train}),
    # The validation split must use y_val; it previously reused y_train,
    # which attaches training labels to validation paragraphs.
    'val': Dataset.from_dict({'paragraph1': x_val_par1, 'paragraph2': x_val_par2, 'labels': y_val})
})


In [None]:
"""import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')
model = AutoModel.from_pretrained('mistralai/Mistral-7B-v0.1')
# model.to('cuda')

max_length = 4096

batch_dict = tokenizer(documents, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)

batch_dict['input_ids'] = [input_ids + [tokenizer.eos_token_id] for input_ids in batch_dict['input_ids']]
batch_dict = tokenizer.pad(batch_dict, padding=True, return_attention_mask=True, return_tensors='pt')
# batch_dict.to('cuda')

outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

embeddings = F.normalize(embeddings, p=2, dim=1)
embeddings = embeddings.tolist()
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import DatasetDict, Dataset

# Define the model architecture for the classifier
class Classifier(nn.Module):
    """Three-layer feed-forward head over a pair of concatenated embeddings.

    Args:
        input_size: Dimensionality of ONE embedding. The first layer takes
            ``input_size * 2`` features, i.e. two embeddings concatenated
            along the feature axis.
        num_classes: Number of output logits. Defaults to 2 (binary
            classification), which matches the previous hard-coded width.
    """

    def __init__(self, input_size, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_size * 2, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) logits of shape ``(batch, num_classes)``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

def last_token_pool(last_hidden_states, attention_mask):
    """Select one hidden-state vector per sequence: that of its last attended token.

    Args:
        last_hidden_states: Tensor indexed as ``[batch, position, ...]``.
        attention_mask: Tensor indexed as ``[batch, position]``; its per-row sum
            is used as the sequence length.

    Returns:
        The hidden state at the final position for every row when the mask's
        last column sums to the batch size (every row attends to the final
        position, e.g. left padding); otherwise the hidden state at index
        ``mask_row_sum - 1`` for each row.
    """
    num_rows = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == num_rows:
        return last_hidden_states[:, -1]

    # At least one row ends in padding: gather each row's own last position.
    last_positions = attention_mask.sum(dim=1) - 1
    row_indices = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_indices, last_positions]


# Initialize tokenizer and model from the model id set in the configuration cell
# (`model_name`), so the id is defined in exactly one place.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The Mistral tokenizer ships without a padding token, and any padded
# tokenizer call raises without one. Reuse EOS as the pad token; padded
# positions are masked out by the attention mask anyway.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModel.from_pretrained(model_name)

# Define batch size and other parameters
batch_size = 16    # paragraph pairs per batch
max_length = 4096  # token limit per paragraph; longer texts are truncated

# Prepare DataLoader for training and validation
def collate_fn(batch):
    """Turn a list of dataset rows into paragraph embeddings plus targets.

    Args:
        batch: List of row dicts with the keys 'paragraph1', 'paragraph2' and
            'labels', as yielded by the DataLoader over a split of `ds`.

    Returns:
        Tuple ``(embeddings1, embeddings2, labels)``. The embeddings are the
        last-token hidden states of the encoder for the first and second
        paragraph of every pair (on the encoder's device). ``labels`` is a 1-D
        long tensor with the FIRST entry of each row's 'labels' vector (the
        author label), which is the class-index format CrossEntropyLoss
        expects. A second label entry, if present, is not used by this
        binary classifier.
    """
    def _embed(texts):
        # Pad only to the longest text in the batch; padding every batch to
        # `max_length` tokens would multiply the encoder cost for no gain.
        inputs = tokenizer(texts, padding=True, max_length=max_length, truncation=True, return_tensors='pt')
        # Inputs must live on the same device as the encoder.
        inputs = inputs.to(model.device)
        outputs = model(**inputs)
        return last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])

    # The DataLoader hands over a list of rows, not a dict of columns.
    texts1 = [row['paragraph1'] for row in batch]
    texts2 = [row['paragraph2'] for row in batch]

    # The encoder is frozen: no gradients are needed for the embeddings.
    with torch.no_grad():
        embeddings1 = _embed(texts1)
        embeddings2 = _embed(texts2)

    labels = torch.tensor([row['labels'][0] for row in batch], dtype=torch.long)
    return embeddings1, embeddings2, labels

train_loader = DataLoader(ds['train'], batch_size=batch_size, collate_fn=collate_fn)
val_loader = DataLoader(ds['val'], batch_size=batch_size, collate_fn=collate_fn)

# Instantiate the classifier model
classifier = Classifier(model.config.hidden_size)

# Initialize wandb under the entity from the configuration cell
# (the variable `entity`, not the literal string 'entity').
wandb.init(project='lp2-embeddings', entity=entity)

# Number of passes over the training data. It was previously undefined;
# 5 is a placeholder to tune.
num_epochs = 5

# Log hyperparameters
config = wandb.config
config.batch_size = batch_size
config.max_length = max_length
config.num_epochs = num_epochs

# Training loop
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classifier.to(device)

# NOTE(review): the frozen encoder is re-run on every batch of every epoch.
# Computing the embeddings once and caching them would avoid that repeated work.
for epoch in range(num_epochs):
    classifier.train()
    running_loss = 0.0
    num_batches = 0
    for batch_embeddings1, batch_embeddings2, batch_labels in train_loader:
        features = torch.cat((batch_embeddings1, batch_embeddings2), dim=1).to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        output = classifier(features)
        loss = criterion(output, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Mean loss over the epoch; also well-defined when the loader is empty.
    train_loss = running_loss / max(num_batches, 1)

    # Validation loop
    classifier.eval()
    val_labels = []
    val_predictions = []
    with torch.no_grad():
        for batch_embeddings1, batch_embeddings2, batch_labels in val_loader:
            features = torch.cat((batch_embeddings1, batch_embeddings2), dim=1).to(device)

            output = classifier(features)
            val_labels.extend(batch_labels.tolist())
            val_predictions.extend(torch.argmax(output, dim=1).cpu().tolist())

    # Calculate validation metrics
    val_f1 = f1_score(val_labels, val_predictions, average='macro')

    # Log metrics to wandb
    wandb.log({'epoch': epoch+1, 'loss': train_loss, 'val_f1': val_f1})

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss}, Validation F1: {val_f1}')

# Finish wandb run
wandb.finish()