In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Tokenize the classifier dataset

In [None]:
!pip install sentencepiece
!pip install transformers



In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
import torch


def my_tokenize(articles, labels, max_len):
    '''
    Tokenize a dataset of articles.

    Parameters:
      `articles` - List of articles, represented as strings.
        `labels` - List of labels for the corresponding articles.
       `max_len` - Truncate all of the articles down to this length.

    Returns:
      `input_ids` - All of the articles represented as lists of token IDs,
                    padded out to `max_len`, and cast as a PyTorch tensor.
         `labels` - The labels for the corresponding articles, formatted as
                    a PyTorch tensor.
      `attention_masks` - PyTorch tensor with the same dimensions as
                          `input_ids`. For each token, simply indicates whether
                           it is padding or not.
    '''
    # Tokenize all of the articles and map the tokens to thier word IDs.
    input_ids = []
    attention_masks = []

    print('Tokenizing {:,} articles...'.format(len(articles)))

    # For every article...
    for art in articles:

        # Report progress.
        if ((len(input_ids) % 100) == 0):
            print('  Tokenized {:,} articles.'.format(len(input_ids)))

        # `encode_plus` will:
        #   (1) Tokenize the article.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the article to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            art,                      # article to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_len,      # Pad & truncate all articles.
                            padding = 'max_length',
                            truncation = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                    )

        # Add the encoded article to the list.
        input_ids.append(encoded_dict['input_ids'])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Convert string labels to numeric values
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    numeric_labels = [label_mapping[label] for label in labels]

    # Convert the labels to a tensor
    b_labels = torch.tensor(numeric_labels).to(device)

    return input_ids, labels, attention_masks

In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/xian/sentences_with_sentiment.csv')
sentiment_mapping = {0: 'Neutral', 1: 'Positive', -1: 'Negative'}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)
df

Unnamed: 0,text,meta,sentence_id,sentiment
0,Yes.,{'company_name': 'Equitas Small Finance Bank L...,290E07F9-CC12-49A7-AFDA-90D8B990B3E7_001_049_0...,Neutral
1,Our solutions address those challenges by rapi...,"{'company_name': 'Cognyte Software Ltd', 'econ...",5FEED293-F3D6-4EB2-AF78-10F836DF18E1_000_002_0...,Positive
2,"On the film side, we are seeing an unprecedent...","{'company_name': 'Imax Corp', 'economic_sector...",F8CD4487-B16C-406B-AEB8-329507BAA839_000_002_0...,Positive
3,"And with that, I'd like to turn the call over ...","{'company_name': 'RadNet Inc', 'economic_secto...",35C8EA53-5EAC-4949-AD85-77E4E021A114_000_001_0...,Neutral
4,Welcome to our fourth quarter and full year 20...,"{'company_name': 'SB Financial Group Inc', 'ec...",D4DF925D-2B99-4B7A-8EB4-5F28BD9B4215_000_002_0...,Neutral
...,...,...,...,...
514,We'll work with our customers on their require...,"{'company_name': 'Meritor Inc', 'economic_sect...",AA183E6A-5BC6-4081-91E9-A03A0C3C289F_000_004_0...,Positive
515,I think that's fair.,"{'company_name': 'Genuine Parts Co', 'economic...",44629B65-7B84-46E8-85E6-D0FF3179698E_001_087_0...,Neutral
516,"Our customers have chemical plants, typically ...","{'company_name': 'TEL FSI Inc', 'economic_sect...",2117359B-FFB1-4BCF-B6B5-7BE3A91DFE13_001_035_0...,Neutral
517,I should probably also point out that we had o...,{'company_name': 'Multi-Fineline Electronix In...,3583F2E7-7827-43E4-9A57-4C2248CCB82C_001_075_0...,Positive


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split into training+validation and testing sets
X_temp, X_test, y_temp, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

# Split the training+validation set into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [None]:
train = pd.DataFrame({'sentence': X_temp, 'label': y_temp})
test = pd.DataFrame({'sentence': X_test, 'label': y_test})

In [None]:
train

Unnamed: 0,sentence,label
433,"In content operations, for example, we are hel...",Positive
517,I should probably also point out that we had o...,Positive
208,I can give you the swing on the circuit board ...,Neutral
332,IT Services business delivered a strong year o...,Positive
220,"Thanks, Neil.",Neutral
...,...,...
71,"In the large majority of cases, we're very hap...",Positive
106,"Thank you, Mark.",Neutral
270,So have the LTVs somehow fallen and you have a...,Negative
435,And again we believe that the market is up at ...,Positive


In [None]:
train.to_csv('/content/drive/MyDrive/xian/train_bert_trust_issue.csv', index=False)
test.to_csv('/content/drive/MyDrive/xian/test_bert_trust_issue.csv', index=False)

In [None]:
# Tokenize our entire training set.
sentences = train['sentence']
labels = train['label']

(train_input_ids,
 train_labels,
 train_attention_masks) = my_tokenize(sentences, labels, max_len = 500)


Tokenizing 415 articles...
  Tokenized 0 articles.
  Tokenized 100 articles.
  Tokenized 200 articles.
  Tokenized 300 articles.
  Tokenized 400 articles.


In [None]:
# Print sentence 0, now as a list of IDs.
print('Original: ', train.sentence.iloc[0])
print('Token IDs:', train_input_ids[0])

Original:  In content operations, for example, we are helping clients generate automated content, image, audio, and video.
Token IDs: tensor([  101,  1999,  4180,  3136,  1010,  2005,  2742,  1010,  2057,  2024,
         5094,  7846,  9699, 12978,  4180,  1010,  3746,  1010,  5746,  1010,
         1998,  2678,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
        

## Finetune RoBERTa

In [None]:
!pip install datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting tensorboard==2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl.metadata (1.9 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1 (from tensorboard==2.11)
  Downloading google_auth_oauthlib-0.4.6-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting tensorboard-data-server<0.7.0,>=0.6.0 (from tensorboard==2.11)
  Downloading tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl.metadata (1.1 kB)
Collecting tensorboard-plugin-wit>=1.6.0 (from tensorboard==2.11)
  Downloading tensorboard_plugin_wit-1.8.1-py3-none-any.whl.metadata (873 bytes)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (

In [None]:
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from sklearn.model_selection import train_test_split

from huggingface_hub import HfFolder, notebook_login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from sklearn.model_selection import train_test_split
from huggingface_hub import HfFolder, notebook_login



# Model ID and Repository ID
model_id = "roberta-base"
repository_id = "hzduuuu/roberta-base-sentiment"

# Load dataset
df = load_dataset("csv", data_files="/content/drive/MyDrive/xian/train_bert_trust_issue.csv")
train_test_split = df['train'].train_test_split(test_size=0.25)
train = train_test_split['train']
eval = train_test_split['test']

# Tokenize the datasets
tokenizer = AutoTokenizer.from_pretrained(model_id)

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", truncation=True)

train_dataset = train.map(tokenize_function, batched=True)
eval_dataset = eval.map(tokenize_function, batched=True)

# Create label2id and id2label mappings
labels_list = ['Negative', 'Neutral', 'Positive']
label2id = {label: i for i, label in enumerate(labels_list)}

# Convert string labels to numeric values using label2id mapping
def convert_labels(examples):
    examples['label'] = [label2id[label] for label in examples['label']]
    return examples

train_dataset = train_dataset.map(convert_labels, batched=True)
eval_dataset = eval_dataset.map(convert_labels, batched=True)

# Set the format of the datasets to be compatible with PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Load the model
model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=3)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token()
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

# Save the tokenizer and model locally
local_directory = "/content/drive/MyDrive/xian/roberta"
tokenizer.save_pretrained(local_directory)
model.save_pretrained(local_directory)

# Push to the Hugging Face hub
from huggingface_hub import HfApi, HfFolder

api = HfApi()
api.upload_folder(
    folder_path=local_directory,
    path_in_repo=".",
    repo_id=repository_id,
    repo_type="model"
)

# You can also push the results to the hub using the Trainer's method
trainer.create_model_card()
trainer.push_to_hub()

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.09,1.085302
2,1.0329,1.025457


Epoch,Training Loss,Validation Loss
1,1.09,1.085302
2,1.0329,1.025457
3,0.7433,0.806609
4,0.7679,0.7961
5,0.4994,0.818968


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/hzduuuu/roberta-base-sentiment/commit/0f305024286bcfb57c70e1f8c36fd570afe6cd4c', commit_message='End of training', commit_description='', oid='0f305024286bcfb57c70e1f8c36fd570afe6cd4c', pr_url=None, pr_revision=None, pr_num=None)

(Optional) Check GPU RAM

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-py3-none-any.whl size=7392 sha256=fc67374c44eb4ab11f18475c751c13812dc8716148eb17cd172664697eb8527e
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 11.4 GB  |     Proc size: 1.6 GB
GPU RAM Free: 41MB | Used: 15061MB | Util  98% | Total     15360MB


In [16]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import roc_auc_score
import torch

# Load test set
train_df = pd.read_csv('/content/drive/MyDrive/xian/train_bert_trust_issue.csv')
test_df = pd.read_csv('/content/drive/MyDrive/xian/test_bert_trust_issue.csv')

# Map string labels directly to integers for AUC calculation
reverse_label_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': 2}
train_df['label_encoded'] = train_df['label'].map(reverse_label_mapping)
test_df['label_encoded'] = test_df['label'].map(reverse_label_mapping)

# Load tokenizer and both models
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
raw_model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
finetuned_model = AutoModelForSequenceClassification.from_pretrained("hzduuuu/roberta-base-sentiment")

# Updated reverse label mapping for Hugging Face model output format
hf_label_mapping = {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}

import numpy as np

def compute_auc(model, test_data, tokenizer, device):
    # Initialize a pipeline for sentiment classification
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

    # Store the true labels and predicted probabilities
    true_labels = []
    pred_probs = []

    # Predict probabilities
    for _, row in test_data.iterrows():
        sentence, label = row['sentence'], row['label_encoded']
        true_labels.append(label)

        # Get predicted probabilities
        outputs = classifier(sentence)
        probs = [0, 0, 0]  # Adjust this list size based on number of classes

        for output in outputs:  # Process each dictionary in outputs
            class_label = output['label']
            if class_label in hf_label_mapping:  # Check if label exists in mapping
                class_index = hf_label_mapping[class_label]
                probs[class_index] = output['score']

        # Normalize the probabilities so they sum to 1
        prob_sum = sum(probs)
        if prob_sum > 0:  # Avoid division by zero
            probs = [p / prob_sum for p in probs]

        pred_probs.append(probs)

    # Calculate AUC (macro-average)
    return roc_auc_score(true_labels, pred_probs, multi_class='ovo')


# Check if GPU is available and set device
device = 0 if torch.cuda.is_available() else -1

# Calculate AUC for both models on training and testing sets
raw_auc_train = compute_auc(raw_model, train_df, tokenizer, device)
finetuned_auc_train = compute_auc(finetuned_model, train_df, tokenizer, device)
raw_auc_test = compute_auc(raw_model, test_df, tokenizer, device)
finetuned_auc_test = compute_auc(finetuned_model, test_df, tokenizer, device)

# Construct a 2x2 DataFrame with train and test AUC scores for each model
results = pd.DataFrame({
    "RoBERTa-raw": [raw_auc_train, raw_auc_test],
    "RoBERTa-finetuned": [finetuned_auc_train, finetuned_auc_test]
}, index=["train-AUC", "test-AUC"])

results


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,RoBERTa-raw,RoBERTa-finetuned
train-AUC,0.5,0.451991
test-AUC,0.5,0.450658


In [19]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import pandas as pd

# Load train and test sets
train_df = pd.read_csv('/content/drive/MyDrive/xian/train_bert_trust_issue.csv')
test_df = pd.read_csv('/content/drive/MyDrive/xian/test_bert_trust_issue.csv')

# Label encoding
reverse_label_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': 2}
train_df['label_encoded'] = train_df['label'].map(reverse_label_mapping)
test_df['label_encoded'] = test_df['label'].map(reverse_label_mapping)

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Function to compute sentence embeddings
def compute_embeddings(model, tokenizer, data, column_name):
    embeddings = []
    with torch.no_grad():
        for sentence in data["sentence"]:
            encoding = tokenizer.encode_plus(
                sentence,
                truncation=True,
                return_tensors='pt',
                add_special_tokens=True
            )
            outputs = model(encoding['input_ids'].to(device))
            # Mean pooling on the last hidden layer
            embedding = torch.mean(outputs['hidden_states'][-1], dim=1).cpu().numpy()[0]
            embeddings.append(embedding)
    data[column_name] = embeddings
    return data

# Load both models and tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
raw_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", output_hidden_states=True).to(device)
finetuned_model = AutoModelForSequenceClassification.from_pretrained("hzduuuu/roberta-base-sentiment", output_hidden_states=True).to(device)

# Calculate embeddings for both raw and fine-tuned models
train_df = compute_embeddings(raw_model, tokenizer, train_df, "embedding_raw")
train_df = compute_embeddings(finetuned_model, tokenizer, train_df, "embedding_finetuned")
test_df = compute_embeddings(raw_model, tokenizer, test_df, "embedding_raw")
test_df = compute_embeddings(finetuned_model, tokenizer, test_df, "embedding_finetuned")

# Function to evaluate embeddings with logistic regression
def evaluate_embedding(train_df, test_df, embedding_col):
    y_train = train_df['label_encoded']
    y_test = test_df['label_encoded']

    X_train = np.array(train_df[embedding_col].tolist())
    X_test = np.array(test_df[embedding_col].tolist())

    # Binarize labels for multiclass AUC
    y_train_bin = label_binarize(y_train, classes=[0, 1, 2])
    y_test_bin = label_binarize(y_test, classes=[0, 1, 2])

    # Fit logistic regression model with multi_class='ovr' for multiclass support
    lr = LogisticRegression(max_iter=1000, fit_intercept=True, multi_class='ovr')
    lr.fit(X_train, y_train)

    # Predict probabilities for each class
    pred_proba_train = lr.predict_proba(X_train)
    pred_proba_test = lr.predict_proba(X_test)

    # Calculate AUC with multiclass 'ovr'
    train_auc = roc_auc_score(y_train_bin, pred_proba_train, multi_class='ovr')
    test_auc = roc_auc_score(y_test_bin, pred_proba_test, multi_class='ovr')
    return train_auc, test_auc

# Evaluate and store results
results = {}
for embedding_col in ["embedding_raw", "embedding_finetuned"]:
    train_auc, test_auc = evaluate_embedding(train_df, test_df, embedding_col=embedding_col)
    results[embedding_col] = {"train_auc": train_auc, "test_auc": test_auc}

# Convert results to DataFrame for display
results_df = pd.DataFrame(results).T
results_df.columns = ["train-AUC", "test-AUC"]

results_df

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                     train-AUC  test-AUC
embedding_raw         0.984088  0.946429
embedding_finetuned   1.000000  0.923559


In [20]:
results_df

Unnamed: 0,train-AUC,test-AUC
embedding_raw,0.984088,0.946429
embedding_finetuned,1.0,0.923559
