In [1]:
# !pip install datasets
# !pip install transformers

In [17]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np

In [18]:
import json
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MultiLabelBinarizer

json_data={}

for iter in range(3):

    # Read the JSON data from the file
    with open('202'+str(iter)+'data.txt', 'r') as f:
        json_data[iter] = json.load(f)

# Create an empty dataframe with two columns: text and labels
df = pd.DataFrame(columns=['Text', 'labels'])

# Loop through each item in the JSON data and append a new row to the dataframe
for key in json_data.keys():
    for item in json_data[key]:
        df = df.append({
            'Text': item['text'],
            'labels': item['labels']
        }, ignore_index=True)

# Print the resulting dataframe

mlb = MultiLabelBinarizer()

labels_matrix = mlb.fit_transform(df['labels'])

# Create a new dataframe with the binary matrix and column names from the MultiLabelBinarizer
labels_df = pd.DataFrame(labels_matrix, columns=mlb.classes_)

# Concatenate the new dataframe with the original dataframe
df = pd.concat([df, labels_df], axis=1)

df['sum_one_hot'] = df.iloc[:, 2:].sum(axis=1)
df = df[df['sum_one_hot']>0]

df = df.drop(['labels', 'sum_one_hot'], axis=1)

dataset = Dataset.from_pandas(df)

In [19]:
# Define your labels
labels = ["Environmental Negative", "Environmental Neutral", "Environmental Positive", "Social Negative", "Social Neutral", 
          "Social Positive", "Governance Negative", "Governance Neutral", "Governance Positive"]

# Create label conversion dictionaries
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert the data dictionary into a list of dictionaries
#data_list = [{"ID": id, "Text": item["text"], **{label: int(label in item["labels"]) for label in labels}} for id, item in data.items()]

# Convert the data list into a Dataset
#dataset = Dataset.from_dict({k: [d[k] for d in data_list] for k in data_list[0]})

# Split dataset into training and validation sets
dataset = dataset.train_test_split(test_size=0.1)

# Preprocessing function
def preprocess_data(examples):
    text = examples["Text"]
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    labels_matrix = np.zeros((len(text), len(labels)))

    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

# Preprocess the dataset
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

#Evaluate function still in process
def evaluate(model, encoded_dataset):
    input_ids = encoded_dataset["input_ids"]
    attention_mask = encoded_dataset["attention_mask"]
    labels = encoded_dataset["labels"]

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits)

    threshold = 0.5
    predicted_labels = (probs >= threshold).int()

    f1 = f1_score(labels, predicted_labels, average="micro")
    return f1

loading configuration file config.json from cache at C:\Users\Risto Trajanov/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\Risto Trajanov/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4

In [20]:
#tokenized_inputs = encoded_dataset["test"]

# Define input texts
input_texts = [
    "Some companies are investing in green technologies and reducing their environmental impact.",
    "We are committed to supporting strong energy allies who promote democracy.",
    "Shareholders request the Company to set and publish medium- and long-term targets to reduce the greenhouse gas (GHG) emissions of the Company’s operations and energy products (Scope 1, 2, and 3) consistent with the goal of the Paris Climate Agreement: to limit global warming to well below 2°C above pre-industrial levels and to pursue efforts to limit the temperature increase to 1.5°C."
]

# Tokenize input texts
tokenized_inputs = tokenizer.batch_encode_plus(input_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

with torch.no_grad():
    logits = model(**tokenized_inputs)[0]

# Convert logits to probabilities using sigmoid
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits)

# Get predicted labels based on probabilities
threshold = 0.5
predicted_labels = []

for i in range(len(input_texts)):
    predictions = (probs[i] >= threshold).tolist()
    predicted_labels.append([labels[idx] for idx, value in enumerate(predictions) if value])

# Print predicted labels
for i in range(len(input_texts)):
    print(f"Input text: {input_texts[i]}")
    print(f"Predicted labels: {predicted_labels[i]}")

Input text: Some companies are investing in green technologies and reducing their environmental impact.
Predicted labels: ['Environmental Negative', 'Environmental Neutral', 'Social Neutral', 'Social Positive', 'Governance Positive']
Input text: We are committed to supporting strong energy allies who promote democracy.
Predicted labels: ['Environmental Negative', 'Environmental Neutral', 'Social Neutral', 'Social Positive', 'Governance Neutral', 'Governance Positive']
Input text: Shareholders request the Company to set and publish medium- and long-term targets to reduce the greenhouse gas (GHG) emissions of the Company’s operations and energy products (Scope 1, 2, and 3) consistent with the goal of the Paris Climate Agreement: to limit global warming to well below 2°C above pre-industrial levels and to pursue efforts to limit the temperature increase to 1.5°C.
Predicted labels: ['Environmental Neutral', 'Social Neutral', 'Social Positive', 'Governance Neutral', 'Governance Positive']


In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'Environmental Negative', 'Environmental Neutral', 'Environmental Positive', 'Governance Negative', 'Governance Neutral', 'Governance Positive', 'Social Negative', 'Social Neutral', 'Social Positive', '__index_level_0__'],
        num_rows: 611
    })
    test: Dataset({
        features: ['Text', 'Environmental Negative', 'Environmental Neutral', 'Environmental Positive', 'Governance Negative', 'Governance Neutral', 'Governance Positive', 'Social Negative', 'Social Neutral', 'Social Positive', '__index_level_0__'],
        num_rows: 68
    })
})

In [22]:
f1 = evaluate(model, encoded_dataset['test'])
print(f"Micro-averaged F1 score: {f1:.4f}")

Micro-averaged F1 score: 0.2169


In [23]:
import wandb

run = wandb.init(entity='trajanov', project='sentiment-esg')

In [24]:
# import gc
# gc.collect()
# torch.cuda.empty_cache()

###Training Process

In [25]:

# Set training arguments
args = TrainingArguments(
    f"bert-finetuned-custom-data",
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Metrics computation function
def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

# Create a trainer
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


# Train the model
trainer.train()

# Save the model
trainer.save_model("custom_multi_label_model")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
  0%|          | 0/770 [04:34<?, ?it/s]
***** Running training *****
  Num examples = 611
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 770
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  0%|          | 0/770 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 10%|█         | 77/770 [00:26<03:28,  3.32it/s]***** 

{'eval_loss': 0.33962491154670715, 'eval_f1': 0.4122137404580153, 'eval_roc_auc': 0.6345639878177637, 'eval_accuracy': 0.27941176470588236, 'eval_runtime': 0.8488, 'eval_samples_per_second': 80.11, 'eval_steps_per_second': 10.603, 'epoch': 1.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-77\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-77\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-77\special_tokens_map.json
 20%|██        | 154/770 [01:01<03:09,  3.25it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 20%|██        | 154/770 [01:02<03:09,  3.25it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-154
Configuration saved in bert-finetuned-custom-data\checkpoint-154\config.json


{'eval_loss': 0.2911434769630432, 'eval_f1': 0.4411764705882353, 'eval_roc_auc': 0.6487662378022251, 'eval_accuracy': 0.27941176470588236, 'eval_runtime': 0.8541, 'eval_samples_per_second': 79.62, 'eval_steps_per_second': 10.538, 'epoch': 2.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-154\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-154\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-154\special_tokens_map.json
 30%|███       | 231/770 [01:34<02:45,  3.26it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 30%|███       | 231/770 [01:35<02:45,  3.26it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-231
Configuration saved in bert-finetuned-custom-data\checkpoint-231\config.json


{'eval_loss': 0.25965210795402527, 'eval_f1': 0.5975609756097561, 'eval_roc_auc': 0.7422462552054199, 'eval_accuracy': 0.36764705882352944, 'eval_runtime': 0.854, 'eval_samples_per_second': 79.626, 'eval_steps_per_second': 10.539, 'epoch': 3.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-231\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-231\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-231\special_tokens_map.json
 40%|████      | 308/770 [02:08<02:23,  3.22it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 40%|████      | 308/770 [02:08<02:23,  3.22it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-308
Configuration saved in bert-finetuned-custom-data\checkpoint-308\config.json


{'eval_loss': 0.23677203059196472, 'eval_f1': 0.6666666666666667, 'eval_roc_auc': 0.7758717135931381, 'eval_accuracy': 0.4264705882352941, 'eval_runtime': 0.8718, 'eval_samples_per_second': 78.003, 'eval_steps_per_second': 10.324, 'epoch': 4.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-308\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-308\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-308\special_tokens_map.json
 50%|█████     | 385/770 [02:44<02:43,  2.35it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 50%|█████     | 385/770 [02:45<02:43,  2.35it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-385
Configuration saved in bert-finetuned-custom-data\checkpoint-385\config.json


{'eval_loss': 0.22581231594085693, 'eval_f1': 0.7058823529411764, 'eval_roc_auc': 0.8062029958356641, 'eval_accuracy': 0.4852941176470588, 'eval_runtime': 0.9359, 'eval_samples_per_second': 72.66, 'eval_steps_per_second': 9.617, 'epoch': 5.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-385\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-385\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-385\special_tokens_map.json
 60%|██████    | 462/770 [03:23<01:36,  3.20it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 60%|██████    | 462/770 [03:24<01:36,  3.20it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-462
Configuration saved in bert-finetuned-custom-data\checkpoint-462\config.json


{'eval_loss': 0.21450766921043396, 'eval_f1': 0.7428571428571429, 'eval_roc_auc': 0.8330847162657717, 'eval_accuracy': 0.5588235294117647, 'eval_runtime': 0.8707, 'eval_samples_per_second': 78.094, 'eval_steps_per_second': 10.336, 'epoch': 6.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-462\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-462\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-462\special_tokens_map.json
 65%|██████▍   | 500/770 [03:45<02:11,  2.05it/s]

{'loss': 0.2534, 'learning_rate': 7.012987012987014e-06, 'epoch': 6.49}


 70%|███████   | 539/770 [03:59<01:13,  3.14it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 70%|███████   | 539/770 [04:00<01:13,  3.14it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-539
Configuration saved in bert-finetuned-custom-data\checkpoint-539\config.json


{'eval_loss': 0.2099231779575348, 'eval_f1': 0.7455621301775148, 'eval_roc_auc': 0.8261855926409347, 'eval_accuracy': 0.5588235294117647, 'eval_runtime': 0.8789, 'eval_samples_per_second': 77.37, 'eval_steps_per_second': 10.24, 'epoch': 7.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-539\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-539\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-539\special_tokens_map.json
 80%|████████  | 616/770 [04:32<00:48,  3.18it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8

 80%|████████  | 616/770 [04:33<00:48,  3.18it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-616
Configuration saved in bert-finetuned-custom-data\checkpoint-616\config.json


{'eval_loss': 0.21168552339076996, 'eval_f1': 0.7570621468926554, 'eval_roc_auc': 0.8438374044378147, 'eval_accuracy': 0.5588235294117647, 'eval_runtime': 0.8882, 'eval_samples_per_second': 76.56, 'eval_steps_per_second': 10.133, 'epoch': 8.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-616\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-616\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-616\special_tokens_map.json
 90%|█████████ | 693/770 [05:06<00:24,  3.18it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8
                                                 
 90%|█████████ | 693/770 [05:07<00:24,  3.18it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-693
Configuration saved in bert-finetuned-custom-data\checkpoint-693\config.json


{'eval_loss': 0.20493310689926147, 'eval_f1': 0.7586206896551724, 'eval_roc_auc': 0.8403878426253962, 'eval_accuracy': 0.5882352941176471, 'eval_runtime': 0.8885, 'eval_samples_per_second': 76.536, 'eval_steps_per_second': 10.13, 'epoch': 9.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-693\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-693\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-693\special_tokens_map.json
100%|██████████| 770/770 [05:40<00:00,  3.11it/s]***** Running Evaluation *****
  Num examples = 68
  Batch size = 8
                                                 
100%|██████████| 770/770 [05:41<00:00,  3.11it/s]Saving model checkpoint to bert-finetuned-custom-data\checkpoint-770
Configuration saved in bert-finetuned-custom-data\checkpoint-770\config.json


{'eval_loss': 0.2071578949689865, 'eval_f1': 0.7514450867052023, 'eval_roc_auc': 0.8350114985393747, 'eval_accuracy': 0.5735294117647058, 'eval_runtime': 0.9091, 'eval_samples_per_second': 74.802, 'eval_steps_per_second': 9.9, 'epoch': 10.0}


Model weights saved in bert-finetuned-custom-data\checkpoint-770\pytorch_model.bin
tokenizer config file saved in bert-finetuned-custom-data\checkpoint-770\tokenizer_config.json
Special tokens file saved in bert-finetuned-custom-data\checkpoint-770\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-finetuned-custom-data\checkpoint-693 (score: 0.7586206896551724).
100%|██████████| 770/770 [05:48<00:00,  2.21it/s]
Saving model checkpoint to custom_multi_label_model
Configuration saved in custom_multi_label_model\config.json


{'train_runtime': 348.6251, 'train_samples_per_second': 17.526, 'train_steps_per_second': 2.209, 'train_loss': 0.2074269183270343, 'epoch': 10.0}


Model weights saved in custom_multi_label_model\pytorch_model.bin
tokenizer config file saved in custom_multi_label_model\tokenizer_config.json
Special tokens file saved in custom_multi_label_model\special_tokens_map.json


# Upload the model to huggingface hub

In [26]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\Risto Trajanov\.cache\huggingface\token
Login successful


In [31]:
model.push_to_hub('bert-esg')

Configuration saved in C:\Users\RISTOT~1\AppData\Local\Temp\tmp6w5di3i3\config.json
Model weights saved in C:\Users\RISTOT~1\AppData\Local\Temp\tmp6w5di3i3\pytorch_model.bin
Uploading the following files to TrajanovRisto/bert-esg: config.json,pytorch_model.bin
pytorch_model.bin: 100%|██████████| 438M/438M [00:54<00:00, 8.10MB/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:54<00:00, 54.10s/it]


CommitInfo(commit_url='https://huggingface.co/TrajanovRisto/bert-esg/commit/87f83e77a864d67b4c12101642f46c501d9ce31e', commit_message='Upload BertForSequenceClassification', commit_description='', oid='87f83e77a864d67b4c12101642f46c501d9ce31e', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
tokenizer.push_to_hub('bert-esg')

tokenizer config file saved in C:\Users\RISTOT~1\AppData\Local\Temp\tmpuiz_09xn\tokenizer_config.json
Special tokens file saved in C:\Users\RISTOT~1\AppData\Local\Temp\tmpuiz_09xn\special_tokens_map.json
Uploading the following files to TrajanovRisto/bert-esg: special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.txt


CommitInfo(commit_url='https://huggingface.co/TrajanovRisto/bert-esg/commit/38f5c87f95d3e294a0b78b82545f28cbded95dd0', commit_message='Upload tokenizer', commit_description='', oid='38f5c87f95d3e294a0b78b82545f28cbded95dd0', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
dataset.push_to_hub('esg-sentiment')

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.40ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
Pushing split test to the Hub.
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 163.45ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


In [None]:
#tokenized_inputs = encoded_dataset["test"]

# Define input texts
input_texts = [
    "Some companies are investing in green technologies and reducing their environmental impact.",
    "We are committed to supporting strong energy allies who promote democracy.",
    "Shareholders request the Company to set and publish medium- and long-term targets to reduce the greenhouse gas (GHG) emissions of the Company’s operations and energy products (Scope 1, 2, and 3) consistent with the goal of the Paris Climate Agreement: to limit global warming to well below 2°C above pre-industrial levels and to pursue efforts to limit the temperature increase to 1.5°C."
]

# Tokenize input texts
input_tensor = tokenizer.batch_encode_plus(input_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

device = model.device

input_tensor = {k: v.to(device) for k, v in input_tensor.items()}

with torch.no_grad():
    logits = model(**input_tensor)[0]

# Convert logits to probabilities using sigmoid
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits)

# Get predicted labels based on probabilities
threshold = 0.5
predicted_labels = []

for i in range(len(input_texts)):
    predictions = (probs[i] >= threshold).tolist()
    predicted_labels.append([labels[idx] for idx, value in enumerate(predictions) if value])

# Print predicted labels
for i in range(len(input_texts)):
    print(f"Input text: {input_texts[i]}")
    print(f"Predicted labels: {predicted_labels[i]}")

Input text: Some companies are investing in green technologies and reducing their environmental impact.
Predicted labels: ['Environmental Positive']
Input text: We are committed to supporting strong energy allies who promote democracy.
Predicted labels: ['Social Positive', 'Governance Positive']
Input text: Shareholders request the Company to set and publish medium- and long-term targets to reduce the greenhouse gas (GHG) emissions of the Company’s operations and energy products (Scope 1, 2, and 3) consistent with the goal of the Paris Climate Agreement: to limit global warming to well below 2°C above pre-industrial levels and to pursue efforts to limit the temperature increase to 1.5°C.
Predicted labels: ['Environmental Positive', 'Governance Positive']


In [None]:
from google.colab import files
import os

# Define the path of the model
model_path = "custom_multi_label_model"

# Zip the model files
os.system(f"tar -czf {model_path}.tar.gz {model_path}")

# Download the zipped model
files.download(f"{model_path}.tar.gz")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Experimental

In [None]:
# Read the JSON data from the file
with open('2022data.txt', 'r') as f:
    json_data_eval = json.load(f)

# Create an empty dataframe with two columns: text and labels
df_eval = pd.DataFrame(columns=['Text', 'labels'])

# Loop through each item in the JSON data and append a new row to the dataframe
for item in json_data_eval:
    df_eval = df_eval.append({
        'Text': item['text'],
        'labels': item['labels']
    }, ignore_index=True)

# Print the resulting dataframe

mlb = MultiLabelBinarizer()

labels_matrix = mlb.fit_transform(df_eval['labels'])

# Create a new dataframe with the binary matrix and column names from the MultiLabelBinarizer
labels_df_eval = pd.DataFrame(labels_matrix, columns=mlb.classes_)

# Concatenate the new dataframe with the original dataframe
df_eval = pd.concat([df_eval, labels_df_eval], axis=1)

df_eval['sum_one_hot'] = df_eval.iloc[:, 2:].sum(axis=1)
df_eval = df_eval[df_eval['sum_one_hot']>0]

df_eval = df_eval.drop(['labels', 'sum_one_hot'], axis=1)

dataset_eval = Dataset.from_pandas(df_eval)

In [None]:
# Test the model on a new example
example_text = "We are committed to supporting strong energy allies who promote democracy."
input_tensor = tokenizer.encode_plus(example_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Move the input tensor to the same device as the model
input_tensor = {k: v.to(device) for k, v in input_tensor.items()}

logits = model(**input_tensor).logits
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits)

threshold = 0.45
predictions = (probs >= threshold).tolist()[0]

predicted_labels = [labels[idx] for idx, value in enumerate(predictions) if value]

print(predicted_labels)

['Social Positive', 'Governance Positive']
