In [None]:
# Install the Hugging Face stack used by the cells below. The recorded run
# resolved transformers 4.33.3, accelerate 0.23.0, datasets 2.14.5 and
# evaluate 0.4.0 (versions are not pinned by this command).
!pip3 install transformers accelerate datasets evaluate
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every random number generator below.

    Side effects:
        Sets the ``PYTHONHASHSEED`` environment variable and mutates the global
        ``torch.backends.cudnn`` flags.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the running kernel's hash seed is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it
    # has to be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE(review): absolute Google Drive path -- assumes Drive is mounted at
# /content/drive before this cell runs.
df = pd.read_json("/content/drive/MyDrive/FinNLP/Data/ML-ESG-2_English_Train.json")

# Fixed 80/20 hold-out split.
# NOTE(review): the split is not stratified although the labels are imbalanced;
# consider stratify=df['impact_type'] if every class has at least two rows.
train, test = train_test_split(df, test_size=0.2, random_state=42)
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Keep only the text column fed to the model and the target label.
column="news_content"
subset_columns = [column, 'impact_type']
# .copy() detaches the subsets from `df`, so the later in-place label encoding
# writes to independent frames instead of a slice (avoids SettingWithCopyWarning).
train = train[subset_columns].copy()
test = test[subset_columns].copy()

Train shape: (646, 4)
Test shape: (162, 4)


In [None]:
# Inspect the raw frame as loaded (all columns, before subsetting/encoding).
df

Unnamed: 0,URL,news_title,news_content,impact_type
0,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,ESG-focused financial technology company Arabe...,Opportunity
1,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,The company also announced the appointment of ...,Opportunity
2,https://www.esgtoday.com/arabesque-ai-appoints...,Arabesque AI Appoints Carolina Minio Paluello ...,Wong said: \n“Personalised portfolios demand ...,Opportunity
3,https://www.esgtoday.com/ukraine-war-inflation...,"Ukraine War, Inflation Reduction Act Driving F...",One of the key themes of the report is the imp...,Opportunity
4,https://www.esgtoday.com/eu-regulators-welcome...,"EU Regulators Welcome, Critique New European S...",Europe’s three primary financial regulatory ag...,Opportunity
...,...,...,...,...
803,https://www.esgtoday.com/cdp-lack-of-action-on...,CDP: Lack of Action on Water Risks Could Cost ...,"According to CDP, investors are becoming incre...",Opportunity
804,https://www.esgtoday.com/survey-investors-shif...,Survey: Investors Shifting to Offense on Clima...,O’Brien said: “Investors globally are increasi...,Opportunity
805,https://www.esgtoday.com/glencore-targets-net-...,"Glencore Targets Net Zero Emissions by 2050, S...","Ivan Glasenberg, Glencore Chief Executive Offi...",Opportunity
806,https://www.esgtoday.com/trafigura-commits-to-...,"Trafigura Commits to Buy 50,000 Tons of Carbon...",FMC was launched at the COP26 climate conferen...,Opportunity


In [None]:
# String labels -> integer class ids. Per the original note, the Chinese
# dataset already ships numeric labels, so numeric columns are passed through.
impact_type_mapping = {
    'Opportunity': 0,
    'Risk': 1,
}


def _encode_impact_type(frame):
    """Return a new frame whose 'impact_type' column holds integer class ids.

    Numeric columns are only cast to int, which makes the cell safe to re-run.
    String columns are translated through ``impact_type_mapping``.

    Raises:
        ValueError: if a label is missing from ``impact_type_mapping``.
    """
    labels = frame['impact_type']
    if pd.api.types.is_numeric_dtype(labels):
        return frame.assign(impact_type=labels.astype(int))
    encoded = labels.map(impact_type_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail with the offending values instead of an opaque NaN -> int error.
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unmapped impact_type labels: {unknown}")
    return frame.assign(impact_type=encoded.astype(int))


# assign() builds new frames, so nothing is written into a slice of `df`.
train = _encode_impact_type(train)
test = _encode_impact_type(test)


In [None]:
# Class balance of the encoded training labels.
train["impact_type"].value_counts()

0    560
1     86
Name: impact_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer
import numpy as np
def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    option="weighted"
    # zero_division=0 yields the same 0.0 score sklearn falls back to for a
    # class with no predicted/true samples, without the per-call warning.
    precision = precision_score(labels, predictions, average=option, zero_division=0)
    recall = recall_score(labels, predictions, average=option, zero_division=0)
    f1 = f1_score(labels, predictions, average=option, zero_division=0)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
# Both imports are kept from the original cell. The second one rebinds
# `Dataset`, so from here on `Dataset` is the Hugging Face `datasets` class.
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset

mlmpath="bert-base-cased"
print(mlmpath,type(mlmpath))
tokenizer = AutoTokenizer.from_pretrained(mlmpath)

# Size the classification head from the encoded labels instead of a hard-coded
# 5: labels are integer ids starting at 0, so the head needs max id + 1
# outputs. The floor of 2 keeps a single-class split from producing a
# one-output head.
num_labels = max(2, int(max(train['impact_type'].max(), test['impact_type'].max())) + 1)
model = BertForSequenceClassification.from_pretrained(mlmpath, num_labels=num_labels)

# Tokenize the text column; sequences are truncated and padded per split.
train_encodings = tokenizer(list(train[column]), truncation=True, padding=True)
test_encodings = tokenizer(list(test[column]), truncation=True, padding=True)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': list(train['impact_type']),  # integer class ids
})

# The held-out split doubles as the evaluation set during training.
dev_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': list(test['impact_type']),    # integer class ids
})

bert-base-cased <class 'str'>


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning hyper-parameters: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training finishes.
training_kwargs = {
    "output_dir": "ClassificationModel",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "num_train_epochs": 10,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_kwargs)

# Wire the model, tokenizer, datasets and metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [None]:
# Run the fine-tuning loop configured on the Trainer.
trainer.train()
# Score the resulting model on the Trainer's eval dataset (dev_dataset) and
# keep the metrics dict for display in a later cell.
evaluation_results = trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.506159,0.82716,0.684194,0.82716,0.748916
2,No log,0.275865,0.925926,0.924187,0.925926,0.924829
3,No log,0.338906,0.907407,0.916729,0.907407,0.892924
4,No log,0.304833,0.919753,0.916645,0.919753,0.915042
5,No log,0.352149,0.888889,0.896047,0.888889,0.891751
6,No log,0.365721,0.901235,0.904284,0.901235,0.902566
7,No log,0.380257,0.91358,0.910047,0.91358,0.910892
8,No log,0.389124,0.907407,0.90418,0.907407,0.905299
9,No log,0.387954,0.901235,0.898728,0.901235,0.899771
10,No log,0.389116,0.907407,0.906173,0.907407,0.906738


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Metrics dict returned by trainer.evaluate().
evaluation_results

{'eval_loss': 0.2758651673793793,
 'eval_accuracy': 0.9259259259259259,
 'eval_precision': 0.9241871962460197,
 'eval_recall': 0.9259259259259259,
 'eval_f1': 0.9248285322359395,
 'eval_runtime': 1.7145,
 'eval_samples_per_second': 94.487,
 'eval_steps_per_second': 3.5,
 'epoch': 10.0}

In [None]:
from sklearn.metrics import classification_report

# Predict once over the held-out set and take the arg-max class per row.
predictions = trainer.predict(dev_dataset)
y_pred = predictions.predictions.argmax(axis=1)

# Read the gold labels straight from the dataset column instead of looping
# over rows; the order matches the order used for prediction.
y_true = list(dev_dataset["labels"])

# zero_division=0 reports 0.00 for classes that are never predicted without
# emitting a warning per metric.
report = classification_report(y_true, y_pred, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


done


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# `report` is the text table built by classification_report; print() keeps its layout.
print(report)

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       110
           1       0.80      0.31      0.44        13
           2       0.00      0.00      0.00         8
           3       0.77      0.93      0.84       136
           4       0.00      0.00      0.00        13

    accuracy                           0.79       280
   macro avg       0.48      0.41      0.42       280
weighted avg       0.74      0.79      0.76       280

