In [None]:
!pip3 install transformers accelerate datasets evaluate
def seed_everything(seed: int):
    """Seed every RNG this notebook relies on and request deterministic cuDNN.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), then configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

seed_everything(42)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the Chinese ML-ESG-2 data and hold out 20% of the rows as a test split.
chinese_df = pd.read_csv("/content/drive/MyDrive/FinNLP/Data/ML-ESG-2_Chinese_Total.csv")
train_chinese, test_chinese = train_test_split(chinese_df, test_size=0.2, random_state=42)
print("Train shape:", train_chinese.shape)
print("Test shape:", test_chinese.shape)
# For the Chinese data the headline is used as the model input text.
column="news_headline"
subset_columns = [column, 'impact_type']
# .copy() makes the two-column frames independent of chinese_df, so the later
# in-place column rename acts on a real frame rather than on a slice.
train_chinese = train_chinese[subset_columns].copy()
test_chinese = test_chinese[subset_columns].copy()

Train shape: (1120, 5)
Test shape: (280, 5)


In [None]:
# Label distribution over the full Chinese frame (train and test rows together).
chinese_df["impact_type"].value_counts()

3    659
0    596
1     64
4     55
2     26
Name: impact_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the English ML-ESG-2 training file.
eng_df = pd.read_json("/content/drive/MyDrive/FinNLP/Data/ML-ESG-2_English_Train.json")
# String label -> integer class id (also used for the French data further down).
impact_type_mapping = {
    'Opportunity': 0,
    'Risk': 1,
}
# Series.map() yields NaN for labels missing from the mapping; check explicitly
# so the failure names the offending labels instead of surfacing as an opaque
# NaN-to-int cast error.
_eng_labels = eng_df['impact_type'].map(impact_type_mapping)
if _eng_labels.isna().any():
    _eng_unknown = sorted(eng_df.loc[_eng_labels.isna(), 'impact_type'].astype(str).unique())
    raise ValueError(f"Unmapped impact_type labels in English data: {_eng_unknown}")
eng_df['impact_type'] = _eng_labels.astype(int)
# Hold out 20% of the rows as the English test split.
train_eng, test_eng = train_test_split(eng_df, test_size=0.2, random_state=42)
print("Train shape:", train_eng.shape)
print("Test shape:", test_eng.shape)
# Keep only the text and label columns.
column="news_content"
subset_columns = [column, 'impact_type']
train_eng = train_eng[subset_columns]
test_eng = test_eng[subset_columns]

Train shape: (646, 4)
Test shape: (162, 4)


In [None]:
# Label distribution over the full English frame after mapping to integer ids.
eng_df["impact_type"].value_counts()

0    694
1    114
Name: impact_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the French ML-ESG-2 training file.
fr_df = pd.read_json("/content/drive/MyDrive/FinNLP/Data/ML-ESG-2_French_Train.json")
# Reuses impact_type_mapping from the English cell. Series.map() yields NaN for
# labels missing from the mapping; check explicitly so the failure names the
# offending labels instead of surfacing as an opaque NaN-to-int cast error.
_fr_labels = fr_df['impact_type'].map(impact_type_mapping)
if _fr_labels.isna().any():
    _fr_unknown = sorted(fr_df.loc[_fr_labels.isna(), 'impact_type'].astype(str).unique())
    raise ValueError(f"Unmapped impact_type labels in French data: {_fr_unknown}")
fr_df['impact_type'] = _fr_labels.astype(int)
# Hold out 20% of the rows as the French test split.
train_fr, test_fr = train_test_split(fr_df, test_size=0.2, random_state=42)
print("Train shape:", train_fr.shape)
print("Test shape:", test_fr.shape)
# Keep only the text and label columns.
column="news_content"
subset_columns = [column, 'impact_type']
train_fr = train_fr[subset_columns]
test_fr=test_fr[subset_columns]

Train shape: (654, 4)
Test shape: (164, 4)


In [None]:
# Label distribution over the full French frame after mapping to integer ids.
fr_df["impact_type"].value_counts()

0    458
1    360
Name: impact_type, dtype: int64

In [None]:
# Give the Chinese text column the same name as the English/French frames so
# the three languages share one schema when they are concatenated.
# rename() returns a new frame; rebinding the name (instead of inplace=True on
# a frame obtained by column selection) keeps the cell idempotent and avoids
# mutating a slice.
train_chinese = train_chinese.rename(columns={'news_headline': 'news_content'})
test_chinese = test_chinese.rename(columns={'news_headline': 'news_content'})

In [None]:
# Display the English training split (text and label columns only).
train_eng

Unnamed: 0,news_content,impact_type
789,"According to WBCSD, the initiative started wit...",0
637,"According to the NGO, a similar proposal was f...",0
444,"UK fashion retailers ASOS, Boohoo and George a...",1
332,BMO GAM anticipates using these findings to he...,0
291,3M’s new goals include achieving carbon neutra...,0
...,...,...
71,Fidelity stated that engaged with several stra...,0
106,"Alongside the EU ETS deal, the agreement also ...",0
270,"As part of the ban on petrol and diesel cars, ...",0
435,JPMorgan Chase announced the launch of a serie...,0


In [None]:
# Display the French training split (text and label columns only).
train_fr

Unnamed: 0,news_content,impact_type
773,Des mesures qui ne ciblent pas les plus vulnér...,1
451,Les mégaprojets d’extraction de pétrole (Tilen...,1
338,"Ce projet de loi sur le nucléaire ""ne préempte...",1
580,110 000 foyers raccordés en autoconsommation\n...,0
722,La hausse des coûts de l'énergie est structure...,0
...,...,...
71,L’objectif est de permettre de trouver un équi...,0
106,"La pandémie a également créé ""une nouvelle réa...",1
270,"Selon Greenpeace, la France continue de s’appr...",1
435,"Outre le dossier énergétique, c’est un maratho...",0


In [None]:
# Pool the per-language splits into one multilingual train frame and one test frame.
# ignore_index=True: every language was split with the same random_state, so
# the frames carry overlapping row labels; without a fresh index the pooled
# frames would contain duplicate index values.
# NOTE(review): English/French labels are 0=Opportunity, 1=Risk, while the
# Chinese file ships integer codes 0-4 whose meaning is not visible here.
# Confirm that codes 0 and 1 mean the same thing in all three languages
# before training on the pooled label column.
train = pd.concat([train_chinese, train_eng, train_fr], axis=0, ignore_index=True)
test = pd.concat([test_chinese, test_eng, test_fr], axis=0, ignore_index=True)
# Chinese-only alternative:
#train=train_chinese
#test=test_chinese

In [None]:
# Row counts of the pooled train and test frames.
len(train),len(test)

(2420, 606)

In [None]:
# Label distribution of the pooled training frame.
train["impact_type"].value_counts()

0    1416
3     523
1     421
4      42
2      18
Name: impact_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer
import numpy as np
def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, predictions)
    option="weighted"
    # zero_division=0 scores a class that is never predicted (or never present)
    # as 0 without emitting UndefinedMetricWarning on every evaluation; the
    # returned numbers are the same as with the default "warn" behaviour.
    precision = precision_score(labels, predictions, average=option, zero_division=0)
    recall = recall_score(labels, predictions, average=option, zero_division=0)
    f1 = f1_score(labels, predictions, average=option, zero_division=0)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
from torch.utils.data import Dataset, DataLoader
# Checkpoint used for both the tokenizer and the classifier.
mlmpath="bert-base-multilingual-cased"
print(mlmpath,type(mlmpath))
tokenizer = AutoTokenizer.from_pretrained(mlmpath)
# Five output classes: the pooled training labels take the values 0-4.
model = BertForSequenceClassification.from_pretrained(mlmpath, num_labels=5)

# Name the text column explicitly instead of reading the global `column`,
# which is reassigned in each data-loading cell and therefore depends on which
# of those cells happened to run last.
text_column = "news_content"
train_encodings = tokenizer(list(train[text_column]), truncation=True, padding=True)
test_encodings = tokenizer(list(test[text_column]), truncation=True, padding=True)

# From here on `Dataset` refers to the Hugging Face `datasets` class (this
# import shadows any earlier `Dataset` name such as torch.utils.data.Dataset).
from datasets import Dataset
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': list(train['impact_type']),  # Replace 'label' with your label column name
})

# Held-out split used for per-epoch evaluation and the final report.
dev_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': list(test['impact_type']),    # Replace 'label' with your label column name
})

bert-base-multilingual-cased <class 'str'>


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training finishes.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="ClassificationModel",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Without an explicit metric the "best" checkpoint is chosen by eval loss,
    # which here is lowest after the first epoch even though F1 keeps
    # improving; select on the metric that is actually reported.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Keep only the most recent checkpoints (the best one is retained as well)
    # instead of writing ten full model copies to disk.
    save_total_limit=2,
)
# The Trainer ties together model, tokenizer, datasets and the metric callback.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the Trainer configured earlier.
trainer.train()
# Score the model on the Trainer's eval_dataset.
# NOTE(review): the training arguments set load_best_model_at_end=True, so
# these metrics presumably describe the reloaded "best" checkpoint rather than
# the final epoch — confirm that this is the checkpoint you intend to report.
evaluation_results = trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.637218,0.742574,0.711736,0.742574,0.721271
2,0.660000,0.727586,0.755776,0.746156,0.755776,0.716567
3,0.660000,0.973443,0.760726,0.743364,0.760726,0.749864
4,0.336600,1.159971,0.747525,0.738151,0.747525,0.740302
5,0.173500,1.33173,0.772277,0.748863,0.772277,0.759437
6,0.173500,1.495254,0.772277,0.75941,0.772277,0.765281
7,0.062000,1.661188,0.765677,0.748266,0.765677,0.754963
8,0.062000,1.711851,0.762376,0.746984,0.762376,0.754185
9,0.017400,1.710284,0.777228,0.761853,0.777228,0.767522
10,0.003500,1.740611,0.773927,0.758072,0.773927,0.764899


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Show the metrics dict returned by trainer.evaluate().
evaluation_results

{'eval_loss': 0.6372178196907043,
 'eval_accuracy': 0.7425742574257426,
 'eval_precision': 0.7117361736173617,
 'eval_recall': 0.7425742574257426,
 'eval_f1': 0.721271190933354,
 'eval_runtime': 11.8404,
 'eval_samples_per_second': 51.181,
 'eval_steps_per_second': 6.419,
 'epoch': 10.0}

In [None]:
from sklearn.metrics import classification_report

# Run inference on the held-out split. The prediction output carries the gold
# labels in the same order as the logits, so they are read from there instead
# of being rebuilt by iterating over dev_dataset row by row.
predictions = trainer.predict(dev_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)
print("done")
# zero_division=0 reports classes that are never predicted as 0.00 without
# emitting UndefinedMetricWarning; the table values are unchanged.
report = classification_report(y_true, y_pred, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


done


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Print the per-class precision/recall/F1 table held in `report`.
print(report)

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       332
           1       0.65      0.45      0.54       117
           2       0.00      0.00      0.00         8
           3       0.70      0.85      0.77       136
           4       0.00      0.00      0.00        13

    accuracy                           0.74       606
   macro avg       0.43      0.43      0.42       606
weighted avg       0.71      0.74      0.72       606

