In [None]:
!pip3 install transformers accelerate datasets evaluate

Installing collected packages: tokenizers, safetensors, xxhash, dill, responses, multiprocess, huggingface-hub, transformers, datasets, evaluate, accelerate
Successfully installed accelerate-0.23.0 datasets-2.14.5 dill-0.3.7 evaluate-0.4.0 huggingface-hub-0.17.2 multiprocess-0.70.15 responses-0.18.0 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.2 xxhash-3.3.0


In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and every CUDA device), exports PYTHONHASHSEED, and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it must be
    # disabled when deterministic results are wanted.
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [None]:
from datasets import Dataset
import pandas as pd

# Sentence-level corpus used for the masked-language-model (MLM) adaptation step.
df = pd.read_json("/content/drive/MyDrive/FinNLP/sustainable.json")
dataset = Dataset.from_pandas(df)
# Pass an explicit seed: an unseeded split produces a different 80/20 partition on
# every run, which makes the validation losses below impossible to reproduce.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
dataset["train"][0]

{'sentence': 'The term ‘carbon neutral’ generally refers to the heavy reliance on carbon offsetting measures to reduce a company’s carbon footprint.',
 'label': 'unsustainable'}

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Keep the hub identifier under its own name. Previously `model` held the identifier
# string and was then overwritten with the model object, so re-running this cell passed
# a model instance to `from_pretrained` and failed.
model_checkpoint = "ProsusAI/Finbert"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at ProsusAI/Finbert and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
    """Tokenize the "sentence" column of a batch.

    Each entry of `examples["sentence"]` is normally a plain string and is passed to
    the tokenizer unchanged. Joining a string with " " would insert a space between
    every character, so only non-string entries (e.g. a list of text fragments) are
    joined into one string.

    Args:
        examples: Batch mapping that contains a "sentence" sequence.

    Returns:
        Whatever the module-level `tokenizer` returns for the list of texts.
    """
    texts = [
        entry if isinstance(entry, str) else " ".join(entry)
        for entry in examples["sentence"]
    ]
    return tokenizer(texts)

In [None]:
# Tokenize both splits in batches using 4 worker processes. The columns of the train
# split are removed from the result so that only the tokenizer's output columns remain.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/1812 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (656 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/453 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (780 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors


In [None]:
block_size = 128
def group_texts(examples):
    """Concatenate a batch of token sequences and re-split it into fixed-size chunks.

    Every column of `examples` is flattened into one long list, which is then cut
    into consecutive pieces of `block_size` items. When the flattened length reaches
    at least `block_size`, the trailing remainder that does not fill a whole chunk is
    dropped; a shorter batch is returned as a single undersized chunk.
    """
    # Flatten each column: list of lists -> one flat list.
    flattened = {
        column: [token for sequence in examples[column] for token in sequence]
        for column in examples.keys()
    }
    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    # Trim to a multiple of block_size (only when at least one full block exists).
    if usable_length >= block_size:
        usable_length -= usable_length % block_size
    chunk_starts = range(0, usable_length, block_size)
    return {
        column: [tokens[start : start + block_size] for start in chunk_starts]
        for column, tokens in flattened.items()
    }

In [None]:
# Re-pack the tokenized examples into chunks of `block_size` tokens with group_texts
# (applied per batch, using 4 worker processes).
lm_dataset = tokenized_data.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/1812 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/453 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator for masked-language modelling, configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
from transformers import TrainingArguments, Trainer
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The trailing semicolon stops the notebook from echoing the full module repr returned by .to().
model.to(device);

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
# Masked-language-model adaptation run: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="MLMTunedModel",
    evaluation_strategy="epoch",
    # Checkpoints are saved on the same schedule as evaluation.
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Train on the chunked train split and validate on the chunked test split; the
# collator supplied here prepares each batch for the MLM objective.
trainer = Trainer(
    tokenizer=tokenizer,
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)
# Explicitly place the trainer's model on the selected device before training.
trainer.model.to(device)
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.063874
2,No log,1.886474
3,2.300100,1.764656


TrainOutput(global_step=747, training_loss=2.156096717599565, metrics={'train_runtime': 230.2839, 'train_samples_per_second': 25.911, 'train_steps_per_second': 3.244, 'total_flos': 392635779494400.0, 'train_loss': 2.156096717599565, 'epoch': 3.0})

In [None]:
# Destination for the MLM-adapted checkpoint; the classification stage below loads
# its tokenizer and encoder weights from this same path.
mlmpath="/content/drive/MyDrive/FinNLP/SaveModels/French-PROSUSAI-FINBERT-MLM"
trainer.save_model(mlmpath)

In [None]:
import torch
# Ask PyTorch to release its cached, currently unused GPU memory before the next stage.
torch.cuda.empty_cache()

In [None]:
!nvidia-smi

Tue Sep 26 05:02:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    34W /  70W |   2613MiB / 15360MiB |     34%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

Fine-tuning for the classification task

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# NOTE(review): the file is named "Japanese_Paraphrased" but the rows shown below come
# from French articles and the checkpoints are saved under "French-..." names — confirm
# that this is the intended input file.
df = pd.read_csv("/content/drive/MyDrive/FinNLP/Data/Japanese_Paraphrased.csv")
# Shuffled 80/20 hold-out split with a fixed random_state so the partition is repeatable.
train, test = train_test_split(df, test_size=0.2, shuffle=True,random_state=42)
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (654, 6)
Test shape: (164, 6)


In [None]:
df

Unnamed: 0.1,Unnamed: 0,URL,news_title,news_content,impact_type,translated
0,0,https://www.novethic.fr/actualite/energie/mobi...,Chaos dans les transports publics : le pari ri...,"Journées de 13 heures, conflits à gérer, salai...",Risk,"13-hour days, conflicts to be managed, wages d..."
1,1,https://www.novethic.fr/actualite/energie/mobi...,Chaos dans les transports publics : le pari ri...,L'ouverture à la concurrence des transports en...,Risk,Opening to competition from Ile -de -France pu...
2,2,https://www.novethic.fr/actualite/energie/mobi...,Chaos dans les transports publics : le pari ri...,"""Les opérateurs en compétition contractent les...",Opportunity,"""Operators in competition contract costs to th..."
3,3,https://www.novethic.fr/actualite/energie/mobi...,Chaos dans les transports publics : le pari ri...,"Toutefois, les salariés s'inquiètent. ""Les opé...",Risk,"However, employees are worried. ""Competition o..."
4,4,https://www.novethic.fr/actualite/energie/mobi...,Chaos dans les transports publics : le pari ri...,"""La profession s'est tendue""\nLes conditions d...",Risk,"""The profession has stretched""\nWorking condit..."
...,...,...,...,...,...,...
813,813,https://www.novethic.fr/actualite/energie/ener...,"""La guerre en Ukraine devrait être le déclic p...","Le GNL, très émetteur de CO2\nOutre les problè...",Risk,"LNG, very CO2 transmitter\nIn addition to the ..."
814,814,https://www.novethic.fr/actualite/energie/ener...,"""La guerre en Ukraine devrait être le déclic p...","En octobre dernier, il a co-publié une analyse...",Risk,"Last October, he co-published a comparative an..."
815,815,https://www.novethic.fr/actualite/energie/ener...,"""La guerre en Ukraine devrait être le déclic p...","""Remplacer le gaz russe par le GNL américain, ...",Opportunity,"""Replacing Russian gas with American LNG, part..."
816,816,https://www.novethic.fr/actualite/energie/ener...,"""La guerre en Ukraine devrait être le déclic p...",L'industrie éolienne européenne en perte de vi...,Risk,"European wind industry losing speed\n""We must ..."


In [None]:
# Text column fed to the classifier, plus the label column.
column = "translated"
subset_columns = [column, 'impact_type']
# Take explicit copies: assigning into a bare column slice later on triggers pandas'
# SettingWithCopyWarning and may not update the frame as intended.
train = train[subset_columns].copy()
test = test[subset_columns].copy()

In [None]:
# Label -> class id. The data carries the labels 'Opportunity' and 'Risk'; a key that
# does not occur in the data would map those rows to NaN and make the integer cast fail.
# The ids agree with how predictions are decoded for the final test set
# (0 -> 'Opportunity', otherwise 'Risk').
impact_type_mapping = {
    'Opportunity': 0,
    'Risk': 1,
}

def _encode_impact_type(frame):
    """Return a new frame whose 'impact_type' labels are replaced by integer class ids.

    Raises:
        ValueError: if the column holds a label missing from `impact_type_mapping`.
    """
    encoded = frame['impact_type'].map(impact_type_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'impact_type'].astype(str).unique())
        raise ValueError(f"Unmapped impact_type labels: {unknown}")
    # assign() builds a new frame, so no write goes through a slice of another frame.
    return frame.assign(impact_type=encoded.astype(int))

train = _encode_impact_type(train)
test = _encode_impact_type(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['impact_type'] = train['impact_type'].map(impact_type_mapping).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['impact_type'] = test['impact_type'].map(impact_type_mapping).astype(int)


In [None]:
train["impact_type"].value_counts()

0    370
1    284
Name: impact_type, dtype: int64

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import numpy as np
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy, precision, recall and F1.

    The predicted class is the arg-max over axis 1 of the logits. Precision,
    recall and F1 are computed with "weighted" averaging.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)

    averaging = "weighted"
    averaged_scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    metrics = {'accuracy': accuracy_score(labels, predicted_classes)}
    for metric_name, scorer in averaged_scorers.items():
        metrics[metric_name] = scorer(labels, predicted_classes, average=averaging)
    return metrics

In [None]:
# `Dataset` in this notebook means the Hugging Face `datasets.Dataset`; the torch class
# of the same name is deliberately not imported so the name is never shadowed.
from torch.utils.data import DataLoader
from datasets import Dataset

# Start the classifier from the MLM-adapted checkpoint saved earlier.
model_path = mlmpath
print(model_path,type(model_path))
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The task has exactly two classes (ids 0 and 1), so the classification head gets two outputs.
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Truncate over-long texts to the model's maximum length and pad each split to its longest example.
train_encodings = tokenizer(list(train[column]), truncation=True, padding=True)
test_encodings = tokenizer(list(test[column]), truncation=True, padding=True)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': list(train['impact_type']),
})

# The held-out 20% split serves as the evaluation set during training.
dev_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': list(test['impact_type']),
})

/content/drive/MyDrive/FinNLP/SaveModels/French-PROSUSAI-FINBERT-MLM <class 'str'>


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/FinNLP/SaveModels/French-PROSUSAI-FINBERT-MLM and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Classification fine-tuning run: 10 epochs with batch size 32, evaluating and
# checkpointing after every epoch and reloading the best checkpoint at the end.
training_args = TrainingArguments(
    output_dir="ClassificationModel",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    load_best_model_at_end=True,
)
# compute_metrics adds accuracy / precision / recall / F1 to every evaluation.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()
# Because training_args sets load_best_model_at_end=True, this evaluation scores the
# reloaded best checkpoint rather than the weights from the last epoch.
evaluation_results = trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.629066,0.756098,0.796896,0.756098,0.742339
2,No log,0.548856,0.786585,0.78736,0.786585,0.785516
3,No log,0.567827,0.756098,0.762852,0.756098,0.751913
4,No log,0.698312,0.743902,0.743564,0.743902,0.743287
5,No log,0.819522,0.75,0.753575,0.75,0.746907
6,No log,1.003328,0.731707,0.731707,0.731707,0.731707
7,No log,1.178764,0.72561,0.725172,0.72561,0.725147
8,No log,1.250703,0.719512,0.719179,0.719512,0.718372
9,No log,1.304968,0.719512,0.719179,0.719512,0.718372
10,No log,1.321865,0.72561,0.725217,0.72561,0.724733


In [None]:
evaluation_results

{'eval_loss': 0.5488561987876892,
 'eval_accuracy': 0.7865853658536586,
 'eval_precision': 0.7873602351584156,
 'eval_recall': 0.7865853658536586,
 'eval_f1': 0.7855156787859569,
 'eval_runtime': 1.9317,
 'eval_samples_per_second': 84.898,
 'eval_steps_per_second': 3.106,
 'epoch': 10.0}

In [None]:
from sklearn.metrics import classification_report

# Gold labels of the held-out split, in dataset order.
y_true = [example["labels"] for example in dev_dataset]
predictions = trainer.predict(dev_dataset)
# Predicted class = index of the largest logit for each example.
y_pred = predictions.predictions.argmax(axis=1)
print("done")
report = classification_report(y_true, y_pred)

done


In [None]:
print(report)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81        88
           1       0.80      0.72      0.76        76

    accuracy                           0.79       164
   macro avg       0.79      0.78      0.78       164
weighted avg       0.79      0.79      0.79       164



In [None]:
trainer.save_model("/content/drive/MyDrive/FinNLP/SaveModels/French-PROSUSAI-FINBERT-Classification")

Sanity Check

In [None]:
import pandas as pd
# NOTE(review): this is a relative path, unlike the Drive paths used for the other data
# files — confirm which working directory this cell expects.
dfactual=pd.read_csv("French_Paraphrased.csv")

In [None]:
dfactual["translated"][0]

'13-hour days, conflicts to be managed, wages deemed insufficient ... Public transport employees in Île-de-France fear a deterioration in their working conditions, already difficult, with the opening to competition. This takes place gradually, under the sign of a reduction in costs. The quality of the service, already strongly criticized when it is a key element of decarbonation, is also likely to suffer from it. The deadline for deadlines is increasingly envisaged.'

In [None]:
# Label -> class id used for the sanity check ('Opportunity' -> 0, 'Risk' -> 1).
impact_type_mapping = {
    'Opportunity': 0,
    'Risk': 1,
}
# Overwrites the string labels in place. Re-running this cell would try to map the
# already-encoded integers again, so reload `dfactual` first.
dfactual['impact_type'] = dfactual['impact_type'].map(impact_type_mapping).astype(int)

In [None]:
# Encode the "translated" texts the same way as the training data (truncation + padding).
test_encodings = tokenizer(list(dfactual["translated"]), truncation=True, padding=True)
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': list(dfactual['impact_type']),    # integer class ids from the mapping cell above
})

In [None]:
from sklearn.metrics import classification_report
# NOTE(review): the report below covers 818 rows, which equals the train (654) plus test
# (164) sizes used for fine-tuning. If this file is the same data, the scores include
# rows the model was trained on — confirm before reading them as generalisation.
y_true = [example["labels"] for example in test_dataset]
predictions = trainer.predict(test_dataset)
# Predicted class = index of the largest logit for each example.
y_pred = predictions.predictions.argmax(axis=1)
print("done")
report = classification_report(y_true, y_pred)

done


In [None]:
print(report)

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       458
           1       0.79      0.78      0.78       360

    accuracy                           0.81       818
   macro avg       0.81      0.81      0.81       818
weighted avg       0.81      0.81      0.81       818



Final Testsets

In [None]:
dfactual=pd.read_csv("/content/drive/MyDrive/FinNLP/Data/Testsets/French_Test.csv")

In [None]:
dfactual

Unnamed: 0.1,Unnamed: 0,URL,news_title,news_content,impact_type,ID,translated
0,0,https://www.novethic.fr/actualite/numerique/do...,"Démocratie, Pouvoir, loi Avia…. Cinq questions...","Le lundi 11 janvier, Parler n’était toutefois ...",,0,"On Monday, January 11, speaking was no longer ..."
1,1,https://www.novethic.fr/actualite/social/econo...,"Jean Gadrey : ""le risque climatique, une oppor...",L’une des étapes essentielles pour atteindre c...,,1,One of the essential stages to reach this repr...
2,2,https://www.novethic.fr/actualite/social/diver...,Inde : la banque publique des femmes se développe,La dirigeante affiche sa satisfaction : depuis...,,2,The manager has displayed her satisfaction: si...
3,3,https://www.novethic.fr/actualite/social/diver...,Inde : la banque publique des femmes se développe,"Dans les foyers modestes, l’argent gagné par l...",,3,"In modest homes, the money earned by women is ..."
4,4,https://www.novethic.fr/actualite/social/diver...,Inde : la banque publique des femmes se développe,La Bharatiya Mahila Bank veut croître en misan...,,4,The Bharatiya Mahila Bank wants to grow by bet...
...,...,...,...,...,...,...,...
195,195,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,"Selon le projet européen Officair, sur 167 bur...",,195,"According to the European Officair project, ou..."
196,196,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,Selon une étude de l’Agence nationale de sécur...,,196,According to a study by the National Health Se...
197,197,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,Une vingtaine de composés chimiques sont aujou...,,197,About twenty chemical compounds are today stud...
198,198,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,L’OQAI déroge ainsi à sa méthodologie et accep...,,198,The OQAI thus derogates from its methodology a...


In [None]:
# Encode the final test texts. No 'labels' column is added here: this dataset is only
# used for prediction.
test_encodings = tokenizer(list(dfactual["translated"]), truncation=True, padding=True)
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
})

In [None]:
predictions = trainer.predict(test_dataset)
# Predicted class = index of the largest logit for each example.
y_pred = predictions.predictions.argmax(axis=1)

In [None]:
len(y_pred)

200

In [None]:
# Decode predicted class ids into label strings: 0 -> 'Opportunity', anything else -> 'Risk'.
# Original author's note: "for chinese, do as is".
# NOTE(review): it is unclear how that note applies to this French test set — confirm or remove.
dfactual['impact_type'] = np.where(np.array(y_pred) == 0, 'Opportunity', 'Risk')

In [None]:
dfactual["impact_type"].value_counts()

Opportunity    104
Risk            96
Name: impact_type, dtype: int64

In [None]:
dfactual[dfactual["impact_type"]=="Risk"]

Unnamed: 0,URL,news_title,news_content,impact_type,ID
0,https://www.novethic.fr/actualite/numerique/do...,"Démocratie, Pouvoir, loi Avia…. Cinq questions...","Le lundi 11 janvier, Parler n’était toutefois ...",Risk,0
3,https://www.novethic.fr/actualite/social/diver...,Inde : la banque publique des femmes se développe,"Dans les foyers modestes, l’argent gagné par l...",Risk,3
5,https://www.novethic.fr/actualite/social/diver...,Inde : la banque publique des femmes se développe,Créée en 1997 par l’activiste Chetna Gala Sinh...,Risk,5
13,https://www.novethic.fr/actualite/social/droit...,Covid-19 : En privant les pays pauvres de vacc...,Financiarisation de la santé \nL’Afrique du Su...,Risk,13
14,https://www.novethic.fr/actualite/social/droit...,Covid-19 : En privant les pays pauvres de vacc...,"""Les pays du Nord continuent de protéger leur ...",Risk,14
...,...,...,...,...,...
193,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,Les Français passent en moyenne 90% de leur te...,Risk,193
194,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,Autre différence avec l’air intérieur des loge...,Risk,194
195,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,"Selon le projet européen Officair, sur 167 bur...",Risk,195
196,https://www.novethic.fr/actualite/social/condi...,Qualité de l’air au bureau : des dangers connu...,Selon une étude de l’Agence nationale de sécur...,Risk,196


In [None]:
# Remove the helper columns before export; `columns=` already selects the axis.
dfactual = dfactual.drop(columns=["translated", 'Unnamed: 0'])

In [None]:
dfactual.to_json("/content/drive/MyDrive/FinNLP/SaveModels/LIPI_French_2.json",orient="records")