In [None]:
# Mount Google Drive and switch to the project directory that contains the text files.
from google.colab import drive
drive.mount('/content/drive')
import os
# The relative paths used later in the notebook (CSV inputs, output folders) resolve against this directory.
os.chdir('/content/drive/MyDrive/TFG')

In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece

import pandas as pd
from datasets import load_dataset, concatenate_datasets
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding, EarlyStoppingCallback
import operator
import sklearn.metrics as metrics
import torch
import random
import datetime
import tensorflow as tf
import torch.nn as nn
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import ClassLabel, Value

In [None]:
# Fine-tuning settings per checkpoint, keyed by the Hugging Face model id. Every entry holds:
#   tam_Embeddings : maximum token length handed to the tokenizer
#   tokenizer      : tokenizer id to load (not always the same as the model id)
#   learning_rate, weight_decay, batch_size : values forwarded to TrainingArguments
#   directory      : training output directory; its folder name also prefixes the result files
dicParametersNoMetadata = {
    'finiteautomata/beto-sentiment-analysis': {
        'tam_Embeddings': 512,
        'tokenizer': 'finiteautomata/beto-sentiment-analysis',
        'learning_rate': 2.3e-5,
        'weight_decay': 3.57e-3,
        'batch_size': 8,
        'directory': './BetoTwitter',
    },
    'dccuchile/bert-base-spanish-wwm-cased': {
        'tam_Embeddings': 512,
        'tokenizer': 'dccuchile/bert-base-spanish-wwm-cased',
        'learning_rate': 3.89e-06,
        'weight_decay': 5.23e-5,
        'batch_size': 16,
        'directory': './BetoNormal',
    },
    'PlanTL-GOB-ES/roberta-base-bne': {
        'tam_Embeddings': 512,
        'tokenizer': 'PlanTL-GOB-ES/roberta-base-bne',
        'learning_rate': 3.14e-06,
        'weight_decay': 1.2e-03,
        'batch_size': 16,
        'directory': './RobertaMarIA',
    },
    'cardiffnlp/twitter-xlm-roberta-base-sentiment': {
        'tam_Embeddings': 512,
        'tokenizer': 'cardiffnlp/twitter-xlm-roberta-base-sentiment',
        'learning_rate': 1.11e-05,
        'weight_decay': 3.66e-03,
        'batch_size': 8,
        'directory': './RobertaCardiff',
    },
    'maxpe/bertin-roberta-base-spanish_semeval18_emodetection': {
        'tam_Embeddings': 512,
        'tokenizer': 'bertin-project/bertin-roberta-base-spanish',
        'learning_rate': 6.26e-06,
        'weight_decay': 1e-04,
        'batch_size': 8,
        'directory': './BertinTwitter',
    },
    'bertin-project/bertin-roberta-base-spanish': {
        'tam_Embeddings': 512,
        'tokenizer': 'bertin-project/bertin-roberta-base-spanish',
        'learning_rate': 7.82e-07,
        'weight_decay': 8.7e-04,
        'batch_size': 8,
        'directory': './BertinNormal',
    },
    'pysentimiento/robertuito-base-cased': {
        'tam_Embeddings': 128,
        'tokenizer': 'pysentimiento/robertuito-base-cased',
        'learning_rate': 2.72e-05,
        'weight_decay': 1.27e-03,
        'batch_size': 8,
        'directory': './RobertaTwitter',
    },
}

In [None]:
# Width of the vector fed to the classifier layers, and the number of output classes.
# NOTE(review): 768 presumably matches the encoder hidden size and 17 the combined width of the
# auxiliary feature vectors concatenated in the model's forward pass - confirm against the data.
input_dim, output_dim = 768 + 17, 4

class PosModelNoCapaOculta(nn.Module):
    """Pretrained encoder whose sentence vector is concatenated with auxiliary features and classified.

    The sentence vector is joined with ``influencerAux``, ``genderAux`` and ``mediaAux`` along the
    last dimension, so the joined width must equal the module-level ``input_dim``.

    Args:
        num_labels: Number of output classes; also the width of the final linear layer.
        modelo: Model id or path passed to ``AutoModel.from_pretrained``.
    """

    # Checkpoints for which the encoder's second output (``outputs[1]``, the pooled output of
    # BERT-style models) is used directly; every other checkpoint goes through the extra
    # dropout -> dense -> tanh stage applied to the first token's hidden state.
    _POOLED_OUTPUT_MODELS = (
        'finiteautomata/beto-sentiment-analysis',
        'dccuchile/bert-base-spanish-wwm-cased',
    )

    def __init__(self,num_labels=4, modelo=""):
        super().__init__()

        self.num_labels = num_labels
        self.modelo = modelo
        self.base_model = AutoModel.from_pretrained(modelo, num_labels=self.num_labels)

        # Prefer the classifier-specific dropout; fall back to the hidden dropout when the config
        # leaves it unset or does not define the field at all.
        config = self.base_model.config
        drop_prob = getattr(config, 'classifier_dropout', None)
        if drop_prob is None:
            drop_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(drop_prob)

        # The layers are created in the original order (linear, then dense) so that seeded
        # initialisation is unchanged. The head honours `num_labels` instead of a fixed width.
        self.linear = nn.Linear(input_dim, self.num_labels)
        self.dense = nn.Linear(input_dim, input_dim)

    def forward(self, input_ids=None, attention_mask=None, influencerAux=None, genderAux=None, mediaAux=None, labels=None):
        """Encode the text, append the auxiliary features and classify.

        Args:
            input_ids, attention_mask: Tokenizer outputs forwarded to the encoder.
            influencerAux, genderAux, mediaAux: Auxiliary feature tensors concatenated to the
                sentence vector along the last dimension.
            labels: Optional class indices; when given, a cross-entropy loss is computed.

        Returns:
            SequenceClassifierOutput with ``logits`` and ``loss`` (``None`` when no labels are given).
        """
        encoded = self.base_model(input_ids, attention_mask=attention_mask)

        if self.modelo in self._POOLED_OUTPUT_MODELS:
            features = torch.cat((encoded[1], influencerAux, genderAux, mediaAux), dim=-1)
        else:
            # Hidden state of the first token, followed by dropout -> dense -> tanh.
            features = torch.cat((encoded[0][:, 0, :], influencerAux, genderAux, mediaAux), dim=-1)
            features = self.dropout(features)
            features = self.dense(features)
            features = torch.tanh(features)

        features = self.dropout(features)
        logits = self.linear(features)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)

class MyTrainer(Trainer):
    """Trainer that adds the current learning rate to every log record."""

    def log(self, logs, *args, **kwargs) -> None:
        """Insert ``learning_rate`` into ``logs`` and delegate to ``Trainer.log``.

        Extra positional and keyword arguments are forwarded untouched. Newer ``transformers``
        releases call ``log`` with an additional ``start_time`` argument, which an override
        accepting only ``logs`` would reject with a ``TypeError``.
        """
        logs["learning_rate"] = self._get_learning_rate()
        super().log(logs, *args, **kwargs)

class Pruebas():
    """Fine-tunes one checkpoint on the CSV splits and writes its test predictions and metrics.

    Args:
        dicObject: Settings for the checkpoint (keys ``tam_Embeddings``, ``tokenizer``,
            ``learning_rate``, ``weight_decay``, ``batch_size`` and ``directory``).
    """

    def __init__(self,dicObject):
        self.dicObject = dicObject

    @staticmethod
    def _scores(y_true, y_pred):
        """Return macro, micro and weighted precision/recall/F1 under the keys used by both reports."""
        scores = {}
        for prefix, average in (('ma', 'macro'), ('mi', 'micro'), ('avg', 'weighted')):
            scores[prefix + 'f'] = metrics.f1_score(y_true=y_true, y_pred=y_pred, average=average)
            scores[prefix + 'p'] = metrics.precision_score(y_true=y_true, y_pred=y_pred, average=average)
            scores[prefix + 'r'] = metrics.recall_score(y_true=y_true, y_pred=y_pred, average=average)
        return scores

    @staticmethod
    def _predicted_labels(logits):
        """Return the index of the highest score in each row (first index on ties).

        Softmax is monotonic, so taking the argmax of the raw logits selects the same class
        without the TensorFlow round-trip and the per-row Python loop.
        """
        return np.argmax(np.asarray(logits), axis=-1)

    def predictAndTrain(self,model,dicObject):
        """Train ``model`` with the settings in ``dicObject`` and write the test predictions and metrics.

        Args:
            model: Model id handed to ``PosModelNoCapaOculta``.
            dicObject: Settings for this checkpoint (see the class docstring).

        Side effects:
            Writes ``<name>tags.csv`` and ``<name> results.csv`` to the working directory, where
            ``<name>`` is the folder name in ``dicObject['directory']``; the Trainer writes its
            checkpoints to that directory.
        """
        model = PosModelNoCapaOculta(4, model)

        dataset = load_dataset('csv', data_files={'train': 'TrainPreprocessCls.csv', 'test':'TestPreprocessCls.csv','validation': 'ValidationPreprocessCls.csv'})

        # Load the tokenizer, then tokenize every split in batches.
        self.tokenizer = AutoTokenizer.from_pretrained(dicObject['tokenizer'], model_max_length=dicObject['tam_Embeddings'])
        tokenized_OffendES = dataset.map(self.preprocess_text, batched=True)
        # The validation split is folded into the training data.
        tokenized_OffendES['train'] = concatenate_datasets([tokenized_OffendES['train'], tokenized_OffendES['validation']])

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir=dicObject['directory'],
            learning_rate=dicObject['learning_rate'],
            weight_decay=dicObject['weight_decay'],
            num_train_epochs=5,
            per_device_train_batch_size=dicObject['batch_size'],
            per_device_eval_batch_size=dicObject['batch_size'],
            load_best_model_at_end=True,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            metric_for_best_model="eval_maf",
            disable_tqdm=True,
            # `seed` is the module-level variable assigned by the notebook's main loop.
            seed=seed
        )
        # NOTE(review): eval_dataset is the test split, so early stopping and best-checkpoint
        # selection are driven by test-set metrics.
        trainer = MyTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_OffendES["train"],
            eval_dataset=tokenized_OffendES["test"],
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience = 3)]
        )

        trainer.train()
        test_output = trainer.predict(tokenized_OffendES['test'])
        tags = self._predicted_labels(test_output.predictions)

        run_name = dicObject['directory'].split('/')[1]
        pd.DataFrame({'label': tags}).to_csv(str(run_name+'tags.csv'), sep='\t', index=False)
        self.results(tags, tokenized_OffendES['test']["label"], str(run_name))

    def results(self,pred_labels,true_labels, nameModel):
        """Compute the test metrics and write them as one tab-separated row to ``<nameModel> results.csv``."""
        scores = self._scores(true_labels, pred_labels)
        pd.DataFrame(scores, index=[0]).to_csv(str(nameModel+' '+'results.csv'), sep='\t', index=False)

    def preprocess_text(self, texts):
        """Tokenize a batch of comments and attach the parsed auxiliary feature columns.

        Args:
            texts: Batch with the columns ``comment``, ``influencerAux``, ``genderAux`` and
                ``mediaAux``; the three auxiliary columns hold Python literals stored as strings.

        Returns:
            The tokenizer output extended with the three parsed auxiliary columns.
        """
        tokens = self.tokenizer(texts["comment"],truncation=True, padding='max_length',max_length=self.dicObject['tam_Embeddings'])
        # SECURITY NOTE(review): eval() executes whatever the CSV cell contains. If the cells are
        # plain literals, ast.literal_eval would be a safe replacement - confirm the stored format.
        for column in ("influencerAux", "genderAux", "mediaAux"):
            tokens[column] = [eval(cell) for cell in texts[column]]
        return tokens

    def compute_metrics(self,p):
        """Turn the Trainer's ``(predictions, labels)`` pair into the metric dictionary."""
        pred, labels = p
        return self._scores(labels, np.argmax(pred, axis=1))

    def entrenarModelo(self,model, dicParametersNoMetadata):
        """Entry point used by the notebook: train and evaluate ``model`` with the given settings."""
        self.predictAndTrain(model, dicParametersNoMetadata)

if __name__=="__main__":
  for model_name, model_params in dicParametersNoMetadata.items():
    # Re-seed every random number generator before each run so that all checkpoints start
    # from the same random state. `seed` has to stay a module-level name because
    # Pruebas.predictAndTrain reads it when building its TrainingArguments.
    seed = 10
    random.seed(seed)                          # python
    torch.manual_seed(seed)                    # pytorch
    torch.backends.cudnn.deterministic = True
    if torch.cuda.is_available():
      torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)                       # numpy
    Pruebas(model_params).entrenarModelo(model_name, model_params)