<a href="https://colab.research.google.com/github/sddavicillo/TRANSCEFR/blob/main/ELECTRA_small_EarlyStopping_TClass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Análisis, preprocesamiento y preparación de los datos

In [None]:
# Instalar dependencias (solo necesario en la primera ejecución)
!pip install -U -q PyDrive

# Importar bibliotecas necesarias
import os
import pandas as pd
import re

from google.colab import drive
drive.mount('/content/drive')

# Ruta a tu carpeta con los archivos txt
folder_path = '/content/drive/My Drive/raw_data_CEFR-Predict/ICNALE/SM_0_Unclassified_Unmerged'  # Cambiar por tu ruta

# Lista para almacenar los datos
data = []

# Expresión regular para extraer las etiquetas
pattern = re.compile(r'(A2|B1_1|B1_2|B2)')

# Procesar cada archivo en la carpeta
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        # Construir ruta completa
        file_path = os.path.join(folder_path, filename)

        # Leer contenido del archivo
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            content = file.read()

        # Extraer etiqueta del nombre de archivo
        match = pattern.search(filename)
        if match:
            label = match.group()
        else:
            label = 'Desconocido'  # Para manejar posibles errores

        # Añadir a la lista de datos
        data.append({
            'content': content,
            'filename': filename,
            'label': label
        })

# Crear DataFrame
df = pd.DataFrame(data)

# Mostrar las primeras filas del dataframe
print(df.head())

Mounted at /content/drive
                                             content  \
0  \n\nI think it is not imp – not always importa...   
1  \n\nI agree with it.  In my opinion part-time ...   
2  \n\nI agree the opinion because I in my – in m...   
3  \n\nI can see the benefit of part-time job, bu...   
4  \n\nI disagree with that statement because doi...   

                   filename label  
0  SM_JPN_PTJ1_102_B1_2.txt  B1_2  
1  SM_JPN_PTJ1_103_B1_2.txt  B1_2  
2  SM_JPN_PTJ2_109_B1_2.txt  B1_2  
3  SM_JPN_PTJ1_104_B1_2.txt  B1_2  
4  SM_JPN_PTJ2_104_B1_2.txt  B1_2  


In [None]:
# Optional: save the DataFrame as CSV so it can be checked manually
df.to_csv('dataset.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the partition is reproducible
RANDOM_STATE = 42

# First stratified cut: 80% train, 20% held out
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=RANDOM_STATE, stratify=df['label']
)

# Second stratified cut: the held-out part is halved into dev and test
# (10% of the full dataset each)
dev_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=RANDOM_STATE, stratify=temp_df['label']
)

# Give every split a fresh 0..n-1 index
train_df, dev_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, dev_df, test_df)
)

# Helper that reports the label distribution of a split
def print_distribution(df, name):
    """Print the percentage share of every label found in ``df``.

    Args:
        df: DataFrame with a ``'label'`` column.
        name: Human-readable name of the split, used in the header line.

    The output is a blank line, a header, and one ``- label: xx.x%`` line per
    label, ordered by label.
    """
    shares = df['label'].value_counts(normalize=True).sort_index().mul(100)
    lines = [f"\nDistribución en {name}:"]
    lines.extend(f"- {label}: {percentage:.1f}%" for label, percentage in shares.items())
    print("\n".join(lines))

# Check split sizes and label distributions
size_report = "\n".join([
    "\nTamaños absolutos:",
    f"Entrenamiento: {len(train_df)} ejemplos",
    f"Desarrollo: {len(dev_df)} ejemplos",
    f"Prueba: {len(test_df)} ejemplos",
])
print(size_report)

# Same report for the full dataset and for each split, in that order
for frame, title in (
    (df, 'dataset completo'),
    (train_df, 'entrenamiento'),
    (dev_df, 'desarrollo'),
    (test_df, 'prueba'),
):
    print_distribution(frame, title)


Tamaños absolutos:
Entrenamiento: 3040 ejemplos
Desarrollo: 380 ejemplos
Prueba: 380 ejemplos

Distribución en dataset completo:
- A2: 10.5%
- B1_1: 23.3%
- B1_2: 49.4%
- B2: 16.8%

Distribución en entrenamiento:
- A2: 10.5%
- B1_1: 23.3%
- B1_2: 49.4%
- B2: 16.8%

Distribución en desarrollo:
- A2: 10.5%
- B1_1: 23.2%
- B1_2: 49.5%
- B2: 16.8%

Distribución en prueba:
- A2: 10.5%
- B1_1: 23.4%
- B1_2: 49.2%
- B2: 16.8%


In [None]:
!pip install -U accelerate
!pip install --upgrade transformers
!pip install datasets

Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the three pandas splits into a single DatasetDict
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (
        ("train", train_df),
        ("valid", dev_df),
        ("test", test_df),
    )
})
dataset

DatasetDict({
    train: Dataset({
        features: ['content', 'filename', 'label'],
        num_rows: 3040
    })
    valid: Dataset({
        features: ['content', 'filename', 'label'],
        num_rows: 380
    })
    test: Dataset({
        features: ['content', 'filename', 'label'],
        num_rows: 380
    })
})

In [None]:
# Spot-check one training record (index 122 is an arbitrary pick) to see its fields
dataset['train'][122]

{'content': "If people have [***] it is – it is good – it is good preparation.  So if people have – but in case people or student can't study – study – study student – student don't do [***]...",
 'filename': 'SM_JPN_PTJ2_057_A2_0.txt',
 'label': 'A2'}

In [None]:
from datasets import ClassLabel

# Convert the string labels into integer class ids. class_encode_column() only
# accepts plain value columns, so the guard keeps this cell safe to re-run once
# `label` has already been converted.
if not isinstance(dataset['train'].features['label'], ClassLabel):
    dataset = dataset.class_encode_column("label")
labels = dataset['train'].features['label']

# All splits must share the same id -> name mapping; otherwise the ids used for
# training and evaluation would not refer to the same classes.
split_label_names = {
    split_name: split.features['label'].names for split_name, split in dataset.items()
}
assert all(names == labels.names for names in split_label_names.values()), split_label_names

print(labels.names)

Casting to class labels:   0%|          | 0/3040 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/380 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/380 [00:00<?, ? examples/s]

['A2', 'B1_1', 'B1_2', 'B2']


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from transformers import set_seed

transformer_model = 'google/electra-small-discriminator'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The classification head is not part of the checkpoint and is initialised
# randomly, so seed the RNGs before building the model to make runs repeatable.
set_seed(RANDOM_STATE)

# Pre-trained encoder plus a fresh classification head with one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    transformer_model, num_labels=len(labels.names)
).to(device)
tokenizer = AutoTokenizer.from_pretrained(transformer_model)

def tokenize(batch):
    """Tokenize the ``content`` field of a batch of examples.

    Args:
        batch: Mapping with a ``"content"`` entry holding a list of texts
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for those texts.

    Truncation is enabled so that no sequence exceeds the tokenizer's maximum
    length (``model_max_length``); longer inputs would not fit the model's
    position embeddings. Texts within the limit are unaffected.
    """
    return tokenizer(batch["content"], truncation=True)

# Tokenize every split in batches; the tokenizer outputs are added as new columns
ds_enc = dataset.map(tokenize, batched=True)

# Display the encoded splits and their columns
ds_enc

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3040 [00:00<?, ? examples/s]

Map:   0%|          | 0/380 [00:00<?, ? examples/s]

Map:   0%|          | 0/380 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content', 'filename', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3040
    })
    valid: Dataset({
        features: ['content', 'filename', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 380
    })
    test: Dataset({
        features: ['content', 'filename', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 380
    })
})

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch using `tokenizer`
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Display its configuration
data_collator

DataCollatorWithPadding(tokenizer=ElectraTokenizerFast(name_or_path='google/electra-small-discriminator', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors=

In [None]:
# Look at the first encoded training example (original fields plus tokenizer outputs)
ds_enc['train'][0]

{'content': "I do not think that smoking should be completely banned to all the country because I disagree for various reasons.  First, people say that there isn't something good about smoking and it has only bad things like health problems or other problems but I think it's good for their psychological state.  I think they give smoking give relax to smokers and it – it gives them various mentally sense and I think it's good for them to feel free in the cho – choices of smoking problem.  Well, second it's bad for other people around them smokers because second smoking is – second hand smoking is very bad and it gives them [***] structure about other people.",
 'filename': 'SM_KOR_SMK1_077_B1_1.txt',
 'label': 1,
 'input_ids': [101,
  1045,
  2079,
  2025,
  2228,
  2008,
  9422,
  2323,
  2022,
  3294,
  7917,
  2000,
  2035,
  1996,
  2406,
  2138,
  1045,
  21090,
  2005,
  2536,
  4436,
  1012,
  2034,
  1010,
  2111,
  2360,
  2008,
  2045,
  3475,
  1005,
  1056,
  2242,
  2204,
 

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Compute weighted precision, recall and F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
        Classes that are never predicted contribute 0 instead of triggering
        a warning (``zero_division=0``).
    """
    y_true, y_hat = pred.label_ids, pred.predictions.argmax(-1)
    prec, rec, fscore, _ = precision_recall_fscore_support(
        y_true, y_hat, average='weighted', zero_division=0
    )
    return dict(precision=prec, recall=rec, f1=fscore)

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

batch_size = 16

# Name the output directory after the checkpoint that is actually fine-tuned
# (the previous hard-coded name referred to deberta-v3-base). "/" is replaced
# so the hub id maps to a single folder instead of a nested one.
model_name = f"CEFR-TextClassfinetuned_{transformer_model.replace('/', '_')}"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=25,            # upper bound; early stopping usually ends the run sooner
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",     # key returned by compute_metrics
    greater_is_better=True,         # higher F1 is better (stated explicitly)
    load_best_model_at_end=True,
    save_total_limit=1,
    seed=RANDOM_STATE,              # same seed as the data split
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["valid"],
    # `tokenizer=` is deprecated for Trainer (it emits a FutureWarning);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(
        early_stopping_patience=3,  # Stop after 3 evaluations without improvement
        early_stopping_threshold=0.0  # Any improvement is considered
    )]
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.266269,0.244765,0.494737,0.327502
2,No log,1.203553,0.244765,0.494737,0.327502
3,1.262600,1.170396,0.348076,0.481579,0.404042
4,1.262600,1.177056,0.33463,0.486842,0.389299
5,1.262600,1.187447,0.354831,0.505263,0.401967
6,1.111800,1.152232,0.364265,0.492105,0.418232
7,1.111800,1.159126,0.465785,0.518421,0.478156
8,0.983400,1.178667,0.46554,0.521053,0.477006
9,0.983400,1.156095,0.469628,0.513158,0.468459
10,0.983400,1.172299,0.475359,0.523684,0.487557


TrainOutput(global_step=2660, training_loss=0.9731945812254024, metrics={'train_runtime': 140.877, 'train_samples_per_second': 539.478, 'train_steps_per_second': 33.717, 'total_flos': 502678090402560.0, 'train_loss': 0.9731945812254024, 'epoch': 14.0})

In [None]:
# Run the trainer's model on the held-out test split and print the aggregate metrics
preds_output = trainer.predict(ds_enc['test'])
print(preds_output.metrics)


{'test_loss': 1.178651213645935, 'test_precision': 0.4753762686588162, 'test_recall': 0.5447368421052632, 'test_f1': 0.5004935774204511, 'test_runtime': 0.4217, 'test_samples_per_second': 901.132, 'test_steps_per_second': 56.914}


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Gold class ids of the test split and the arg-max class of each prediction
y_true = np.array(ds_enc['test']["label"])
y_pred = preds_output.predictions.argmax(-1)

# Per-class precision / recall / F1.
# - `labels` ties every class id to its entry in `target_names`, so the report
#   stays correct even if some class is absent from both y_true and y_pred.
# - `zero_division=0` matches compute_metrics: a class that is never predicted
#   scores 0 without emitting undefined-metric warnings.
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(labels.names))),
    target_names=labels.names,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

          A2       0.00      0.00      0.00        40
        B1_1       0.46      0.45      0.45        89
        B1_2       0.59      0.79      0.67       187
          B2       0.47      0.31      0.37        64

    accuracy                           0.54       380
   macro avg       0.38      0.39      0.38       380
weighted avg       0.48      0.54      0.50       380



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
