<a href="https://colab.research.google.com/github/samehaisaa/ML-Journey/blob/main/Gods_Hackathon_43.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader


In [None]:
# Mount Google Drive so the notebook can read the hackathon CSV files.
from google.colab import drive
drive.mount('/content/drive')

# Work from the Drive folder that contains train.csv and test.csv.
%cd /content/drive/MyDrive/hackathon


Mounted at /content/drive
/content/drive/MyDrive/hackathon


In [None]:
# Sanity check: list the working directory and show its path.
%ls
%pwd

test.csv  train.csv


'/content/drive/MyDrive/hackathon'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import os
from sklearn.metrics import accuracy_score
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from textattack.augmentation import EasyDataAugmenter


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1 to obtain the predicted class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the accuracy of the
        predicted class ids against ``labels``.
    """
    scores, true_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(true_ids, predicted_ids)
    return {"accuracy": accuracy}

# Input locations on the mounted Drive.
train_path = '/content/drive/MyDrive/hackathon/train.csv'
test_path = '/content/drive/MyDrive/hackathon/test.csv'

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


def _title_plus_content(df):
    """Return ``title + ' ' + content`` as strings, treating missing values as ''."""
    combined = df['title'].fillna('') + ' ' + df['content'].fillna('')
    return combined.fillna('').astype(str)


# Preprocessing: one text field per row, built identically for train and test.
train_df['text'] = _title_plus_content(train_df)
test_df['text'] = _title_plus_content(test_df)

# Encode labels: ids follow the order in which each target first appears in train.csv.
labels = train_df['target'].unique().tolist()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
train_df['label'] = train_df['target'].map(label2id)

# Tokenizer. Truncation, padding and max_length are encode-time options and are
# passed where the texts are actually tokenized (MentalHealthDataset), so they are
# not repeated here, where they previously contradicted the encode-time max_length.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class MentalHealthDataset(Dataset):
    """Torch dataset of tokenized texts with optional integer labels.

    All texts are tokenized once in the constructor and the resulting tensors
    are cached on disk. The cache file name is derived from the tokenizer, the
    ``max_length`` and the texts themselves, so a cache written for different
    inputs (another model's tokenizer, augmented data, a different split) is
    never reused by mistake.

    Args:
        texts: ``pd.Series`` or sequence of strings to encode.
        labels: Optional ``pd.Series`` or sequence of class ids aligned with
            ``texts``; omit for unlabeled (test) data.
        cache_dir: Directory holding the cached encodings (created if missing).
        tokenizer: Object exposing ``batch_encode_plus``. When omitted, the
            notebook-level ``tokenizer`` variable is used.
        max_length: Truncation length passed to the tokenizer.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, cache_dir='./cache', tokenizer=None, max_length=512):
        self.texts = texts.tolist() if isinstance(texts, pd.Series) else texts
        self.labels = labels.tolist() if isinstance(labels, pd.Series) else labels
        self.cache_dir = cache_dir

        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} texts and {len(self.labels)} labels)"
            )

        if tokenizer is None:
            # Fall back to the module-level `tokenizer` defined elsewhere in the notebook.
            tokenizer = globals()['tokenizer']

        os.makedirs(self.cache_dir, exist_ok=True)
        cache_key = self._cache_key(self.texts, tokenizer, max_length)
        cache_path = os.path.join(self.cache_dir, f'encodings-{cache_key}.pt')

        if os.path.exists(cache_path):
            print("Loading cached encodings...")
            # torch.load unpickles the file: only point cache_dir at caches this notebook wrote.
            self.encodings = torch.load(cache_path)
        else:
            print("Tokenizing texts...")
            encoded = tokenizer.batch_encode_plus(
                self.texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            # Keep (and cache) a plain dict of tensors rather than the tokenizer's own container.
            self.encodings = {key: val for key, val in encoded.items()}
            torch.save(self.encodings, cache_path)
            print("Encodings cached.")

    @staticmethod
    def _cache_key(texts, tokenizer, max_length):
        """Return a short hex digest identifying (tokenizer, max_length, texts)."""
        import hashlib

        digest = hashlib.sha256()
        tokenizer_name = getattr(tokenizer, 'name_or_path', type(tokenizer).__name__)
        digest.update(f'{tokenizer_name}|{max_length}|{len(texts)}'.encode('utf-8'))
        for text in texts:
            raw = str(text).encode('utf-8', 'surrogatepass')
            # Length prefix keeps ['ab', 'c'] and ['a', 'bc'] from hashing alike.
            digest.update(len(raw).to_bytes(8, 'little'))
            digest.update(raw)
        return digest.hexdigest()[:16]

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx``, plus ``labels`` when labels were given."""
        item = {key: val[idx] for key, val in self.encodings.items()}  # Keep tensors on CPU
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])  # Keep labels on CPU
        return item

    def __len__(self):
        """Return the number of encoded rows."""
        return len(self.encodings['input_ids'])

# Hold out 20% of the rows for validation, stratified on the encoded label.
all_texts, all_label_ids = train_df['text'], train_df['label']
X_train, X_val, y_train, y_val = train_test_split(
    all_texts,
    all_label_ids,
    test_size=0.2,
    stratify=all_label_ids,
    random_state=42,
)

# Tokenize each split, with a separate cache directory per split.
train_dataset = MentalHealthDataset(X_train, y_train, cache_dir='./cache/train')
val_dataset = MentalHealthDataset(X_val, y_val, cache_dir='./cache/val')

# "Balanced" per-class weights from the training labels, as a float tensor on `device`.
balanced_weights = compute_class_weight(
    'balanced', classes=np.unique(y_train), y=np.asarray(y_train)
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the class-weighted cross-entropy.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.get("labels")
        outputs = model(**inputs)
        num_labels = self.model.config.num_labels
        flat_logits = outputs.get("logits").view(-1, num_labels)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = criterion(flat_logits, targets.view(-1))
        if return_outputs:
            return loss, outputs
        return loss

# Model setup: pretrained BERT with a classification head sized to the label set.
label_config = dict(num_labels=len(labels), id2label=id2label, label2id=label2id)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', **label_config)
model = model.to(device)

# Evaluate and checkpoint once per epoch; at the end, reload the checkpoint
# with the best validation accuracy.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # supplies the accuracy used for best-model selection
)

# Train
trainer.train()

# Encode the test texts and take the highest-scoring class for each row.
test_dataset = MentalHealthDataset(test_df['text'], cache_dir='./cache/test')
predictions = trainer.predict(test_dataset)
test_scores = predictions.predictions
preds = np.argmax(test_scores, axis=-1)

# Map class ids back to label names and write the submission file (columns: ID, Target).
test_df['Target'] = list(map(id2label.__getitem__, preds))
submission = test_df.loc[:, ['id', 'Target']].rename(columns={'id': 'ID'})
submission.to_csv('submission.csv', index=False)

Loading cached encodings...


  self.encodings = torch.load(cache_path)


Loading cached encodings...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8669,0.785084,0.733469
2,0.6707,0.799864,0.743624


In [None]:
# Show the current working directory (bare `pwd` relies on IPython automagic).
pwd

'/content'

In [None]:
!pip install textattack

Collecting textattack
  Using cached textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting bert-score>=0.3.5 (from textattack)
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting flair (from textattack)
  Using cached flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=2.4.0 (from textattack)
  Using cached datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch!=1.8,>=1.7.0->textattack)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting boto3>=1.20.27 (from flair->textattack)
  Using cached boto3-1.36.21-py3-none-any.whl.metadata (6.7 kB)
Collecting mpld3>=0.3 (from flair->textattack)
  Using cached mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pytorch-revgrad>=0.2.0 (from flair->textattack)
  Using cached pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting transformer-smaller-training-vocab>=0.2.3 (from

In [None]:
import pandas as pd
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
from textattack.augmentation import EasyDataAugmenter

# File paths
train_path = '/content/drive/MyDrive/hackathon/train.csv'
test_path = '/content/drive/MyDrive/hackathon/test.csv'

# Device detection: use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Concatenate title + content into one string column, treating missing values as ''
for frame in (train_df, test_df):
    joined = frame['title'].fillna('') + ' ' + frame['content'].fillna('')
    frame['text'] = joined.fillna('').astype(str)

# Label mapping: ids follow the order in which each target first appears
labels = train_df['target'].unique().tolist()
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))
train_df['label'] = train_df['target'].map(label2id)


textattack: Updating TextAttack package dependencies.
textattack: Downloading NLTK required packages.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
eda = EasyDataAugmenter()


def _augment_once(text):
    """Return the first EDA variant of ``text``.

    Falls back to ``text`` unchanged when it is blank or the augmenter
    returns no variants, instead of failing on ``[0]``.
    """
    if not text.strip():
        return text
    variants = eda.augment(text)
    return variants[0] if variants else text


# Only original rows are augmented, so re-running this cell does not augment
# already-augmented text or double the frame a second time.
if 'is_augmented' in train_df.columns:
    original_rows = train_df.loc[~train_df['is_augmented'].astype(bool)]
else:
    original_rows = train_df

aug_text = original_rows['text'].map(_augment_once)
original_rows = original_rows.assign(aug_text=aug_text, is_augmented=False)
# Augmented rows keep every column of their source row (including target/label)
# and only replace the text.
augmented_rows = original_rows.assign(text=aug_text, is_augmented=True)
train_df = pd.concat([original_rows, augmented_rows], ignore_index=True)

# NOTE(review): the train/validation split later in the notebook runs on this
# combined frame, so the augmented copy of a validation row can end up in the
# training split. Consider augmenting only the training split.


In [None]:
# Truncation, padding and max_length are encode-time options; they are passed where
# the texts are actually tokenized (MentalHealthDataset), so they are not repeated here.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

class MentalHealthDataset(Dataset):
    """Torch dataset of tokenized texts with optional integer labels.

    All texts are tokenized once in the constructor and the resulting tensors
    are cached on disk. The cache file name is derived from the tokenizer, the
    ``max_length`` and the texts themselves, so encodings cached for another
    tokenizer or for a different set of rows (e.g. before augmentation) are
    never loaded in place of the current ones.

    Args:
        texts: ``pd.Series`` or sequence of strings to encode.
        labels: Optional ``pd.Series`` or sequence of class ids aligned with
            ``texts``; omit for unlabeled (test) data.
        cache_dir: Directory holding the cached encodings (created if missing).
        tokenizer: Object exposing ``batch_encode_plus``. When omitted, the
            notebook-level ``tokenizer`` variable is used.
        max_length: Truncation length passed to the tokenizer.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels=None, cache_dir='./cache', tokenizer=None, max_length=512):
        self.texts = texts.tolist() if isinstance(texts, pd.Series) else texts
        self.labels = labels.tolist() if isinstance(labels, pd.Series) else labels
        self.cache_dir = cache_dir

        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} texts and {len(self.labels)} labels)"
            )

        if tokenizer is None:
            # Fall back to the module-level `tokenizer` defined elsewhere in the notebook.
            tokenizer = globals()['tokenizer']

        os.makedirs(self.cache_dir, exist_ok=True)
        cache_key = self._cache_key(self.texts, tokenizer, max_length)
        cache_path = os.path.join(self.cache_dir, f'encodings-{cache_key}.pt')

        if os.path.exists(cache_path):
            print("Loading cached encodings...")
            # torch.load unpickles the file: only point cache_dir at caches this notebook wrote.
            self.encodings = torch.load(cache_path)
        else:
            print("Tokenizing texts...")
            encoded = tokenizer.batch_encode_plus(
                self.texts,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors='pt'
            )
            # Keep (and cache) a plain dict of tensors rather than the tokenizer's own container.
            self.encodings = {key: val for key, val in encoded.items()}
            torch.save(self.encodings, cache_path)
            print("Encodings cached.")

    @staticmethod
    def _cache_key(texts, tokenizer, max_length):
        """Return a short hex digest identifying (tokenizer, max_length, texts)."""
        import hashlib

        digest = hashlib.sha256()
        tokenizer_name = getattr(tokenizer, 'name_or_path', type(tokenizer).__name__)
        digest.update(f'{tokenizer_name}|{max_length}|{len(texts)}'.encode('utf-8'))
        for text in texts:
            raw = str(text).encode('utf-8', 'surrogatepass')
            # Length prefix keeps ['ab', 'c'] and ['a', 'bc'] from hashing alike.
            digest.update(len(raw).to_bytes(8, 'little'))
            digest.update(raw)
        return digest.hexdigest()[:16]

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx``, plus ``labels`` when labels were given."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of encoded rows."""
        return len(self.encodings['input_ids'])


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Stratified 80/20 train/validation split on the encoded label.
# NOTE(review): if the augmentation cell has already run, train_df contains the
# augmented copies as well, so a validation row's augmented twin can land in the
# training split. Confirm this is intended, or split before augmenting.
all_texts, all_label_ids = train_df['text'], train_df['label']
X_train, X_val, y_train, y_val = train_test_split(
    all_texts,
    all_label_ids,
    test_size=0.2,
    stratify=all_label_ids,
    random_state=42,
)

# Tokenize each split, with a separate cache directory per split.
train_dataset = MentalHealthDataset(X_train, y_train, cache_dir='./cache/train')
val_dataset = MentalHealthDataset(X_val, y_val, cache_dir='./cache/val')


Loading cached encodings...


  self.encodings = torch.load(cache_path)


Loading cached encodings...


In [None]:
import torch.nn as nn

# Compute "balanced" class weights from the training labels, as a float tensor on `device`
balanced_weights = compute_class_weight(
    'balanced', classes=np.unique(y_train), y=np.asarray(y_train)
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)

# Focal loss implementation
class FocalLoss(nn.Module):
    """Class-weighted multi-class focal loss.

    For each sample the loss is ``alpha * (1 - p_t) ** gamma * w_y * CE``,
    where ``p_t`` is the softmax probability of the true class, ``CE`` is the
    unweighted cross-entropy and ``w_y`` is the weight of the true class. The
    per-sample losses are averaged over the batch.

    The modulating factor is computed per sample from the unweighted
    cross-entropy. Computing it from a batch-mean (and class-weighted) loss
    would scale every sample by the same factor and lose the focusing effect.

    Args:
        alpha: Global scale applied to the loss.
        gamma: Focusing exponent; ``gamma=0`` gives plain class-weighted
            cross-entropy averaged over the batch.
        weight: Optional 1-D tensor of per-class weights. When omitted, the
            notebook-level ``class_weights`` tensor is used.
    """

    def __init__(self, alpha=1, gamma=2, weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        # Fall back to the notebook-level class weights when none are given.
        self.weight = class_weights if weight is None else weight

    def forward(self, logits, labels):
        """Return the mean focal loss.

        Args:
            logits: Tensor of shape ``(N, num_classes)``.
            labels: Tensor of ``N`` class ids.
        """
        weight = self.weight.to(device=logits.device, dtype=logits.dtype)
        # reduction='none' keeps one loss per sample; unweighted CE equals -log(p_t).
        plain_ce = nn.functional.cross_entropy(logits, labels, reduction='none')
        pt = torch.exp(-plain_ce)
        modulating = self.alpha * (1 - pt) ** self.gamma
        weighted_ce = nn.functional.cross_entropy(
            logits, labels, weight=weight, reduction='none'
        )
        return (modulating * weighted_ce).mean()


In [None]:
# RoBERTa classifier with a head sized to the label set; dropout is set to 0.3
# for both hidden states and attention probabilities.
roberta_config = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', **roberta_config)
model = model.to(device)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is the notebook's ``FocalLoss`` (alpha=1, gamma=2)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the focal loss.

        Args:
            model: Model to call with ``**inputs``; it may be a wrapper around
                the underlying model.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Accepted and ignored, so additional arguments passed by
                the Trainer do not raise.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = FocalLoss(alpha=1, gamma=2)
        # Read the config from self.model: the `model` argument can be a wrapper
        # (e.g. for multi-GPU) that does not expose `.config`.
        num_labels = self.model.config.num_labels
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


In [None]:
# Training configuration: 4 epochs, batch size 16, evaluation and checkpointing
# once per epoch, best checkpoint (by eval accuracy) reloaded at the end.
# NOTE(review): a later cell rebuilds `training_args` and `trainer` (6 epochs,
# batch size 8) before `trainer.train()` is called, so this configuration is
# overwritten without ever being trained.
training_config = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
)
training_args = TrainingArguments(**training_config)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # supplies the accuracy used for best-model selection
)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1 to obtain the predicted class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the accuracy of the
        predicted class ids against ``labels``.
    """
    scores, true_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(true_ids, predicted_ids)
    return {"accuracy": accuracy}


In [None]:
# Training configuration: 6 epochs, batch size 8, evaluation and checkpointing
# once per epoch, best checkpoint (by eval accuracy) reloaded at the end.
schedule_options = dict(
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
)
checkpoint_options = dict(
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
)
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    **schedule_options,
    **checkpoint_options,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # supplies the accuracy used for best-model selection
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,1.0687,1.027115,0.263372
2,1.0585,1.024191,0.263372


KeyboardInterrupt: 

In [None]:
def tta_prediction(text, num_augments=5):
    """Predict a label for ``text`` by majority vote over test-time augmentations.

    The text is augmented ``num_augments`` times with the notebook-level
    ``eda`` augmenter, the original text is added, and the notebook-level
    ``model`` classifies all variants in one batch. The most frequent
    predicted class wins.

    Inference runs in eval mode under ``torch.no_grad()`` so dropout is
    disabled and no autograd graph is built. The model's previous
    train/eval mode is restored afterwards.

    Args:
        text: The string to classify.
        num_augments: Number of augmentation attempts; attempts that yield no
            variant are skipped.

    Returns:
        The label name (``id2label`` value) of the majority class.
    """
    variations = []
    for _ in range(num_augments):
        augmented = eda.augment(text)
        if augmented:  # guard against an empty result before indexing [0]
            variations.append(augmented[0])
    variations.append(text)  # Original version

    inputs = tokenizer(
        variations, truncation=True, padding=True, max_length=512, return_tensors='pt'
    ).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
    return id2label[np.bincount(preds).argmax()]


test_df['Target'] = test_df['text'].apply(tta_prediction)
