# 0 - Text classification

## Importation des modules

In [1]:
# Modules de base
import numpy as np
import pandas as pd
import sys

# Ajout d'un chemin
sys.path.append('..')

# Séparation train/test
from sklearn.model_selection import train_test_split
# Métriques
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Modules de NLP
from transformers import AutoTokenizer, AutoModelForSequenceClassification,TrainingArguments, Trainer
from datasets import Dataset

## Importation des données

### Jeu de données d'entraînement

In [2]:
# Importation du jeu de données d'entrainement
data_train = pd.read_excel('../data/data_train.xlsx', index_col='Unnamed: 0')
# Conversion des labels en types numériques
data_train['label'] = data_train['label'].replace({'1' : 0, '8' : 1, '1:8' : 2})
data_train.head(10)

Unnamed: 0,label,message
0,1,"Madame,Monsieur, je me permet de vous envoyer ..."
5,1,je désire connaitre quel est le montant de mo...
6,1,"Bonjour, Je fais suite du dernier message envo..."
7,1,"Bonjour, Je me permets de vous contacter afin ..."
8,1,"Madame, Monsieur, Après avoir déclaré nos r..."
9,0,Bonjour suite a ma derniere demande et votre r...
10,1,"A l'attention de Madame XXXXX XXXXX Bonjour, J..."
11,0,bonjour j'ai changé d'employeur depuis le 000...
12,0,"Madame, Monsieur, Bonjour, nous sommes XXXXX́s..."
13,1,"Madame, Monsieur bonjour, Je suis désolé mai..."


### Jeu de données de test

In [3]:
# Importation du jeu de données de test
data_test = pd.read_excel('../data/data_test.xlsx', index_col='Unnamed: 0')
data_test.head()

Unnamed: 0,message
0,"Chère Madame, Cher Monsieur, Impossible de ch..."
1,"Bonjour, mon taux pour le prélévement à la ..."
2,"Bonsoir, Le 0000.0000.0000 ma première fille ..."
3,"Bonjour, Lors de ma déclaration de changement..."
4,Bonjour concernant le prélèvement a la sourc...


## Statistiques descriptives

### Longueur des textes dans les jeux de données d'entraînement et de test

### Distribution des labels dans le jeu de données d'entraînement

## TF-IDF

## Word2Vec

## BERT

In [4]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(data_train, test_size=0.2, random_state=42)

# Convert the pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Define a function to tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['message'], padding="max_length", truncation=True)

# Apply the tokenization function to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove the original message column
train_dataset = train_dataset.remove_columns(["message"])
val_dataset = val_dataset.remove_columns(["message"])

# Set the format to PyTorch tensors
train_dataset.set_format("torch")
val_dataset.set_format("torch")

# Load the model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The instantiated Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset             # Evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

# Save the model
model.save_pretrained('./finetuned_distilbert')
tokenizer.save_pretrained('./finetuned_distilbert')


Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/105 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Rapport

Evaluation sur :
- Le workflow
- Le preprocessing (causal language modelling vs masked language modelling)
- Les métriques utilisées
- L'évaluation des résultats

Voir sur la lemmatisation