In [None]:
# Import de la bibliothèque essentielle pour la manipulation de données
import pandas as pd

# Column labels applied to the CSV, in file order
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Read the raw tweets into a DataFrame
df = pd.read_csv(
    "../data/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding="latin-1",
)

# Recode label 4 as 1 so that the 'sentiment' column is binary (0 or 1)
df['sentiment'] = df['sentiment'].replace({4: 1})

# Preview the first 5 rows
print("Aperçu des 5 premières lignes du tableau de données :")
print(df.head())

In [None]:
import mlflow
import mlflow.sklearn
import mlflow.transformers

# Point MLflow at its tracking store
tracking_uri = "file:/mlflow"
mlflow.set_tracking_uri(tracking_uri)

# Name of the experiment used for this notebook
experiment_name = "Analyse de Sentiments - Twitter"
mlflow.set_experiment(experiment_name)

print(f"MLflow configuré pour l'expérience: '{experiment_name}'")

In [None]:
# Down-sample to a class-balanced subset: n_samples rows, half per sentiment label
n_samples = 16000
rows_per_class = n_samples // 2

# Draw the same number of rows for label 0 (negative) and label 1 (positive),
# each with the same fixed seed
df_neg, df_pos = (
    df[df['sentiment'] == label].sample(n=rows_per_class, random_state=42)
    for label in (0, 1)
)

# Stack both samples, shuffle them together and renumber the index
df_truncated_stratified = (
    pd.concat([df_neg, df_pos])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Keep only the label and the tweet text
df_prepared = df_truncated_stratified[['sentiment', 'text']]


# Report the resulting shape
print("Shape of truncated and stratified DataFrame:")
print(df_prepared.shape)

# Preview the first rows
print("\nAperçu du DataFrame tronqué et stratifié:")
display(df_prepared.head())

# Confirm that both classes are equally represented
print("\nValue counts of sentiment in truncated and stratified DataFrame:")
print(df_prepared['sentiment'].value_counts())

In [None]:
# From here on `df` refers to the balanced two-column subset
df = df_prepared
separator = "-" * 30

# 1. Table dimensions as (rows, columns)
print(f"Dimensions du tableau : {df.shape}")
print(separator)

# 2. Column dtypes and non-null counts
print("Informations sur le DataFrame :")
df.info()
print(separator)

# 3. Number of tweets per sentiment label
print("Distribution des sentiments :")
print(df['sentiment'].value_counts())

In [None]:
# We'll use the same powerful libraries as before
import os

# Disable the progress bar from huggingface_hub to avoid LookupError in some notebook environments
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint both the tokenizer and the model are loaded from
model_name = "distilbert-base-uncased"

# 1. Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Sequence-classification model with 2 output labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# --- BitFit implementation ---
# A parameter stays trainable only if its name contains 'bias' or 'classifier'
# ('pre_classifier' is covered by the 'classifier' substring); all others are frozen.
trainable_markers = ('bias', 'classifier')
for name, param in model.named_parameters():
    param.requires_grad = any(marker in name for marker in trainable_markers)

# Count and report the trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters (BitFit): {trainable_params}')

print("DistilBERT model and its tokenizer are loaded and ready!")

In [None]:
# Example sentence used to illustrate what the tokenizer produces
sample_text = "i love the new flight attendant service"

# Tokenize it: pad/truncate to a length of 64 and return PyTorch tensors
encoding = tokenizer(sample_text,
                     padding='max_length',
                     truncation=True,
                     max_length=64,
                     return_tensors='pt')

# Show the raw sentence next to its token ids and attention mask.
# (A stray trailing double quote on the last print made this cell a SyntaxError.)
print("Original Sentence:\n", sample_text)
print("\nToken IDs (input_ids):\n", encoding['input_ids'])
print("\nAttention Mask:\n", encoding['attention_mask'])

In [None]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str``.
        labels: Indexable collection of labels; each item is passed through ``int``.
        tokenizer: Callable invoked with the text and the padding/truncation
            keyword arguments below; its result must expose ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``labels`` tensor."""
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])

        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        tokenized = self.tokenizer(raw_text, **tokenizer_kwargs)

        # The tokenizer output is flattened so each item is a 1-D tensor
        item = {key: tokenized[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Draw the rows that feed the dataset (fixed seed)
sample_df = df.sample(n=16000, random_state=42)

# Length passed to the tokenizer as max_length for every tweet
MAX_LEN = 64

# Hand texts and labels over as NumPy arrays so the dataset indexes them by position
tweet_texts = sample_df['text'].to_numpy()
tweet_labels = sample_df['sentiment'].to_numpy()
sentiment_dataset = SentimentDataset(tweet_texts, tweet_labels, tokenizer, MAX_LEN)

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
import numpy as np

# Seed for the train/test partition; same value as the random_state used when sampling
SPLIT_SEED = 42

# 80 % of the examples for training, the remainder for testing
train_size = int(0.8 * len(sentiment_dataset))
test_size = len(sentiment_dataset) - train_size

# A seeded generator keeps the held-out set identical across kernel restarts,
# so accuracies logged by different runs are measured on the same examples.
train_dataset, test_dataset = random_split(
    sentiment_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

BATCH_SIZE = 16

# Training batches are drawn in shuffled order
train_data_loader = DataLoader(
  train_dataset,
  batch_size=BATCH_SIZE,
  shuffle=True
)

# Test batches are served in dataset order
test_data_loader = DataLoader(
  test_dataset,
  batch_size=BATCH_SIZE
)

print(f"Number of batches in the training loader: {len(train_data_loader)}")
print(f"Number of batches in the testing loader: {len(test_data_loader)}")

In [None]:
from torch.optim import AdamW
from tqdm.auto import tqdm
import torch
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Per-epoch training loss and test accuracy, appended once per epoch below
history = {
    'train_loss': [],
    'accuracy': []
}

# Single source of truth for the learning rate (used by the optimizer and logged to MLflow)
LEARNING_RATE = 2e-5
EPOCHS = 6

# Display names for the report: label 0 is the negative class, label 1 the positive one
TARGET_NAMES = ['Négatif (0)', 'Positif (1)']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Only parameters that require gradients are handed to the optimizer
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=LEARNING_RATE)

with mlflow.start_run(run_name="DistilBERT") as run:
    mlflow.log_params({
        "model_name": model_name,
        "model_type": "DistilBERT",
        "epochs": EPOCHS,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "max_len": MAX_LEN,
    })

    for epoch in range(EPOCHS):
        print(f'--- Epoch {epoch + 1} / {EPOCHS} ---')

        # ---------- Training pass ----------
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_data_loader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Passing `labels` makes the model return the loss alongside the logits
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            total_train_loss += loss.item()

            optimizer.step()

        avg_train_loss = total_train_loss / len(train_data_loader)
        history['train_loss'].append(avg_train_loss)
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

        # ---------- Evaluation pass ----------
        # Runs inside the epoch loop so that every epoch contributes one accuracy
        # point to `history` and to MLflow (it previously ran once, after the loop).
        print("Evaluating...")
        model.eval()

        predictions, true_labels = [], []

        with torch.no_grad():
            for batch in tqdm(test_data_loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Predicted class = index of the largest logit
                batch_predictions = torch.argmax(outputs.logits, dim=1)

                predictions.extend(batch_predictions.cpu().tolist())
                true_labels.extend(batch['labels'].tolist())

        val_accuracy = accuracy_score(true_labels, predictions)
        history['accuracy'].append(val_accuracy)
        mlflow.log_metric("accuracy", val_accuracy, step=epoch)

        report = classification_report(true_labels, predictions, target_names=TARGET_NAMES, output_dict=True)
        print(f"\nRésultats pour l'Epoch {epoch + 1}:")
        print(report)

        # Flatten the per-class / averaged entries of the report into MLflow metrics.
        # Non-dict entries (the overall 'accuracy' float) are skipped: accuracy is logged above.
        report_metrics = {}
        for key, value in report.items():
            if not isinstance(value, dict):
                continue
            cleaned_key = key.replace(' (0)', '_0').replace(' (1)', '_1').replace('-', '_')
            report_metrics[f"precision_{cleaned_key}"] = value['precision']
            report_metrics[f"recall_{cleaned_key}"] = value['recall']
            report_metrics[f"f1_score_{cleaned_key}"] = value['f1-score']
        mlflow.log_metrics(report_metrics, step=epoch)

        print(f"Logged DistilBERT metrics to MLflow (run_id={run.info.run_id}) - accuracy={val_accuracy:.4f}")

        # After the last epoch, store the text report and the model with the run
        if epoch == EPOCHS - 1:
            mlflow.log_text(
                classification_report(true_labels, predictions, target_names=TARGET_NAMES),
                "classification_report_distilbert.txt"
            )

            model_artifact = {"model": model, "tokenizer": tokenizer}
            mlflow.transformers.log_model(
                transformers_model=model_artifact,
                name="distilbert_sentiment_model",
                task="text-classification"
            )

print("\nEntraînement terminé et modèle sauvegardé avec MLflow !")

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training loss values stored in `history`
ax1.plot(history['train_loss'], label='Perte d entraînement')
ax1.set(title='Évolution de la Perte', xlabel='Épochs', ylabel='Perte')
ax1.legend()

# Right panel: accuracy values stored in `history`
ax2.plot(history['accuracy'], label='Précision de validation', color='orange')
ax2.set(title='Évolution de la Précision', xlabel='Épochs', ylabel='Précision')
ax2.legend()

plt.show()

In [None]:
import os

# Directory that receives the saved model and tokenizer
output_dir = "../models/"

# Create it if missing; an already existing directory is not an error
os.makedirs(output_dir, exist_ok=True)

print(f"Sauvegarde du modèle dans le répertoire {output_dir}")

# Save the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("\nModèle et tokenizer sauvegardés avec succès !")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Directory the tokenizer and the model are loaded from
output_dir = "../models/"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSequenceClassification.from_pretrained(output_dir)

# Put the model in evaluation mode before predicting
model.eval()

new_tweet = "The flight was surprisingly on time and the crew was wonderful."

inputs = tokenizer(new_tweet, return_tensors="pt", truncation=True, padding=True)

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class is the index of the largest logit
logits = outputs.logits
prediction = logits.argmax(dim=1).item()

# Display label for each class index
sentiments = ['Négatif (0)', 'Positif (1)']
print(f"Le tweet : '{new_tweet}'")
print(f"Result: {prediction}")
print(f"Prédiction : {sentiments[prediction]}" )