***AI Engineer - P9 - Développez une preuve de concept***

# Importation des modules

In [1]:
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset

# Fonctions

In [2]:
# Conversion en Dataset
def encode_data(encodings, labels):
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

# Chargement des données

In [3]:
df_final = pd.read_csv("df_final.csv")

In [4]:
# Vérification du DataFrame
print(df_final.head())

   target                                               text  \
0       1  I kinda use the Tweetie video feat., I would u...   
1       0  i want the new album so bad!! but i cant get i...   
2       0  Man... I think Dan is going to make me see Los...   
3       0      Played a bad game with hockey today, lost 7-5   
4       1  wow! i had a great sleep. good morning everyon...   

                                            text_bow  \
0  kinda use tweetie video feat would use audio o...   
1                  want new album bad cant get today   
2  man think dan going make see los campesinos au...   
3                  played bad game hockey today lost   
4  wow great sleep good morning everyone hehe nic...   

                                        text_bow_lem  \
0  kinda use tweetie video feat would use audio o...   
1                  want new album bad cant get today   
2  man think dan going make see los campesinos au...   
3                  played bad game hockey today lost  

In [5]:
# Séparer les données
X = df_final['text']
y = df_final['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chargement des modèles existants

In [6]:
with open("Tf-idf_MLP-Classifier_model.pkl", "rb") as f:
    mlp_model = pickle.load(f)

In [7]:
with open("tfidf_model.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

# Prédictions avec le modèle MLP Classifier

In [8]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)
mlp_predictions = mlp_model.predict(X_test_tfidf)

In [9]:
# Calcul des métriques pour la baseline
print("\n=== MLP Classifier ===")
print(f"Accuracy: {accuracy_score(y_test, mlp_predictions):.4f}")
print(f"F1 Score: {f1_score(y_test, mlp_predictions, average='weighted'):.4f}")
print(f"AUC: {roc_auc_score(y_test, mlp_predictions):.4f}")
print(classification_report(y_test, mlp_predictions))


=== MLP Classifier avec Tf-idf ===
Accuracy: 0.6993
F1 Score: 0.6785
AUC: 0.7064
              precision    recall  f1-score   support

           0       0.94      0.45      0.60      2055
           1       0.62      0.97      0.76      1945

    accuracy                           0.70      4000
   macro avg       0.78      0.71      0.68      4000
weighted avg       0.78      0.70      0.68      4000



# Prédictions avec DistilBERT

In [10]:
# Initialisation de DistilBERT
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Préparation des données pour DistilBERT
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

In [12]:
train_dataset = encode_data(train_encodings, list(y_train))
test_dataset = encode_data(test_encodings, list(y_test))

In [13]:
# Entraîner le modèle DistilBERT
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
)



In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4221,0.413466
2,0.3077,0.432768
3,0.2191,0.53156


TrainOutput(global_step=3000, training_loss=0.3253490473429362, metrics={'train_runtime': 9013.7732, 'train_samples_per_second': 5.325, 'train_steps_per_second': 0.333, 'total_flos': 1589608783872000.0, 'train_loss': 0.3253490473429362, 'epoch': 3.0})

In [17]:
# Sauvegarder le modèle fine-tuné et le tokenizer
output_dir = "./distilbert_finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Modèle et tokenizer sauvegardés dans le répertoire : {output_dir}")

Modèle et tokenizer sauvegardés dans le répertoire : ./distilbert_finetuned


In [15]:
# Évaluer le modèle DistilBERT
predictions = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(predictions.predictions), axis=1).numpy()

In [16]:
# Calcul des métriques pour DistilBERT
print("\n=== DistilBERT ===")
print(f"Accuracy: {accuracy_score(y_test, pred_labels):.4f}")
print(f"F1 Score: {f1_score(y_test, pred_labels, average='weighted'):.4f}")
print(f"AUC: {roc_auc_score(y_test, pred_labels):.4f}")
print(classification_report(y_test, pred_labels))


=== DistilBERT ===
Accuracy: 0.8227
F1 Score: 0.8228
AUC: 0.8230
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      2055
           1       0.81      0.83      0.82      1945

    accuracy                           0.82      4000
   macro avg       0.82      0.82      0.82      4000
weighted avg       0.82      0.82      0.82      4000



In [None]:
# Sauvegarder le modèle entraîné
output_dir = "distilbert_finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)