In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import json

2024-04-17 22:14:33.951338: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
submission_file = 'submissions/predicted_test_categories.csv'
epoch = 300
n_correction = 20

In [3]:
# Catégories et textes
categories = ["Politics", "Health", "Finance", "Travel", "Food", "Education", "Environment", "Fashion", "Science", "Sports", "Technology", "Entertainment"]
with open('data/train.json') as f:
    data = json.load(f)

# Création d'une liste de phrases et de labels
texts = []
labels = []
for label, texts_list in data.items():
    for text in texts_list:
        texts.append(text)
        labels.append(categories.index(label))

print(labels)

[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11]


In [4]:
# Préparation des données pour BERT
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [5]:
# Tokenisation des textes
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
labels = torch.tensor(labels)

In [6]:
# Création du dataset
dataset = Dataset(inputs, labels)

# Division en ensemble d'entraînement et de validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])


In [7]:
# Entraînement de BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import DataCollatorWithPadding

# Création d'un data collator qui s'occupe du padding automatiquement
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [9]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=epoch,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator  # Ajoutez le data collator ici
)


trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/1200 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.6971, 'grad_norm': 0.0732276663184166, 'learning_rate': 5e-05, 'epoch': 125.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0027, 'grad_norm': 0.016833817586302757, 'learning_rate': 1.4285714285714285e-05, 'epoch': 250.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'train_runtime': 2239.3866, 'train_samples_per_second': 3.751, 'train_steps_per_second': 0.536, 'train_loss': 0.2918431257704894, 'epoch': 300.0}


TrainOutput(global_step=1200, training_loss=0.2918431257704894, metrics={'train_runtime': 2239.3866, 'train_samples_per_second': 3.751, 'train_steps_per_second': 0.536, 'train_loss': 0.2918431257704894, 'epoch': 300.0})

In [10]:
with open('data/test_shuffle.txt') as f:
    test_texts = f.read().splitlines()

# Tokenisation des données de test
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Prédiction
model.eval()
with torch.no_grad():
    outputs = model(**test_encodings)

predictions = torch.argmax(outputs.logits, dim=-1)

In [11]:
# Convertir les indices en catégories
predicted_categories = [categories[prediction] for prediction in predictions]

# Création du DataFrame
results_df = pd.DataFrame({
    'ID': range(0, len(predicted_categories)),
    'Label': predicted_categories,
})
results_with_sentences = pd.DataFrame({
    'ID': range(0, len(predicted_categories)),
    'Label': predicted_categories,
    'Sentence': test_texts
})

# Sauvegarde en CSV
results_df.to_csv(submission_file, index=False)
results_with_sentences.to_csv('submissions/predicted_test_categories_with_sentences.csv', index=False)


In [12]:
import pandas as pd
# Obtenir les probabilités maximales pour chaque prédiction
max_probs, _ = torch.max(outputs.logits, dim=1)

# Obtenir les indices des instances avec la plus petite confiance
sorted_indices = torch.argsort(max_probs, descending=False)

# Obtenir les phrases, les labels et les probabilités correspondants
sorted_texts = [test_texts[i] for i in sorted_indices]
sorted_labels = [predicted_categories[i] for i in sorted_indices]
sorted_probs = max_probs[sorted_indices]

# Créer un DataFrame à partir des listes
df = pd.DataFrame({
    'Text': sorted_texts[:n_correction],
    'Label': sorted_labels[:n_correction],
    'Probability': [prob.item() for prob in sorted_probs[:n_correction]]
})

# Afficher le DataFrame
df

Unnamed: 0,Text,Label,Probability
0,The singer's music video featured elaborate ch...,Travel,0.964023
1,The singer's acoustic performance showcased th...,Entertainment,1.114668
2,The sports car accelerated from 0 to 60 mph in...,Finance,1.280977
3,The drone pilot operates the aircraft to captu...,Health,1.369325
4,The singer's heartfelt ballad resonated with l...,Entertainment,1.614717
5,The impact of cultural appropriation in fashio...,Fashion,1.639305
6,The singer's charity work extends beyond music...,Fashion,1.6819
7,The singer-songwriter performed an acoustic se...,Sports,1.684123
8,The importance of arts education in education ...,Health,1.752597
9,The impact of fashion on cultural identity is ...,Fashion,1.756463


In [13]:
from IPython.display import display
import ipywidgets as widgets

# Créer une liste pour stocker les widgets de saisie, pour pouvoir extraire les données plus tard
input_widgets = {}
corrected_labels = []

# Afficher les textes et préparer les widgets de correction
for idx, (text, label) in enumerate(zip(sorted_texts[:n_correction], sorted_labels[:n_correction])):
    print(f'Text: {text}, Original Label: {label}')
    input_widget = widgets.Text(
        value='',
        placeholder='Entrez le label correct s\'il y a une erreur',
        description='Correction:',
        continuous_update=False
    )
    display(input_widget)
    input_widgets[idx] = (input_widget, label, text)  # stocker le widget avec le label original et le texte

#["Politics", "Health", "Finance", "Travel", "Food", "Education", "Environment", "Fashion", "Science", "Sports", "Technology", "Entertainment"]


Text: The singer's music video featured elaborate choreography and stunning visuals., Original Label: Travel


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The singer's acoustic performance showcased their raw talent and emotion., Original Label: Entertainment


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The sports car accelerated from 0 to 60 mph in just a few seconds., Original Label: Finance


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The drone pilot operates the aircraft to capture aerial footage for film and television., Original Label: Health


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The singer's heartfelt ballad resonated with listeners on an emotional level., Original Label: Entertainment


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The impact of cultural appropriation in fashion is a topic of ongoing debate., Original Label: Fashion


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The singer's charity work extends beyond music to support various causes., Original Label: Fashion


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The singer-songwriter performed an acoustic set at a small venue., Original Label: Sports


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The importance of arts education in education cannot be overemphasized., Original Label: Health


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The impact of fashion on cultural identity is a topic of ongoing concern., Original Label: Fashion


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The two parties are engaged in a bitter feud., Original Label: Politics


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The mayor announced a new initiative to improve public transportation., Original Label: Politics


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The sports academy recruits talented athletes from across the country., Original Label: Sports


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The impact of fashion on cultural appropriation is a topic of ongoing debate., Original Label: Environment


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The soccer coach emphasized teamwork and communication on the field., Original Label: Sports


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The importance of extracurricular activities in education cannot be overemphasized., Original Label: Technology


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The sports academy offers training programs for aspiring athletes., Original Label: Education


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The sports blog provides in-depth coverage of the latest events., Original Label: Technology


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The importance of ethical labor practices in fashion cannot be overstated., Original Label: Finance


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

Text: The equestrian rider cleared the obstacles with precision and grace., Original Label: Travel


Text(value='', continuous_update=False, description='Correction:', placeholder="Entrez le label correct s'il y…

In [16]:
from IPython.display import display
import ipywidgets as widgets
import torch

# Préparation pour les variables globales
global corrected_texts, corrected_labels

# Initialiser les listes vides
corrected_texts = []
corrected_labels = []

def process_corrections():
    global corrected_texts, corrected_labels
    corrected_texts = []
    corrected_labels = []
    
    for idx, (widget, original_label, text) in input_widgets.items():
        corrected_label = widget.value if widget.value else original_label
        corrected_labels.append(corrected_label)
        corrected_texts.append(text)
        print(f'Corrected Text: {text}, Corrected Label: {corrected_label}')

    # Affichage pour vérification
    print("\nRécapitulatif des corrections :")
    for text, label in zip(corrected_texts, corrected_labels):
        print(f'Text: {text}, Label: {label}')

# Bouton pour traiter les corrections
process_button = widgets.Button(description="Process Corrections")
process_button.on_click(lambda b: process_corrections())
display(process_button)


Button(description='Process Corrections', style=ButtonStyle())

Corrected Text: The singer's music video featured elaborate choreography and stunning visuals., Corrected Label: Entertainment
Corrected Text: The singer's acoustic performance showcased their raw talent and emotion., Corrected Label: Entertainment
Corrected Text: The sports car accelerated from 0 to 60 mph in just a few seconds., Corrected Label: Sports
Corrected Text: The drone pilot operates the aircraft to capture aerial footage for film and television., Corrected Label: Technology
Corrected Text: The singer's heartfelt ballad resonated with listeners on an emotional level., Corrected Label: Entertainment
Corrected Text: The impact of cultural appropriation in fashion is a topic of ongoing debate., Corrected Label: Fashion
Corrected Text: The singer's charity work extends beyond music to support various causes., Corrected Label: Entertainment
Corrected Text: The singer-songwriter performed an acoustic set at a small venue., Corrected Label: Entertainment
Corrected Text: The importa

In [17]:
# Assurez-vous que `process_corrections()` a été exécuté et les listes remplies
if corrected_texts and corrected_labels:
    # Tokenisation des textes corrigés

    print(corrected_texts[0], corrected_labels[0])
    corrected_encodings = tokenizer(corrected_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    corrected_labels_indices = torch.tensor([categories.index(label) for label in corrected_labels])

    # Création du dataset pour les instances corrigées
    corrected_dataset = Dataset(corrected_encodings, corrected_labels_indices)

print(corrected_labels_indices)


The singer's music video featured elaborate choreography and stunning visuals. Entertainment
tensor([11, 11,  9, 10, 11,  7, 11, 11,  5,  7,  0,  0,  9,  7,  9,  5,  9,  9,
         7,  9])


In [18]:
# Assuming `dataset` is your Dataset object
for i, data in enumerate(dataset):
    print(f"Data at index {i} is: {data}")
    if i > 10:  # Only print the first 10 elements
        break

Data at index 0 is: {'input_ids': tensor([ 101, 1996, 3664, 2623, 1037, 2047, 6349, 2000, 5335, 2270, 5193, 1012,
         102,    0,    0,    0,    0,    0,    0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(0)}
Data at index 1 is: {'input_ids': tensor([  101,  1996,  5205,  2003,  5307,  6256,  2005,  2014, 11032,  2006,
         1996,  3522,  3021,  1012,   102,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]), 'labels': tensor(0)}
Data at index 2 is: {'input_ids': tensor([  101,  1996,  9046,  2602,  2038, 13977,  6387, 14379,  2426,  1996,
         5347,  1012,   102,     0,     0,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


In [19]:

# Fusionner les datasets d'entraînement originaux et corrigés
new_train_dataset = torch.utils.data.ConcatDataset([train_dataset, corrected_dataset])

In [20]:
print(len(corrected_dataset))
print(len(train_dataset))
print(len(new_train_dataset))

20
28
48


In [26]:

# Recréation de l'ensemble d'entraînement et de validation
new_total_size = len(new_train_dataset)
new_train_size = int(0.8 * new_total_size)
new_val_size = new_total_size - new_train_size
new_train_dataset, new_val_dataset = torch.utils.data.random_split(new_train_dataset, [new_train_size, new_val_size])


training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=500,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
)

# Réinitialisation du Trainer avec les nouveaux datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_dataset,
    eval_dataset=new_val_dataset,
    data_collator=data_collator  # Ajoutez le data collator ici
)


# Réentraînement du modèle avec le nouveau dataset
trainer.train()

# Après le réentraînement, effectuer des prédictions sur le dataset de test pour évaluer les améliorations
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
model.eval()
with torch.no_grad():
    outputs = model(**test_encodings)
predictions = torch.argmax(outputs.logits, dim=-1)

# Conversion des indices de prédictions en catégories
predicted_categories = [categories[prediction] for prediction in predictions]



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/2000 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0, 'grad_norm': 7.641436354788311e-07, 'learning_rate': 5e-05, 'epoch': 125.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0, 'grad_norm': 4.719841513178835e-07, 'learning_rate': 3.3333333333333335e-05, 'epoch': 250.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0, 'grad_norm': 3.267794852490624e-07, 'learning_rate': 1.6666666666666667e-05, 'epoch': 375.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'loss': 0.0, 'grad_norm': 2.7303320848659496e-07, 'learning_rate': 0.0, 'epoch': 500.0}
{'train_runtime': 4393.4369, 'train_samples_per_second': 3.414, 'train_steps_per_second': 0.455, 'train_loss': 1.2001890399915281e-07, 'epoch': 500.0}


In [28]:

# Création du DataFrame
results_correction_df = pd.DataFrame({
    'ID': range(0, len(predicted_categories)),
    'Label': predicted_categories,
})
results_correction_with_sentences = pd.DataFrame({
    'ID': range(0, len(predicted_categories)),
    'Label': predicted_categories,
    'Sentence': test_texts
})

# Sauvegarde en CSV
results_correction_df.to_csv('submissions/predicted_test_correction_categories.csv', index=False)
results_correction_with_sentences.to_csv('submissions/predicted_test_correction_categories_with_sentences.csv', index=False)




In [29]:
baseline_df = pd.read_csv('data/baseline.csv')

# Fusionner les dataframes sur l'ID
merged_df = baseline_df.merge(results_df, on='ID', suffixes=('_baseline', '_file1'))
merged_df = merged_df.merge(results_correction_df, on='ID', suffixes=('', '_file2'))

# Renommer les colonnes pour plus de clarté
merged_df.rename(columns={'Label_baseline': 'Baseline', 'Label_file1': 'File1', 'Label': 'File2'}, inplace=True)

# Calculer l'accuracy pour chaque fichier par rapport à la baseline
merged_df['Correct_File1'] = merged_df['Baseline'] == merged_df['File1']
merged_df['Correct_File2'] = merged_df['Baseline'] == merged_df['File2']

accuracy_file1 = merged_df['Correct_File1'].mean()
accuracy_file2 = merged_df['Correct_File2'].mean()

print(f"Accuracy de No correction par rapport à la baseline: {accuracy_file1:.2%}")
print(f"Accuracy de With correction par rapport à la baseline: {accuracy_file2:.2%}")


Accuracy de No correction par rapport à la baseline: 61.84%
Accuracy de With correction par rapport à la baseline: 78.42%


In [24]:
incorrect_df = merged_df[~(merged_df['Correct_File2']==merged_df['Correct_File1'])]
incorrect_df.head()

Unnamed: 0,ID,Baseline,File1,File2,Correct_File1,Correct_File2
3,3,Finance,Finance,Technology,True,False
6,6,Finance,Finance,Politics,True,False
7,7,Environment,Environment,Finance,True,False
8,8,Fashion,Technology,Fashion,False,True
13,13,Fashion,Environment,Fashion,False,True


In [25]:
merged_df

Unnamed: 0,ID,Baseline,File1,File2,Correct_File1,Correct_File2
0,0,Finance,Finance,Finance,True,True
1,1,Environment,Environment,Environment,True,True
2,2,Science,Science,Science,True,True
3,3,Finance,Finance,Technology,True,False
4,4,Science,Environment,Technology,False,False
...,...,...,...,...,...,...
1135,1135,Entertainment,Entertainment,Entertainment,True,True
1136,1136,Travel,Travel,Fashion,True,False
1137,1137,Food,Environment,Health,False,False
1138,1138,Health,Health,Health,True,True
