In [1]:
# First install required packages
!pip install transformers torch scikit-learn pandas

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [30]:
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import (
    CamembertTokenizer,
    CamembertForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
from google.colab import files

In [31]:
# @title Step 2: Upload Your Dataset
uploaded = files.upload()
filename = next(iter(uploaded))
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

Saving training_data.json to training_data.json


In [32]:
df = pd.DataFrame(data)
print(f"Loaded {len(df)} comments")
print("Category distribution:")
print(df['label'].value_counts())

Loaded 500 comments
Category distribution:
label
quality     138
delivery    127
price       124
service     111
Name: count, dtype: int64


In [33]:
# @title Step 3: Prepare Data
le = LabelEncoder()
df['encoded_label'] = le.fit_transform(df['label'])

In [34]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [35]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")


In [38]:
# Corrected tokenization function
def tokenize_text(text_list):
    return tokenizer(
        text_list,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

In [39]:
# Tokenize the text data directly from the Series
train_encodings = tokenize_text(train_df['commentaire'].tolist())
val_encodings = tokenize_text(val_df['commentaire'].tolist())

In [40]:
class CommentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: val[idx] for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [41]:
train_dataset = CommentDataset(train_encodings, train_df['encoded_label'].values)
val_dataset = CommentDataset(val_encodings, val_df['encoded_label'].values)

In [42]:
# @title Step 4: Train Model
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(le.classes_)
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none"
)




In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



In [45]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.76188
2,0.888700,0.371403
3,0.888700,0.23973
4,0.296100,0.2092


TrainOutput(global_step=100, training_loss=0.5923729801177978, metrics={'train_runtime': 2213.5715, 'train_samples_per_second': 0.723, 'train_steps_per_second': 0.045, 'total_flos': 105246312038400.0, 'train_loss': 0.5923729801177978, 'epoch': 4.0})

In [46]:
# @title Step 5: Save and Download Model
!zip -r model.zip ./results

model.save_pretrained("saved_model")
tokenizer.save_pretrained("saved_model")
pd.Series(le.classes_).to_csv("labels.csv", index=False)


updating: results/ (stored 0%)
updating: results/checkpoint-75/ (stored 0%)
updating: results/checkpoint-75/model.safetensors (deflated 15%)
updating: results/checkpoint-75/config.json (deflated 54%)
updating: results/checkpoint-75/rng_state.pth (deflated 24%)
updating: results/checkpoint-75/optimizer.pt (deflated 28%)
updating: results/checkpoint-75/scheduler.pt (deflated 56%)
updating: results/checkpoint-75/trainer_state.json (deflated 64%)
updating: results/checkpoint-75/training_args.bin (deflated 52%)
updating: results/checkpoint-25/ (stored 0%)
updating: results/checkpoint-25/model.safetensors (deflated 15%)
updating: results/checkpoint-25/config.json (deflated 54%)
updating: results/checkpoint-25/rng_state.pth (deflated 24%)
updating: results/checkpoint-25/optimizer.pt (deflated 28%)
updating: results/checkpoint-25/scheduler.pt (deflated 56%)
updating: results/checkpoint-25/trainer_state.json (deflated 57%)
updating: results/checkpoint-25/training_args.bin (deflated 52%)
updatin

In [47]:
# @title Step 6: Test Your Model
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)


Device set to use cpu


In [48]:
test_comments = [
    "Your support team resolved my issue in minutes!",
    "The product arrived damaged and poorly made",
    "Materials feel premium and durable",
    "Not worth the premium price tag",
    "Way overpriced for what you get",
    "Super fast shipping, impressive",
    "Tracking information was never updated"
]

for comment in test_comments:
    result = classifier(comment)
    label_idx = int(result[0]['label'].split('_')[-1])
    label = le.inverse_transform([label_idx])[0]
    print(f"\"{comment}\"")
    print(f"→ {label} ({result[0]['score']:.2%} confidence)")
    print("-" * 50)

"Your support team resolved my issue in minutes!"
→ service (66.19% confidence)
--------------------------------------------------
"The product arrived damaged and poorly made"
→ quality (71.50% confidence)
--------------------------------------------------
"Materials feel premium and durable"
→ quality (82.25% confidence)
--------------------------------------------------
"Not worth the premium price tag"
→ price (82.15% confidence)
--------------------------------------------------
"Way overpriced for what you get"
→ price (82.30% confidence)
--------------------------------------------------
"Super fast shipping, impressive"
→ delivery (48.33% confidence)
--------------------------------------------------
"Tracking information was never updated"
→ quality (34.43% confidence)
--------------------------------------------------
