In [141]:
import pandas as pd

df = pd.read_csv('test.tsv', sep='\t')

In [142]:
print(df['category'].unique())
# count number of each category
print(df['category'].value_counts())

['business' 'health' 'politics' 'sports' 'technology']
category
business      100
health        100
politics      100
sports        100
technology     22
Name: count, dtype: int64


In [143]:
from sklearn.preprocessing import LabelEncoder
clean_df = df.copy()

clean_df.drop('url', axis=1, inplace=True)
le = LabelEncoder()
clean_df['category'] = le.fit_transform(df['category'])

In [144]:
from datasets import Dataset

dataset = Dataset.from_pandas(clean_df)
data = dataset.train_test_split(test_size=0.2, shuffle=True)

In [145]:
from transformers import AutoTokenizer

model_name = 'almanach/camembert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [146]:
def tokenize_function(examples):
    return tokenizer(examples["headline"], truncation=True)

In [147]:
tokenized_data = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/337 [00:00<?, ? examples/s]

Map: 100%|██████████| 337/337 [00:00<00:00, 4152.00 examples/s]
Map: 100%|██████████| 85/85 [00:00<00:00, 4068.93 examples/s]


In [148]:
tokenized_data["train"][0]

{'category': 1,
 'headline': 'Troubles digestifs fonctionnels : comment la constipation est devenue un tabou',
 'text': 'L\'histoire raconte que Napoléon Bonaparte est arrivé en retard à la bataille de Waterloo - au cours de laquelle son armée a fini par être renversée - parce qu\'à ce moment-là, il était très occupé dans la salle de bains, s\'efforçant de déféquer. Mais bien que la constipation ait affligé le général français pendant des années et qu\'une personne sur sept dans le monde connaisse ce problème, le processus biologique d\'élimination des matières fécales est un sujet dont on parle très peu. Des millions de personnes souffrent en silence des effets de la constipation chronique, un problème dont la solution, selon les experts, ne dépend pas exclusivement du régime alimentaire mais aussi d\'un changement mental qui nous permet d\'en parler et de nous libérer ainsi du poids que ce tabou implique. Lire aussi : "On nous entraîne dès l\'enfance à quitter nos couches, mais ensui

In [149]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_data['train'][5]["input_ids"])
print(len(tokens))
print(tokens)
print(tokenizer.model_max_length)

12
['<s>', '▁CAN', '▁2019', '▁:', '▁bientôt', '▁la', '▁coupe', '▁pour', '▁le', '▁Sénégal', '▁?', '</s>']
512


In [150]:
import evaluate

accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_metric.compute(predictions=preds, references=labels)
    acc = accuracy.compute(predictions=preds, references=labels, average="macro")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

# Training

In [151]:
import torch
from transformers import TrainingArguments
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [152]:
batch_size = 64
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [153]:
class_names = le.classes_
label2id = {label: i for i, label in enumerate(le.classes_)}
id2label = {i: label for label, i in label2id.items()}
print(label2id, id2label, class_names)

{'business': 0, 'health': 1, 'politics': 2, 'sports': 3, 'technology': 4} {0: 'business', 1: 'health', 2: 'politics', 3: 'sports', 4: 'technology'} ['business' 'health' 'politics' 'sports' 'technology']


In [154]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def init_trainer():
  model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(class_names), id2label=id2label, label2id=label2id
    ).to(device)
  return Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_data["train"],
      eval_dataset=tokenized_data["test"],
      tokenizer=tokenizer,
      data_collator=data_collator,
      compute_metrics=compute_metrics,
  ), model

In [155]:
trainer, model = init_trainer()
trainer.train()

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
