In [None]:
!pip install pandas scikit-learn transformers[torch] datasets

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# The CSV is expected at <parent of the current working directory>/data/data.csv.
data_path = os.path.join(os.path.dirname(os.getcwd()), "data", "data.csv")

df = pd.read_csv(data_path)

# Encode the 'f'/'t' flags in `is_positive` as 0/1 integer labels.
# Series.map yields NaN for every value missing from the mapping, and
# astype(int) then fails with an opaque casting error, so validate first
# and report exactly which raw values were not understood.
label_map = {'f': 0, 't': 1}
encoded_labels = df['is_positive'].map(label_map)
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'is_positive'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'is_positive' (expected 'f' or 't'): {bad_values!r}"
    )
df['labels'] = encoded_labels.astype(int)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Last expression of the cell: show the first training rows as a sanity check.
train_df.head()

In [None]:
# Wrap each pandas split in a `datasets.Dataset` so it can be tokenized
# with `.map` and handed to the Trainer.
train_dataset = Dataset.from_pandas(df=train_df)
val_dataset = Dataset.from_pandas(df=val_df)

In [None]:
def preprocess(examples):
    """Tokenize the 'title' entry of `examples` with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides a 'title' entry; presumably a batch
            (list of strings) when this is used with `Dataset.map(batched=True)`.

    Returns:
        Whatever the tokenizer call returns for those titles.
    """
    titles = examples['title']
    # NOTE(review): no explicit max_length is passed, so padding='max_length'
    # presumably pads every title to the model's maximum input length --
    # confirm this is intended for short texts such as titles.
    return tokenizer(titles, truncation=True, padding='max_length')

# Tokenize both splits; batched=True hands `preprocess` a batch of rows per call
# instead of a single row.
train_dataset = train_dataset.map(function=preprocess, batched=True)
val_dataset = val_dataset.map(function=preprocess, batched=True)

In [None]:
# Expose only the model inputs and the labels, as torch tensors, on both splits.
for _split in (train_dataset, val_dataset):
    _split.set_format(
        type='torch',
        columns=['input_ids', 'attention_mask', 'labels'],
    )

In [None]:
# Hyperparameters and output locations for the Trainer.
training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    warmup_steps=500,
    weight_decay=0.01,
)

In [None]:
# Load the pretrained DistilBERT checkpoint configured for two labels.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Bundle the model, the training arguments and both dataset splits.
# Nothing is trained in this cell; training is started separately.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run training with the Trainer configured above (its model, arguments and
# training dataset).
trainer.train()

# Evaluate the model: no dataset is passed here, so this relies on the
# eval_dataset the Trainer was constructed with. No compute_metrics function
# was supplied to the Trainer, so the printed results contain only what the
# Trainer reports on its own.
eval_results = trainer.evaluate()
print(eval_results)