In [None]:
!pip install transformers datasets scikit-learn




In [None]:
!pip install transformers[torch] accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
!pip install accelerate -U



In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load the CSV file; the code below reads all rows from the resulting
# 'train' split.
dataset = load_dataset('csv', data_files='/content/data.csv')

# Print dataset to inspect the structure
print(dataset)

# Hold out 20% of the rows for evaluation. A fixed seed keeps the split
# (and therefore the reported metrics) reproducible from run to run.
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Debug: Inspect the first example from the dataset
print(train_dataset[0])

def _parse_label(value):
    """Return ``value`` as an int, or None when it is not a usable label.

    A usable label is a non-None value whose string form consists only of
    digits and can actually be converted by ``int()``.
    """
    if value is None:
        return None
    text = str(value)
    if not text.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # str.isdigit() is True for some characters (e.g. superscript
        # digits such as '\u00b2') that int() cannot convert; treat those
        # rows as invalid instead of crashing the map step.
        return None


# Data cleaning function to remove rows with invalid labels
def clean_data(dataset):
    """Drop rows with invalid labels and convert the rest to ``int``.

    Args:
        dataset: Object exposing ``filter(fn)`` and ``map(fn)`` over rows
            that have a 'label' key.

    Returns:
        The result of filtering out rows whose label is None or not a
        digit-only string/number, then mapping 'label' to an int.
    """
    dataset = dataset.filter(lambda row: _parse_label(row['label']) is not None)
    return dataset.map(lambda row: {"label": _parse_label(row['label'])})

# Run both splits through the same cleaning step (training split first)
train_dataset, test_dataset = map(clean_data, (train_dataset, test_dataset))

# Report how many examples survived cleaning
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of test examples: {len(test_dataset)}")

# Load the tokenizer and a two-label classification model from the same
# pretrained checkpoint
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize data
def preprocess_function(examples):
    """Tokenize the 'text' column of a batch of examples.

    Inputs longer than 512 tokens are truncated. No padding is applied
    here: padding every row to 512 tokens up front would make every batch
    maximally wide. Leaving the sequences unpadded lets the
    DataCollatorWithPadding given to the Trainer pad each batch only to
    the length of its longest member.

    Args:
        examples: Batch mapping that contains a 'text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Apply preprocessing (batched so the tokenizer sees many rows per call)
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Define data collator; it pads the examples of each batch at collation time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    # One epoch over ~3.2k examples at batch size 16 is only about 200
    # steps, fewer than the default logging interval, so the training
    # loss was reported as "No log". Log often enough to see it.
    logging_steps=50,
)

# Define compute metrics function
def compute_metrics(p):
    """Score one round of evaluation predictions.

    Args:
        p: Pair ``(predictions, labels)``. The predicted class for each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the 'accuracy' and the weighted-average 'f1' score.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)
    return {
        'accuracy': accuracy_score(references, predicted_classes),
        'f1': f1_score(references, predicted_classes, average='weighted'),
    }

# Initialize Trainer: wires together the model, the training arguments, the
# cleaned and tokenized train/test splits, the padding collator and the
# metric function defined above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train model (evaluation on the test split also runs at the end of each
# epoch, per evaluation_strategy='epoch')
trainer.train()

# Evaluate model on eval_dataset once more and print the metrics dict
results = trainer.evaluate()
print(results)


DatasetDict({
    train: Dataset({
        features: ['URLs', 'text', 'Body', 'label', 'Unnamed: 4', 'Unnamed: 5'],
        num_rows: 4059
    })
})
{'URLs': 'http://www.cnn.com/videos/politics/2017/10/09/pence-indianapolis-flights-cost-es-live.cnn/video/playlists/cant-miss/\r\n', 'text': "The price tag for Pence's trip to Indianapolis", 'Body': "Ethical questions loom after Vice President Mike Pence's trip to Indianapolis to watch, and then abruptly leave after some players knelt during the anthem, a football game between the Indianapolis Colts and San Francisco 49ers.", 'label': '1', 'Unnamed: 4': None, 'Unnamed: 5': None}


Filter:   0%|          | 0/3247 [00:00<?, ? examples/s]

Map:   0%|          | 0/3205 [00:00<?, ? examples/s]

Filter:   0%|          | 0/812 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Number of training examples: 3205
Number of test examples: 800


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3205 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.247162,0.9075,0.9075


{'eval_loss': 0.247162327170372, 'eval_accuracy': 0.9075, 'eval_f1': 0.9075, 'eval_runtime': 24.0113, 'eval_samples_per_second': 33.318, 'eval_steps_per_second': 2.082, 'epoch': 1.0}
