In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import datasets
import os
import torch.nn.functional as F

try:
    import evaluate
except:
    !pip install evaluate
    import evaluate
    
# Switch off the Weights & Biases integration through its environment flag.
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Read the labelled tweets and store the `label` column as floats.
dataset = pd.read_csv('/kaggle/input/troll-detection/dataset.csv').astype({'label': float})
# Report the number of rows that were loaded.
print(len(dataset))

In [None]:
# Sample 80% of the rows for training (fixed random_state keeps the split
# repeatable); every row that was not sampled becomes the held-out set.
train_data = dataset.sample(frac=0.8, random_state=0)
test_data = dataset.drop(index=train_data.index)

# Only the text and its label are handed on to the Hugging Face datasets.
model_columns = ['tweet', 'label']
training = datasets.Dataset.from_pandas(train_data[model_columns])
validation = datasets.Dataset.from_pandas(test_data[model_columns])

In [None]:
# ROC AUC metric loaded through the `evaluate` library.
roc_auc = evaluate.load('roc_auc')


def compute_metrics(eval_pred):
    """Compute ROC AUC for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair of NumPy arrays, as unpacked
            from the value the ``Trainer`` passes to this callback.

    Returns:
        Whatever ``roc_auc.compute`` returns for the sigmoid-squashed scores.
    """
    logits, labels = eval_pred

    # The model is created with num_labels=1, so the logits carry a trailing
    # singleton axis; the metric is given exactly one score per sample.
    flat_logits = np.asarray(logits).reshape(-1)

    # The arrays are already on the host, so the sigmoid is applied on the CPU
    # instead of copying them to the accelerator and straight back.
    predictions = torch.sigmoid(torch.from_numpy(flat_logits)).numpy()

    # Labels are stored as floats for training; the metric gets integer class
    # ids. NOTE(review): assumes labels are exactly 0/1 -- confirm.
    references = np.asarray(labels).reshape(-1).astype(np.int32)

    return roc_auc.compute(references=references, prediction_scores=predictions)

In [None]:
# Tokenizer and single-output classification model come from the same
# pretrained checkpoint; the model is moved to the selected device.
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1).to(device)

# Collator that pads each batch using the tokenizer's padding settings.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def tokenize_func(sample):
    """Tokenize the ``tweet`` field of a (batched) dataset sample.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here: the ``DataCollatorWithPadding`` given to the ``Trainer`` pads
    each batch to its own longest sequence, so padding every tweet to the
    model maximum at map time would only add wasted tokens.

    Args:
        sample: Mapping with a ``'tweet'`` entry holding the text(s).

    Returns:
        The tokenizer output for the given tweets.
    """
    return tokenizer(sample['tweet'], truncation=True)

# Tokenize both splits with identical batched-map settings.
map_options = dict(batched=True, batch_size=32)
training = training.map(tokenize_func, **map_options)
validation = validation.map(tokenize_func, **map_options)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    num_train_epochs=5,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Without an explicit metric the "best" checkpoint is chosen by eval loss.
    # Tie the choice to the ROC AUC that compute_metrics reports instead;
    # a higher value is better.
    metric_for_best_model='roc_auc',
    greater_is_better=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [None]:
# Gather everything the Trainer needs: the model, its training configuration,
# both dataset splits, the padding collator and the metric callback.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': training,
    'eval_dataset': validation,
    'data_collator': collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cells.
trainer.train()

In [None]:
trainer.save_model('fine-tuned-distillBert')
!zip -r distill-bert.zip /kaggle/working/fine-tuned-distillBert