In [1]:
import pandas as pd
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import datasets
import torch.nn.functional as F

# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset1 = pd.read_csv("../datasets/cleaned/troll-dataset.csv")
# dataset2 = pd.read_csv("../datasets/cleaned/WHtweets-cleaned.csv")
# dataset3 = pd.read_csv("../datasets/cleaned/metootweets-cleaned.csv")

# dataset1['label'] = 1
# dataset2['label'] = 0
# dataset3['label'] = 0

# dataset1.rename(columns={"content": "tweet"}, inplace=True)
# dataset2.rename(columns={"text": "tweet"}, inplace=True)
# dataset3.rename(columns={"text": "tweet"}, inplace=True)

# dataset1 = dataset1[['tweet', 'label']]
# dataset2 = dataset2[['tweet', 'label']]
# dataset3 = dataset3[['tweet', 'label']]

# # Downsample dataset1 so that it has as many rows as dataset2 and dataset3 combined
# dataset1 = dataset1.sample(n=dataset2.__len__() + dataset3.__len__(), random_state=1)

# dataset = pd.concat([dataset1, dataset2, dataset3], ignore_index=True)

# dataset.to_csv("../datasets/dataset.csv", index=False)

In [3]:
# Read the combined tweet dataset from its CSV file.
dataset = pd.read_csv("../datasets/dataset.csv")

# Show how many rows were loaded.
print(len(dataset))

723796


In [4]:
# Class-name -> integer-id lookup, and its inverse derived from it.
label2idx = {"non_troll": 0, "troll": 1}
idx2label = {idx: name for name, idx in label2idx.items()}

# Store the labels as floats.
# NOTE(review): presumably required because the model is built with a single
# output (num_labels=1), whose loss expects float targets -- confirm.
dataset["label"] = dataset["label"].astype("float64")

In [5]:
# Reproducible 80/20 split: sample 80% of the rows for training and keep the
# rows that were not sampled for evaluation.
train_data = dataset.sample(frac=0.8, random_state=0)
test_data = dataset.drop(index=train_data.index)

# Wrap only the text and label columns as Hugging Face datasets.
training = datasets.Dataset.from_pandas(train_data.loc[:, ['tweet', 'label']])
validation = datasets.Dataset.from_pandas(test_data.loc[:, ['tweet', 'label']])

In [6]:
# ROC AUC metric, loaded once and reused for every evaluation pass.
roc_auc = evaluate.load('roc_auc')


def compute_metrics(eval_pred):
    """Compute ROC AUC from the model's raw logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair of NumPy arrays. If the logits
            arrive as a tuple of outputs, only the first element is used.

    Returns:
        Whatever ``roc_auc.compute`` returns for the flattened sigmoid scores
        and integer labels.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # The sigmoid is applied on the CPU: the logits are already a NumPy array,
    # so copying the whole evaluation set to the GPU would only cost memory.
    scores = F.sigmoid(torch.from_numpy(logits)).numpy()

    # The model is created with num_labels=1, so the logits are expected to be
    # shaped (n_samples, 1); the binary metric wants one score per sample.
    scores = scores.reshape(-1)

    # Labels are stored as floats for training, but the metric's references
    # are class ids, so convert them back to integers.
    references = labels.reshape(-1).astype(int)

    return roc_auc.compute(references=references, prediction_scores=scores)

In [7]:
# Tokenizer for the DistilBERT checkpoint, plus a collator that uses it to pad batches.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model with a single output, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_func(sample):
    """Tokenize a batch of tweets for the model.

    Args:
        sample: A batch from ``Dataset.map(..., batched=True)``; its
            ``'tweet'`` entry holds the texts to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length but NOT padded.
    """
    # No padding here on purpose: the Trainer is given a DataCollatorWithPadding,
    # which pads each batch only to its own longest sequence. Padding every
    # tweet to the maximum length up front would make that collator a no-op and
    # waste compute and memory on padding tokens.
    return tokenizer(sample['tweet'], truncation=True)

# Tokenize both splits in batches of 32 examples.
training = training.map(tokenize_func, batched=True, batch_size=32)
validation = validation.map(tokenize_func, batched=True, batch_size=32)

Map:   0%|          | 0/579037 [00:00<?, ? examples/s]

Map: 100%|██████████| 579037/579037 [01:41<00:00, 5697.22 examples/s]
Map: 100%|██████████| 144759/144759 [00:24<00:00, 5792.75 examples/s]


In [9]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(output_dir='./results',
                                    num_train_epochs=5,
                                    eval_strategy='epoch',
                                    save_strategy='epoch',
                                    load_best_model_at_end=True,
                                    # Pick the "best" checkpoint by the ROC AUC reported by
                                    # compute_metrics (higher is better) instead of the
                                    # default, which is the evaluation loss.
                                    metric_for_best_model='roc_auc',
                                    greater_is_better=True,
                                    per_device_train_batch_size=16,
                                    per_device_eval_batch_size=16,
                                    warmup_steps=200,
                                    weight_decay=0.01,
                                    logging_dir='./logs',
                                    logging_steps=10,
                                    )

In [10]:
# Assemble the Trainer from the model, its arguments, both tokenized splits,
# the padding collator and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training,
    eval_dataset=validation,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

In [11]:
# Start training with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 