# Loading Dataset

In [1]:
from datasets import load_from_disk
import torch
import numpy as np 
# Load the rumour dataset from the local ./data/rumour directory.
# Later cells index the result by split name ("train", "dev").
rumour = load_from_disk("./data/rumour")

# Preprocess

## Tokenize

In [2]:
from transformers import AutoTokenizer
# Tokenizer for the same "distilbert-base-uncased" checkpoint that the classifier below is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [3]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    ``truncation=True`` asks the tokenizer to cut sequences that are too long
    for the model (DistilBERT here) instead of passing them through unchanged.
    No padding is applied at this stage.

    Args:
        examples: A mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [4]:
# Apply preprocess_function across the rumour dataset; batched=True hands it a batch of examples per call.
tokenized_rumour = rumour.map(preprocess_function, batched=True)

Loading cached processed dataset at data/rumour/train\cache-c41a3ac9ef72f510.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at data/rumour/test\cache-8a704b5dea11a8ad.arrow


In [29]:
# Tokenize the COVID dataset with the same preprocessing used for the rumour data.
# NOTE(review): this cell was executed as In [29], out of order with its neighbours, and reads
# "./covid/test2". The prediction cell further down reloads "./covid/test" into the same two
# names, so the values assigned here are overwritten -- confirm which path is intended.
covid_dataset = load_from_disk("./covid/test2")
tokenized_covid = covid_dataset.map(preprocess_function, batched=True)

  0%|          | 0/18 [00:00<?, ?ba/s]

## Padding

In [5]:
from transformers import DataCollatorWithPadding

# Collator that pads tokenized examples when batches are assembled; the tokenization step
# above only truncates and does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Train

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT with a sequence-classification head and two output classes.
# The prediction cell writes class index 1 as "rumour" and anything else as "nonrumour".
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [10]:
# Fine-tuning hyperparameters; checkpoints and config files are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
)

# NOTE(review): eval_dataset is supplied, but this cell configures no evaluation schedule or
# metric function, and the captured training log only reports training loss. The "dev" split
# therefore appears to be unused during train() -- confirm whether validation was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_rumour["train"],
    eval_dataset=tokenized_rumour["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# NOTE(review): `model` is created in an earlier cell and updated in place here, so re-running
# this cell continues training the same in-memory model rather than restarting from the
# pretrained weights. Re-run the model-loading cell first for a clean run.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1895
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1785


Step,Training Loss
500,0.0472
1000,0.0204
1500,0.015


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json
Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json
Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1500\special_toke

TrainOutput(global_step=1785, training_loss=0.025872563380821077, metrics={'train_runtime': 458.1901, 'train_samples_per_second': 62.038, 'train_steps_per_second': 3.896, 'total_flos': 3724008890395512.0, 'train_loss': 0.025872563380821077, 'epoch': 15.0})

In [33]:
# Predict a rumour / non-rumour label for every COVID example and write one label per line
# to covid.predict_new.txt.
covid_dataset = load_from_disk("./covid/test")
tokenized_covid = covid_dataset.map(preprocess_function, batched=True)

# Use the GPU when present, otherwise fall back to CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the model once instead of once per example
# The model may still be in training mode after trainer.train(); eval() disables dropout so
# the predictions are deterministic.
model.eval()

# no_grad(): inference only, so skip building the autograd graph for each forward pass.
with open("covid.predict_new.txt", "w") as f, torch.no_grad():
    for example in tokenized_covid:
        # unsqueeze(0) adds a leading batch dimension of size 1 for the single example.
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predicted_class = logits.argmax(dim=-1).item()

        # Class index 1 is written as "rumour"; any other index as "nonrumour".
        f.write("rumour\n" if predicted_class == 1 else "nonrumour\n")