# Model Training

In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install transformers datasets

In [None]:
!pip install evaluate

In [None]:
!pip install transformers datasets sentencepiece tensorboard
# install git-fls for pushing model and logs to the hugging face hub
!sudo apt-get install git-lfs --yes

In [None]:
!pip install "torch>=2.0" --extra-index-url https://download.pytorch.org/whl/cu117 --upgrade --quiet

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be stored in the notebook itself.
# It is read from the HF_TOKEN environment variable, with an interactive
# (non-echoing) prompt as a fallback.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# add_to_git_credential=True additionally hands the token to the git credential store
login(token=hf_token, add_to_git_credential=True)

In [None]:
from datasets import load_dataset

# Identifier of the dataset repository on the Hugging Face Hub
dataset_id = "roupenminassian/twitter-misinformation"

# Each split is read from its own CSV file inside that repository
raw_dataset = load_dataset(
    dataset_id,
    data_files={"train": "training_data.csv", "test": "testing_data.csv"},
)

# Report how many rows each split contains
print(f"Train dataset size: {len(raw_dataset['train'])}")
print(f"Test dataset size: {len(raw_dataset['test'])}")



  0%|          | 0/2 [00:00<?, ?it/s]

Train dataset size: 92394
Test dataset size: 10267


In [None]:
# Display the loaded dataset object as the cell output to inspect what was loaded
raw_dataset

In [None]:
from random import randrange

# Columns to discard before training (presumably leftover index columns from
# the CSV export -- TODO confirm against the dataset card)
index_columns = ["Unnamed: 0.1", "Unnamed: 0"]

# Only drop columns that are still present, so re-running this cell is a no-op
# instead of an error about missing columns.
present_columns = {name for names in raw_dataset.column_names.values() for name in names}
columns_to_drop = [name for name in index_columns if name in present_columns]
if columns_to_drop:
    # remove_columns drops the columns directly; no identity map() over every row is needed
    raw_dataset = raw_dataset.remove_columns(columns_to_drop)

# Show one randomly chosen training example as a sanity check
random_id = randrange(len(raw_dataset['train']))
raw_dataset['train'][random_id]



{'text': 'Crackdown 3 Destruction Restricted to Multiplayer: Crackdown 3 impressed earlier this week with a demonstratio... http://t.co/gwESgesZxV',
 'label': 1}

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the text
model_id = 'Twitter/twhin-bert-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize(batch):
    """Encode the ``text`` entries of ``batch`` with the module-level tokenizer.

    Every sequence is padded or truncated to exactly 512 tokens and the
    encodings are requested as PyTorch tensors.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    return tokenizer(batch['text'], **encoding_options)


# Rename the target column to "labels" to match what the Trainer expects
raw_dataset = raw_dataset.rename_column("label", "labels")

# Tokenize in batches; the raw text column is dropped from the result
tokenized_dataset = raw_dataset.map(tokenize, remove_columns=["text"], batched=True)

print(tokenized_dataset["train"].features.keys())
# dict_keys(['labels', 'input_ids', 'attention_mask'])



Map:   0%|          | 0/10267 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])


In [None]:
from transformers import AutoModelForSequenceClassification

model_id = 'Twitter/twhin-bert-large'

# Single source of truth for the classes; the reverse map and the class count
# are derived from it so the three values cannot drift apart.
id2label = {0: "factual", 1: "misinformation"}
label2id = {name: index for index, name in id2label.items()}
num_labels = len(id2label)

# Load the checkpoint with a sequence-classification head for these labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at Twitter/twhin-bert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/twhin-bert-l

In [None]:
import evaluate
import numpy as np

# F1 metric loaded through the `evaluate` library
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. The predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        ``labels`` with ``average="weighted"``.
    """
    raw_scores, references = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
        average="weighted",
    )

In [None]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Name used both for the local output directory and for the Hub repository
repository_id = "twitter-misinformation-classifier-3"

# Hyperparameters, logging/evaluation cadence and Hub upload settings
training_args = TrainingArguments(
    output_dir=repository_id,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    torch_compile=True,  # PyTorch 2.0 specific optimisation
    # --- logging ---
    report_to="tensorboard",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=250,
    # --- evaluation & checkpointing ---
    evaluation_strategy="steps",
    metric_for_best_model="f1",
    save_strategy="epoch",
    save_total_limit=2,
    # --- Hugging Face Hub ---
    push_to_hub=True,
    hub_model_id=repository_id,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
)

# Wire the model, both tokenized splits and the metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

/content/twitter-misinformation-classifier-3 is already a clone of https://huggingface.co/roupenminassian/twitter-misinformation-classifier-3. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Start fine-tuning with the Trainer configured in the previous cell
trainer.train()



Step,Training Loss,Validation Loss,F1
250,0.3092,0.087334,0.968321
500,0.095,0.097344,0.975214
750,0.0937,0.096923,0.966908
1000,0.0859,0.076235,0.977241
1250,0.0828,0.120822,0.96692
1500,0.0685,0.07455,0.980287
1750,0.0801,0.067812,0.979778
2000,0.0717,0.102724,0.978294
2250,0.1132,0.096798,0.977084
2500,0.1088,0.083777,0.979386




Step,Training Loss,Validation Loss,F1
250,0.3092,0.087334,0.968321
500,0.095,0.097344,0.975214
750,0.0937,0.096923,0.966908
1000,0.0859,0.076235,0.977241
1250,0.0828,0.120822,0.96692
1500,0.0685,0.07455,0.980287
1750,0.0801,0.067812,0.979778
2000,0.0717,0.102724,0.978294
2250,0.1132,0.096798,0.977084
2500,0.1088,0.083777,0.979386


In [None]:
# Save the tokenizer into the repository folder (the same path used as the Trainer's output_dir)
tokenizer.save_pretrained(repository_id)
# Create a model card through the Trainer
trainer.create_model_card()
# Push to the Hugging Face Hub through the Trainer
trainer.push_to_hub()

# Inference

In [None]:
from transformers import pipeline

# Build an inference pipeline around the classifier downloaded from the Hub
classifier = pipeline(
    task='sentiment-analysis',
    model='roupenminassian/TwHIN-BERT-Misinformation-Classifier',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/835 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.25G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Classify one sample sentence; [0] selects the first result and ['label'] its predicted label string
classifier("There is a possibility that Armenia was attacked by Azerbaijan in the early hours of the morning.")[0]['label']

'factual'