# Welcome to my TxMM project 
By: Roel Duijsings 

This project is about the emotions of the general public during the missing of Hebe and Sanne in October 2022 in the Netherlands.

## Install dependencies

In [1]:
# ! pip install transformers datasets accelerate nvidia-ml-py3 evaluate
# ! pip install torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu116
# ! pip install numpy pandas
# ! pip install snscrape

In [2]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
import torch

# CUDA 11.6 needed!
torch.cuda.is_available()

  from .autonotebook import tqdm as notebook_tqdm


True

In [4]:
print_gpu_utilization()

GPU memory occupied: 135 MB.


## Finetune the model on the training data

Define the model and tokenizer

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "pdelobelle/robbert-v2-dutch-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast = True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6).to("cuda")


Some weights of the model checkpoint at pdelobelle/robbert-v2-dutch-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.out_proj.weight', 'cl

Load the training data

In [6]:
from datasets.load import load_dataset
from sklearn.model_selection import train_test_split

def tokenize_function(tweet):
    return tokenizer(tweet["text"], padding="max_length", truncation=True)


data_file = "training_data_749.csv"
dataset = load_dataset("csv", data_files=data_file, split="train")
dataset_tokenized = dataset.map(tokenize_function, batched=True)

# train_dataset, eval_dataset  = train_test_split(dataset_tokenized["train"], test_size=0.05)
# print(dataset_tokenized["train"][:5])
train_dataset = dataset_tokenized.shuffle(seed=42).select(range(700))
# print(train_dataset[:5])
eval_dataset = dataset_tokenized.shuffle(seed=6).select(range(50))
# for i in train_dataset["idx"]:
#     print(i)
#     if i in eval_dataset["idx"]:
#         print(i, eval_dataset["idx"==i]["text"],train_dataset["idx"==i]["text"])
#         break

Using custom data configuration default-4fa77517b795a050
Found cached dataset csv (C:/Users/roell/.cache/huggingface/datasets/csv/default-4fa77517b795a050/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Loading cached processed dataset at C:\Users\roell\.cache\huggingface\datasets\csv\default-4fa77517b795a050\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-0a53e4d2d7ea2e47.arrow
Loading cached shuffled indices for dataset at C:\Users\roell\.cache\huggingface\datasets\csv\default-4fa77517b795a050\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-0a8c56b9a7e358c9.arrow
Loading cached shuffled indices for dataset at C:\Users\roell\.cache\huggingface\datasets\csv\default-4fa77517b795a050\0.0.0\6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317\cache-7656037191ef0295.arrow


In [7]:
import numpy as np
import evaluate


metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
from transformers import TrainingArguments, Trainer

batch_size = 4

training_args = TrainingArguments(
    output_dir="test_trainer", 
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    )
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

Train the model


In [9]:
result =trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, text. If idx, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 700
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 525
  Number of trainable parameters = 116766726
  0%|          | 0/525 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 175/525 [00:58<01:55,  3.03it/s]The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.fo

{'eval_loss': 1.3874191045761108, 'eval_accuracy': 0.46, 'eval_runtime': 1.2869, 'eval_samples_per_second': 38.852, 'eval_steps_per_second': 10.102, 'epoch': 1.0}


 67%|██████▋   | 350/525 [01:58<00:58,  3.01it/s]The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, text. If idx, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
                                                 
 67%|██████▋   | 350/525 [01:59<00:58,  3.01it/s]

{'eval_loss': 0.9510800242424011, 'eval_accuracy': 0.64, 'eval_runtime': 1.2371, 'eval_samples_per_second': 40.417, 'eval_steps_per_second': 10.508, 'epoch': 2.0}


 95%|█████████▌| 500/525 [02:49<00:08,  2.95it/s]Saving model checkpoint to test_trainer\checkpoint-500
Configuration saved in test_trainer\checkpoint-500\config.json


{'loss': 1.1485, 'learning_rate': 2.3809523809523808e-06, 'epoch': 2.86}


Model weights saved in test_trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test_trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test_trainer\checkpoint-500\special_tokens_map.json
100%|██████████| 525/525 [02:59<00:00,  2.98it/s]The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, text. If idx, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 50
  Batch size = 4
                                                 
100%|██████████| 525/525 [03:00<00:00,  2.98it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 525/525 [03:00<00:00,  2.90it/s]

{'eval_loss': 0.6707025170326233, 'eval_accuracy': 0.78, 'eval_runtime': 1.2459, 'eval_samples_per_second': 40.13, 'eval_steps_per_second': 10.434, 'epoch': 3.0}
{'train_runtime': 180.9943, 'train_samples_per_second': 11.603, 'train_steps_per_second': 2.901, 'train_loss': 1.1281304931640626, 'epoch': 3.0}





In [10]:
print_summary(result)

Time: 180.99
Samples/second: 11.60
GPU memory occupied: 5501 MB.


## Load the testdata

Preprocess testdata

In [11]:
import snscrape.modules.twitter as sntwitter
import pandas as pd


def preprocess(query, maxTweets):
    """
    Preprocess the tweets:
    - collects them from Twitter API,
    - replaces username and URL by placeholders,

    Returns a DataFrame["Date", "Username", "Raw_text", "Url","Text"]
    """
    df = getTweets(query, maxTweets)
    df["Text"] = df["Raw_text"].map(replacePlaceholders)

    return df


def getTweets(query, maxTweets):
    """
    Gather tweets from Twitter API. Query and maxTweets are used here.

    Returns a DataFrame that is in reversed chronological order.
    """
    tweets = []

    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        if len(tweets) >= maxTweets:
            break
        else:
            tweets.append([tweet.date, tweet.user.username, tweet.content, tweet.url])

    df = pd.DataFrame(tweets, columns=["Date", "Username", "Raw_text", "Url"])
    df.index.name = "id"

    return df


def replacePlaceholders(tweet):
    """
    Replace username and URL in tweet text by @USER and HTTP
    """
    new_tweet = []
    for word in tweet.split(" "):
        if word.startswith("@") and len(word) > 1:
            word = "@USER"
        elif word.startswith("http"):
            word = "HTTP"
        new_tweet.append(word)
    return " ".join(new_tweet)


In [12]:
query = "(hebe) lang:nl until:2022-10-21 since:2022-10-17"
maxTweets = 100000
data = preprocess(query, maxTweets)
data.to_csv(f"test_data_{len(data.index)}.csv")

## Run the model on the test data