In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Read every CSV in the training folder into a single frame, discard the
# metadata columns that are not used for modelling, and keep only the first
# row for each distinct tweet text.
df = (
    pd.concat(
        (pd.read_csv("data/train_tweets/" + csv_name)
         for csv_name in os.listdir("data/train_tweets")),
        ignore_index=True,
    )
    .drop(columns=["ID", "MatchID", "PeriodID", "Timestamp"])
    .drop_duplicates(subset="Tweet")
)

In [3]:
from datasets import Dataset
from transformers import AutoTokenizer

# Convert to Hugging Face Dataset.
# preserve_index=False: drop_duplicates leaves a non-contiguous pandas index,
# which from_pandas would otherwise store as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of rows from the "Tweet" column.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here: padding every tweet to the model maximum inflates the
    stored dataset and every forward pass. The Trainer instances in this
    notebook receive a tokenizer via `processing_class`, so their default
    collator pads each batch to its own longest sequence instead.

    Args:
        examples: a batch mapping column names to lists, as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch (one entry per input tweet).
    """
    return tokenizer(examples["Tweet"], truncation=True)


# Tokenize the dataset; the raw text column is dropped once it is encoded, and
# the target column is renamed to "label", the name the Trainer looks for.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns="Tweet")
tokenized_dataset = tokenized_dataset.rename_column("EventType", "label")

Map:   0%|          | 0/2819989 [00:00<?, ? examples/s]

In [4]:
# Read every CSV in the evaluation folder into a single frame, discard the
# metadata columns that are not fed to the model, and keep only the first row
# for each distinct tweet text.
df = (
    pd.concat(
        (pd.read_csv("data/eval_tweets/" + csv_name)
         for csv_name in os.listdir("data/eval_tweets")),
        ignore_index=True,
    )
    .drop(columns=["ID", "MatchID", "PeriodID", "Timestamp"])
    .drop_duplicates(subset="Tweet")
)

In [5]:
from datasets import Dataset
from transformers import AutoTokenizer

# Convert the de-duplicated evaluation frame to a Hugging Face Dataset.
# preserve_index=False: drop_duplicates leaves a non-contiguous pandas index,
# which from_pandas would otherwise store as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Reuse `tokenizer` and `tokenize_function` from the training tokenization cell
# rather than reloading / redefining identical copies here: a second definition
# silently shadows the first and lets the two splits drift apart if only one
# copy is ever edited.
tokenized_dataset_test = dataset.map(tokenize_function, batched=True, remove_columns="Tweet")

Map:   0%|          | 0/621958 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: object exposing `predictions` (array of per-class scores whose last
            axis indexes the classes) and `label_ids` (reference class ids).

    Returns:
        A dict with the single key "accuracy".
    """
    # The predicted class is the index of the highest score along the last axis.
    predicted_ids = p.predictions.argmax(axis=-1)
    score = accuracy_score(p.label_ids, predicted_ids)
    return {"accuracy": score}

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the pretrained encoder with a freshly initialised classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', problem_type="single_label_classification")

# Define training arguments.
# Evaluation and checkpointing both happen once per epoch so that
# load_best_model_at_end can restore the checkpoint with the best "accuracy"
# (the key returned by compute_metrics).
training_args = TrainingArguments(
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,
    fp16=True,
    metric_for_best_model="accuracy",
    output_dir="data/out_single_distilbert_base_uncase",
)

# Hold out 10% of the training tweets for the per-epoch evaluation.
# The split gets its own name so that `tokenized_dataset` remains the full,
# un-shuffled Dataset: the prediction cell further down passes it to
# trainer.predict() and relies on its row order matching the CSV rows.
# Rebinding it to the DatasetDict returned by train_test_split would break that
# cell and make this cell fail when re-run. The fixed seed makes the held-out
# set (and therefore the reported accuracy) reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=AutoTokenizer.from_pretrained("distilbert-base-uncased"),
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Fine-tune the model. The TrainingArguments passed to this Trainer request
# 5 epochs, with an evaluation pass and a checkpoint at the end of each epoch.
trainer.train()

In [None]:
# Score the model on the Trainer's eval_dataset (the held-out 10% split);
# compute_metrics contributes the accuracy entry of the returned metrics.
trainer.evaluate()

In [6]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Inference-only setup: restore the fine-tuned weights from a saved checkpoint
# and wrap them in a Trainer that is used purely for batched prediction.
training_args = TrainingArguments(
    output_dir="data/out_single_distilbert_base_uncase",
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "data/out_single_distilbert_base_uncase/checkpoint-198280"
)
# No datasets or metrics are attached here; inputs are supplied per predict() call.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=AutoTokenizer.from_pretrained("distilbert-base-uncased"),
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# Run the restored model over the tokenized training tweets.
# This expects the full, un-split tokenized Dataset: the next cell attaches the
# outputs row-by-row to the de-duplicated training frame, so order and length
# must match that frame.
predictions = trainer.predict(tokenized_dataset)

In [8]:
# Rebuild the de-duplicated training frame (this time keeping the columns that
# were dropped before tokenization) so each prediction can be written next to
# its row.
# NOTE: row alignment relies on os.listdir returning the files in the same
# order as when the tokenized dataset was built, and on the same
# first-occurrence de-duplication by tweet text.
df = (
    pd.concat(
        (pd.read_csv("data/train_tweets/" + csv_name)
         for csv_name in os.listdir("data/train_tweets")),
        ignore_index=True,
    )
    .drop_duplicates(subset="Tweet")
    .drop(columns=["MatchID", "PeriodID", "Timestamp", "Tweet"])
)

# Confidence is the margin between the two model outputs: column 1 minus column 0.
model_outputs = predictions.predictions
df["Confidence"] = model_outputs[:, 1] - model_outputs[:, 0]
df.to_csv("data/distilbert_single_train_full.csv", index=False)

In [9]:
# Run the restored model over the tokenized evaluation tweets. This rebinds
# `predictions`, replacing the training-set outputs from the earlier predict call.
predictions = trainer.predict(tokenized_dataset_test)

In [10]:
# Rebuild the de-duplicated evaluation frame (keeping the columns that were
# dropped before tokenization) so each prediction can be written next to its row.
# NOTE: row alignment relies on os.listdir returning the files in the same
# order as when the tokenized evaluation dataset was built, and on the same
# first-occurrence de-duplication by tweet text.
df = (
    pd.concat(
        (pd.read_csv("data/eval_tweets/" + csv_name)
         for csv_name in os.listdir("data/eval_tweets")),
        ignore_index=True,
    )
    .drop_duplicates(subset="Tweet")
    .drop(columns=["MatchID", "PeriodID", "Timestamp", "Tweet"])
)

# Confidence is the margin between the two model outputs: column 1 minus column 0.
model_outputs = predictions.predictions
df["Confidence"] = model_outputs[:, 1] - model_outputs[:, 0]
df.to_csv("data/distilbert_single_pred_full.csv", index=False)