In [19]:
import os
import pandas as pd
import numpy as np

In [2]:
# Stack every CSV in the training folder, discard the match/period/timestamp
# columns, keep only the first occurrence of each tweet text, then collapse
# each ID into one row: the first EventType seen for that ID plus all of its
# remaining tweets joined by a single space. The ID column itself is dropped.
df = (
    pd.concat(
        [pd.read_csv("data/train_tweets/" + name)
         for name in os.listdir("data/train_tweets")],
        ignore_index=True,
    )
    .drop(columns=["MatchID", "PeriodID", "Timestamp"])
    .drop_duplicates(subset="Tweet")
    .groupby("ID")
    .agg({"EventType": "first", "Tweet": " ".join})
    .reset_index()
    .drop(columns="ID")
)

In [None]:
from datasets import Dataset
from transformers import BertTokenizer

# Wrap the aggregated frame as a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the pretrained BERT vocabulary used to encode the text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Encode the "Tweet" entries of a batch.

    Sequences are truncated and padded with ``padding="max_length"`` so that
    every encoded example has the same length.
    """
    texts = examples["Tweet"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Encode in batches, drop the raw text column, and expose the target
# column under the name "label".
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True, remove_columns="Tweet")
    .rename_column("EventType", "label")
)

Map:   0%|          | 0/2137 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'Tweet', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2137
})

In [4]:
# Persist the tokenized training set so that a later cell can reload it from
# this path instead of re-tokenizing.
tokenized_dataset.save_to_disk("data/tokenized_train_agg_bert_base_uncased")

Saving the dataset (0/1 shards):   0%|          | 0/2137 [00:00<?, ? examples/s]

In [20]:
# Stack every CSV in the evaluation folder, discard the match/period/timestamp
# columns, keep only the first occurrence of each tweet text, then join each
# ID's remaining tweets with a single space. Rows follow the order of the
# groupby keys; the ID column itself is dropped afterwards.
df = (
    pd.concat(
        [pd.read_csv("data/eval_tweets/" + name)
         for name in os.listdir("data/eval_tweets")],
        ignore_index=True,
    )
    .drop(columns=["MatchID", "PeriodID", "Timestamp"])
    .drop_duplicates(subset="Tweet")
    .groupby("ID")["Tweet"]
    .apply(" ".join)
    .reset_index()
    .drop(columns="ID")
)

In [21]:
from datasets import Dataset
from transformers import BertTokenizer

# Wrap the aggregated evaluation frame as a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load the pretrained BERT vocabulary used to encode the text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Encode the "Tweet" entries of a batch.

    Sequences are truncated and padded with ``padding="max_length"`` so that
    every encoded example has the same length.
    """
    texts = examples["Tweet"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Encode in batches and drop the raw text column.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns="Tweet",
)

Map:   0%|          | 0/516 [00:00<?, ? examples/s]

In [22]:
# Persist the tokenized evaluation set so that a later cell can reload it from
# this path before running predictions.
tokenized_dataset.save_to_disk("data/tokenized_eval_agg_bert_base_uncased")

Saving the dataset (0/1 shards):   0%|          | 0/516 [00:00<?, ? examples/s]

In [1]:
from datasets import Dataset
# Reload the tokenized training set from the directory it was saved to.
tokenized_dataset = Dataset.load_from_disk("data/tokenized_train_agg_bert_base_uncased")

In [3]:
from sklearn.metrics import accuracy_score
def compute_metrics(p):
    """Compute accuracy of the arg-max predictions against the reference labels.

    Args:
        p: object exposing ``predictions`` (scores; arg-max is taken over the
            last axis) and ``label_ids`` (reference labels).

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    predicted_classes = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {"accuracy": accuracy}

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load pretrained BERT with a sequence-classification head configured for
# single-label classification.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', problem_type = "single_label_classification")

# Define training arguments: evaluate and checkpoint once per epoch, and
# restore the checkpoint with the best "accuracy" when training finishes.
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    output_dir="data/out_agg_bert_base_uncase",
)

# Hold out 30% of the examples for evaluation. The seed is fixed so that the
# split, and therefore the reported accuracy, is the same on every run;
# without it each execution shuffles differently.
SPLIT_SEED = 42
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# Initialize the Trainer on the train/test halves of the split
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [5]:
# Fine-tune the model with the settings in training_args (5 epochs, with
# evaluation and checkpointing after each epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.651635,0.607477
2,No log,0.727854,0.609034
3,No log,0.632969,0.65109
4,No log,0.714515,0.637072
5,No log,0.72775,0.655763


TrainOutput(global_step=470, training_loss=0.5521903829371676, metrics={'train_runtime': 720.3311, 'train_samples_per_second': 10.377, 'train_steps_per_second': 0.652, 'total_flos': 1966755138816000.0, 'train_loss': 0.5521903829371676, 'epoch': 5.0})

In [6]:
# Score the current model on the evaluation split given to the Trainer;
# the accuracy entry comes from compute_metrics.
trainer.evaluate()

{'eval_loss': 0.7277504801750183,
 'eval_accuracy': 0.6557632398753894,
 'eval_runtime': 14.444,
 'eval_samples_per_second': 44.448,
 'eval_steps_per_second': 2.839,
 'epoch': 5.0}

In [23]:
# Reload the tokenized evaluation set from disk; this rebinds
# tokenized_dataset, which previously held the train/test split.
tokenized_dataset = Dataset.load_from_disk("data/tokenized_eval_agg_bert_base_uncased")

In [None]:
# Rebuild the ID list with the same steps that produced the tokenized
# evaluation set (concat -> drop duplicate tweets -> group by ID), so that
# prediction i is paired with the ID of dataset row i.
# groupby returns its keys in sorted order, which is not the order in which
# IDs appear in the raw files; concatenating per-file unique() values would
# therefore attach predictions to the wrong IDs.
eval_raw = pd.concat((pd.read_csv("data/eval_tweets/" + f)
                      for f in os.listdir("data/eval_tweets")),
                     ignore_index=True)
eval_ids = (eval_raw.drop_duplicates(subset="Tweet")
            .groupby("ID")
            .size()
            .index
            .to_numpy())

# Predicted class = arg-max over the model's output scores for each example.
predicted_labels = trainer.predict(tokenized_dataset).predictions.argmax(-1)

# Fail loudly if the dataset on disk was built from different files.
if len(eval_ids) != len(predicted_labels):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(eval_ids)} IDs; "
        "the tokenized evaluation set does not match data/eval_tweets."
    )

df = pd.DataFrame({"ID": eval_ids, "EventType": predicted_labels})
df.to_csv("data/bert_agg_pred.csv", index=False)