In [1]:
import os
import re
import pandas as pd
import numpy as np
import fasttext
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Apply Word Embeddings

In [3]:
embedding_model = fasttext.load_model("data/fasttext_model.bin")

In [4]:
def get_embedding(tweet, embedding_model):
    tweet = preprocess_tweet(tweet)
    words = tweet.split()
    word_vectors = [embedding_model[word]
                    for word in words if word in embedding_model]
    # If no words in the tweet are in the vocabulary, return a zero vector
    if not word_vectors:
        return np.zeros(embedding_model.get_dimension())
    return np.array(word_vectors)

# Load Data

In [5]:
class TweetDataset(Dataset):
    def __init__(self, df, embedding_model):
        self.df = df
        self.embedding_model = embedding_model
        self.embeddings = df["Tweet"].apply(
            lambda tweet: get_embedding(tweet, embedding_model)).values
        self.labels = df["EventType"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return (torch.tensor(self.embeddings[idx]),
                torch.tensor(self.labels[idx]))

In [6]:
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        df = pd.read_csv(os.path.join(folder, filename))
        df.drop(columns=["MatchID", "PeriodID", "Timestamp"],
                inplace=True)
        df.drop_duplicates(subset="Tweet", inplace=True)
        df["Tweet"] = df["Tweet"].apply(preprocess_tweet)
        data.append(df)
    return pd.concat(data)

all_data = load_data("data/train_tweets/")
train_df, eval_df = train_test_split(all_data, test_size=0.2)

train_dataset = TweetDataset(train_df, embedding_model)
eval_dataset = TweetDataset(eval_df, embedding_model)

# Pad and Collate the Data

In [7]:
max_len = max(len(embedding) for embedding, _ in train_dataset)

In [8]:
def pad_collate(batch):
    embeddings, labels = zip(*batch)
    padded_embeddings = [
        torch.cat((embedding, torch.zeros(max_len - len(embedding),
            embedding.shape[1]))) if embedding.dim() > 1
        else torch.cat((embedding.unsqueeze(0),
                        torch.zeros(max_len - 1, len(embedding))))
        for embedding in embeddings]
    return (torch.stack(padded_embeddings).float(),
            torch.stack(labels).float())

In [9]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

# Define Model

In [6]:
class TweetClassifier(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(TweetClassifier, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        attn_output, _ = self.multihead_attn(x, x, x)
        attn_output = attn_output.mean(dim=1)
        out = self.fc(attn_output)
        return self.sigmoid(out)

In [11]:
model = TweetClassifier(embedding_model.get_dimension(), 8)

# Train Model

In [12]:
device = torch.device("cuda" if torch.cuda.is_available()
                      else "cpu")
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, eval_loader,
                criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
                embeddings, labels = batch
                embeddings, labels = embeddings.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(embeddings).squeeze()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

        eval_loss = 0.0
        model.eval()
        with torch.no_grad():
            for batch in eval_loader:
                embeddings, labels = batch
                embeddings, labels = embeddings.to(device), labels.to(device)
                outputs = model(embeddings).squeeze()
                loss = criterion(outputs, labels)
                eval_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {train_loss/len(train_loader)}, "
              f"Eval Loss: {eval_loss/len(eval_loader)}")

train_model(model, train_loader, eval_loader, criterion, optimizer)
torch.save(model.state_dict(),
           f"data/fasttext_attention_{embedding_model.get_dimension()}.pt")

Epoch 1/10, Train Loss: 0.6629794880949829, Eval Loss: 0.6612436269283453
Epoch 2/10, Train Loss: 0.6624214491535848, Eval Loss: 0.662690996077038
Epoch 3/10, Train Loss: 0.6618990440690388, Eval Loss: 0.6619065216262247
Epoch 4/10, Train Loss: 0.6619244266121287, Eval Loss: 0.6614277009279982
Epoch 5/10, Train Loss: 0.6620126524577056, Eval Loss: 0.6622039313732904
Epoch 6/10, Train Loss: 0.6615502313052863, Eval Loss: 0.6623980000123147
Epoch 7/10, Train Loss: 0.6617466436981575, Eval Loss: 0.6611113632739634
Epoch 8/10, Train Loss: 0.6612460567713732, Eval Loss: 0.6608686968832779
Epoch 9/10, Train Loss: 0.6617135113695879, Eval Loss: 0.6613994423550141
Epoch 10/10, Train Loss: 0.6615982409600434, Eval Loss: 0.6622519856007221


# Evaluate Model

In [None]:
def evaluate_model(model, eval_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in eval_loader:
            embeddings, labels = batch
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings).squeeze()
            preds = (outputs > 0.5).float()
            all_preds.extend(list(preds.cpu()))
            all_labels.extend(list(labels.cpu()))

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy}")

evaluate_model(model, eval_loader)

Validation Accuracy: 0.5638585168925235


# Get Predictions

In [7]:
embedding_model = fasttext.load_model("data/fasttext_model.bin")
model = TweetClassifier(embedding_model.get_dimension(), 8)
model.load_state_dict(torch.load(
    f"data/fasttext_attention_{embedding_model.get_dimension()}.pt",
    weights_only=True))
model.eval()

TweetClassifier(
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
  )
  (fc): Linear(in_features=200, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [8]:
class TestDataset(Dataset):
    def __init__(self, df, embedding_model):
        self.df = df
        self.embedding_model = embedding_model
        self.embeddings = df["Tweet"].apply(
            lambda tweet: get_embedding(tweet, embedding_model)).values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return torch.tensor(self.embeddings[idx])

In [9]:
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        df = pd.read_csv(os.path.join(folder, filename))
        df.drop(columns=["MatchID", "PeriodID", "Timestamp"],
                inplace=True)
        df.drop_duplicates(subset="Tweet", inplace=True)
        df["Tweet"] = df["Tweet"].apply(preprocess_tweet)
        data.append(df)
    return pd.concat(data)

In [10]:
train_df = load_data("data/train_tweets/")
train_dataset = TestDataset(train_df, embedding_model)

In [10]:
max_len = 200

In [11]:
def pad_collate(embeddings):
    padded_embeddings = [
        torch.cat((embedding, torch.zeros(max_len - len(embedding),
            embedding.shape[1]))) if embedding.dim() > 1
        else torch.cat((embedding.unsqueeze(0),
                        torch.zeros(max_len - 1, len(embedding))))
        for embedding in embeddings]
    return torch.stack(padded_embeddings).float()

In [14]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

In [12]:
def get_preds(model, test_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for embeddings in test_loader:
            preds = model(embeddings).squeeze()
            if preds.dim() != 0:
                all_preds.extend(preds.cpu().numpy())
            else:
                all_preds.extend([0.0])
    return all_preds

In [None]:
train_df["Confidence"] = get_preds(model, train_loader)
train_df.drop(columns=["Tweet"], inplace=False).to_csv(
    "data/fasttext_attention_train.csv", index=False)

In [14]:
test_df = load_data("data/eval_tweets/")
test_dataset = TestDataset(test_df, embedding_model)

In [15]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

In [16]:
test_df["Confidence"] = get_preds(model, test_loader)
test_df.drop(columns=["Tweet"], inplace=False).to_csv(
    "data/fasttext_attention_test.csv", index=False)