In [1]:
import pandas as pd
from lxml.etree import XMLParser, parse
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Parse the XML file
p = XMLParser(huge_tree=True)
tree = parse('../data/Posts.xml', parser=p)

In [3]:
# Extract elements from the XML tree
root = tree.getroot()
data = []

for post in root.findall('row'):
    data.append(post.attrib)

# Conver to a pandas DataFrame
posts = pd.DataFrame(data)

The column ```Body``` is a raw HTML code. Before applying any model, we need to clean it from redundant tags.

For this, we will use a library ```BeautifulSoup```

In [4]:
from bs4 import BeautifulSoup

def html_to_str(row_html: str) -> str:
    soup = BeautifulSoup(row_html, 'html.parser')
    return soup.get_text(separator=' ')

posts["Body"] = posts["Body"].apply(html_to_str)

In [5]:
columns_to_keep = ['Body', 'Tags']
posts = posts[columns_to_keep]

### Predict ```Tags``` based on the ```Body```'s embedding

In [6]:
posts_subset = posts.copy()
# posts_subset = posts_subset[0:10_000]

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sentence_transformers import SentenceTransformer

import numpy as np

In [8]:
from tqdm import tqdm
tqdm.pandas()

In [9]:
# Embedder models
models = {
    'Albert': 'paraphrase-albert-small-v2',
    'Roberta': 'all-distilroberta-v1',
    'DistilBert': 'multi-qa-distilbert-cos-v1',
    'MiniLM1': 'all-MiniLM-L6-v2',
    'MiniLM2': 'all-MiniLM-L12-v2',
    'MiniLM3': 'paraphrase-MiniLM-L3-v2'
}

In [10]:
import pickle
import collections.abc
from pathlib import Path

def dump_embeddings(models_name: collections.abc.Iterable) -> None:
    for model_name in models_name:

        # Check if embeddings are already dumped (using this model)
        embeddings_file = Path(f"embeddings/{model_name}_body.obj")
        if embeddings_file.is_file():
            # file exists
            print(f"Embeddings for {model_name} are already dumped.")
            continue

        # Download a model from Hugging Face using its name
        embedder = SentenceTransformer(models[model_name])

        bodies = posts_subset['Body'].tolist()

        X = []
        for body in tqdm(bodies, desc="Encoding posts"):
            # Encode each 'body' and append it to X
            encoded_body = embedder.encode(body)
            X.append(encoded_body)

        X = np.array(X)
        filehandler = open(f"embeddings/{model_name}_body.obj","wb")
        pickle.dump(X, filehandler)
        filehandler.close()
    

dump_embeddings(models.keys())

Embeddings for Albert are already dumped.
Embeddings for Roberta are already dumped.
Embeddings for DistilBert are already dumped.
Embeddings for MiniLM1 are already dumped.
Embeddings for MiniLM2 are already dumped.


Encoding posts: 100%|██████████| 112485/112485 [14:39<00:00, 127.85it/s]


# Create a target vector

Our target is a list of lists, where each nested list includes tags for a corresponding question

In [11]:
import pickle

# Load embeddings (make sure you dumped it before by running the previous cell)
#============================================================================#
# This is just an example (I chose a specific model).
# The entire code below should be wrapped in a single function.
#============================================================================#
file = open("embeddings/MiniLM3_body.obj",'rb')
X = pickle.load(file)
file.close()

In [12]:
y = posts_subset["Tags"] # this a list of str, where each str is in the specified format (delimiter = '|')
y = [str(str_of_tags).split('|')[1:-1] for str_of_tags in y] # this a list of lists

In [13]:
unique_tags = list(set([
    x
    for xs in y
    for x in xs
]))

print(f"unique_tags: {len(unique_tags)}")

unique_tags: 1010


### Split data into train and test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1200)

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

### Convert tags to multi-label format (One-Hot Encoding)

In [16]:
# Fit MultiLabelBinarizer on the full dataset (y) !!

mlb = MultiLabelBinarizer()
y_full_binary = mlb.fit_transform(y)

In [17]:
# Transform both train and test sets with the same mlb

y_train_binary = mlb.transform(y_train)
y_test_binary = mlb.transform(y_test)

### PyTorch Dataset

In [18]:
class QuestionTagDataset(Dataset):
    def __init__(self, embeddings, tags):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.tags = torch.tensor(tags, dtype=torch.float32)
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.tags[idx]

### Create dataset and dataloader

In [19]:
train_dataset = QuestionTagDataset(X_train, y_train_binary)
test_dataset = QuestionTagDataset(X_test, y_test_binary)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Simple Neural Network (4 FC layers)

In [20]:
class TagPredictorNN(nn.Module):
    def __init__(self, input_size, num_tags):
        super(TagPredictorNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, num_tags)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x)) # sigmoid is important here! (multi-label classification)
        return x

In [21]:
# input and output dimensions
input_size = X_train.shape[1]  # i.e. 384 (embedding size)
num_tags = len(unique_tags)

In [22]:
model = TagPredictorNN(input_size, num_tags)

### Training Loop

In [23]:
def train_model(model, dataloader, num_epochs=10):
    loss_fn = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(dataloader):
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/10:.4f}')
        running_loss = 0.0

### Training

This is a toy example. Increase the number of epochs by a factor of 10, and the dataset should be larger.

In [24]:
train_model(model, train_loader, num_epochs=10)

Epoch [1/50], Loss: 1.7473
Epoch [2/50], Loss: 0.6834
Epoch [3/50], Loss: 0.6183
Epoch [4/50], Loss: 0.5800
Epoch [5/50], Loss: 0.5535
Epoch [6/50], Loss: 0.5316
Epoch [7/50], Loss: 0.5132
Epoch [8/50], Loss: 0.4965
Epoch [9/50], Loss: 0.4796
Epoch [10/50], Loss: 0.4648
Epoch [11/50], Loss: 0.4507
Epoch [12/50], Loss: 0.4370
Epoch [13/50], Loss: 0.4242
Epoch [14/50], Loss: 0.4122
Epoch [15/50], Loss: 0.3999
Epoch [16/50], Loss: 0.3882
Epoch [17/50], Loss: 0.3777
Epoch [18/50], Loss: 0.3665
Epoch [19/50], Loss: 0.3567
Epoch [20/50], Loss: 0.3471
Epoch [21/50], Loss: 0.3385
Epoch [22/50], Loss: 0.3293
Epoch [23/50], Loss: 0.3206
Epoch [24/50], Loss: 0.3136
Epoch [25/50], Loss: 0.3060
Epoch [26/50], Loss: 0.2983
Epoch [27/50], Loss: 0.2919
Epoch [28/50], Loss: 0.2846
Epoch [29/50], Loss: 0.2787
Epoch [30/50], Loss: 0.2729
Epoch [31/50], Loss: 0.2674
Epoch [32/50], Loss: 0.2622
Epoch [33/50], Loss: 0.2572
Epoch [34/50], Loss: 0.2525
Epoch [35/50], Loss: 0.2467
Epoch [36/50], Loss: 0.2427
E

### Evaluation

In [27]:
from sklearn.metrics import jaccard_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_test, y_test, threshold=0.5):
    model.eval()

    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    with torch.no_grad():
        y_pred = model(X_test_tensor) # Forward pass

    # Threshold is applied to convert probabilities to binary predictions
    y_pred_binary = (y_pred > threshold).float().numpy()
    y_true_binary = y_test_tensor.numpy()

    # Compute different metrics...
    jaccard = jaccard_score(y_true_binary, y_pred_binary, average='samples')
    precision = precision_score(y_true_binary, y_pred_binary, average='samples')
    recall = recall_score(y_true_binary, y_pred_binary, average='samples')
    f1 = f1_score(y_true_binary, y_pred_binary, average='samples')

    print(f'Jaccard Index: {jaccard:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')


In [28]:
print("===TEST===")
evaluate_model(model, X_test, y_test_binary)
print("==========")
print("===TRAIN===")
evaluate_model(model, X_train, y_train_binary)
print("==========")

===TEST===
Jaccard Index: 0.0749
Precision: 0.1315
Recall: 0.1016
F1 Score: 0.1043
===TRAIN===
Jaccard Index: 0.2187
Precision: 0.2880
Recall: 0.2386
F1 Score: 0.2518
