In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
path = "../data/Posts.xml"
posts = pd.read_xml(path, parser="etree")

In [3]:
posts.shape

(244066, 22)

In [4]:
posts.sample(1)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ClosedDate,CommunityOwnedDate,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount
110476,210377,2,,2013-09-04T03:10:49.250,1,,<p>It is easier to bust things out and re-use ...,27161.0,,,...,,,0,,,CC BY-SA 3.0,210372.0,,,


In [5]:
posts.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreationDate', 'Score',
       'ViewCount', 'Body', 'OwnerUserId', 'LastEditorUserId', 'LastEditDate',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'ClosedDate', 'CommunityOwnedDate', 'ContentLicense', 'ParentId',
       'OwnerDisplayName', 'LastEditorDisplayName', 'FavoriteCount'],
      dtype='object')

The column ```Body``` is a raw HTML code. Before applying any model, we need to clean it from redundant tags.

For this, we will use a library ```BeautifulSoup```

In [6]:
posts = posts.dropna(subset=["Body"], axis=0) # next cell won't work otherwise

In [7]:
from bs4 import BeautifulSoup

def html_to_str(row_html: str) -> str:
    soup = BeautifulSoup(row_html, "html.parser")
    return soup.get_text(separator=' ')

posts["Body"] = posts["Body"].apply(html_to_str)

Most columns are of no value. We will keep only those that are potentilly useful.

In [8]:
columns_to_keep = ["Body", "ViewCount", "CreationDate", "PostTypeId", "Score", "Tags", "Title"]
posts = posts[columns_to_keep]

In [9]:
# posts = posts[posts.CreationDate >= ...] # may be useful
questions = posts[posts.PostTypeId == 1]
answers = posts[posts.PostTypeId == 2]

### Predict question ```Score``` based on the ```Body```'s embedding

In [10]:
questions_subset = questions.copy()[:10_000]

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sentence_transformers import SentenceTransformer

import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [12]:
# Embedder models
models = {
    'Albert': 'paraphrase-albert-small-v2',
    'Roberta': 'all-distilroberta-v1',
    'DistilBert': 'multi-qa-distilbert-cos-v1',
    'MiniLM1': 'all-MiniLM-L6-v2',
    'MiniLM2': 'all-MiniLM-L12-v2',
    'MiniLM3': 'paraphrase-MiniLM-L3-v2'
}

In [13]:
bodies = questions_subset["Body"].tolist()
y = questions_subset["Score"].values
train_bodies, test_bodies, y_train, y_test = train_test_split(bodies, y, test_size=0.2, random_state=1200)

In [14]:
def estimate_embedder(model_name: str, batch_size: int = 32) -> float:
    """
    Estimates the performance of a linear regression model using embeddings
    generated by a specified SentenceTransformer model from Hugging Face.

    Args:
        model_name (str): The name of the model to be used for generating embeddings.
        batch_size (int): Batch size.

    Returns:
        float: The Mean Absolute Error (MAE) of the linear regression model
               on the test set.
    """
    # Download a model from Hugging Face using its name
    selected_model = models[model_name]
    embedder = SentenceTransformer(selected_model)                

    X_train = [embedder.encode(body) for body in tqdm(train_bodies, desc=f"Encoding posts with {model_name}")]
    X_train = np.array(X_train)

    # Create and fit a simple linear regression model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    X_test = [embedder.encode(body) for body in tqdm(test_bodies, desc=f"Encoding posts with {model_name}")]
    X_test = np.array(X_test)

    y_pred = regressor.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

In [15]:
models_mae = []
for model in models:
    mae = round(estimate_embedder(model_name=model), 3)
    models_mae.append({"model": model, "mae": mae})

Encoding posts with Albert: 100%|██████████| 8000/8000 [10:28<00:00, 12.72it/s]
Encoding posts with Albert: 100%|██████████| 2000/2000 [02:23<00:00, 13.98it/s]
Encoding posts with Roberta: 100%|██████████| 8000/8000 [21:58<00:00,  6.07it/s]  
Encoding posts with Roberta: 100%|██████████| 2000/2000 [05:28<00:00,  6.09it/s]
Encoding posts with DistilBert: 100%|██████████| 8000/8000 [22:08<00:00,  6.02it/s]
Encoding posts with DistilBert: 100%|██████████| 2000/2000 [05:18<00:00,  6.29it/s]
Encoding posts with MiniLM1: 100%|██████████| 8000/8000 [06:08<00:00, 21.69it/s]
Encoding posts with MiniLM1: 100%|██████████| 2000/2000 [01:33<00:00, 21.46it/s]
Encoding posts with MiniLM2: 100%|██████████| 8000/8000 [09:44<00:00, 13.68it/s]
Encoding posts with MiniLM2: 100%|██████████| 2000/2000 [02:07<00:00, 15.65it/s]
Encoding posts with MiniLM3: 100%|██████████| 8000/8000 [02:59<00:00, 44.44it/s]
Encoding posts with MiniLM3: 100%|██████████| 2000/2000 [00:34<00:00, 58.01it/s]


In [16]:
for model_mae in models_mae:
    print(f"MODEL={model_mae['model']}, MAE={model_mae['mae']}")

MODEL=Albert, MAE=21.198
MODEL=Roberta, MAE=20.627
MODEL=DistilBert, MAE=20.812
MODEL=MiniLM1, MAE=19.32
MODEL=MiniLM2, MAE=18.792
MODEL=MiniLM3, MAE=19.112
