In [1]:
import sys
sys.path.append("../..") # to make utils importable

In [2]:
import pandas as pd
import warnings
from utils.data_loader import load_dataset
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")

In [3]:
posts = load_dataset(filepath="../../data/Posts.xml")
posts.shape

Data loaded


(244066, 22)

In [4]:
posts.sample(1)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ClosedDate,CommunityOwnedDate,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount
19024,45370,2,,2011-02-08T08:50:26.887,1,,<p>I don't take your arguments: </p>\n\n<block...,16349.0,,,...,,,2,,,CC BY-SA 2.5,45150.0,,,


In [5]:
posts.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreationDate', 'Score',
       'ViewCount', 'Body', 'OwnerUserId', 'LastEditorUserId', 'LastEditDate',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'ClosedDate', 'CommunityOwnedDate', 'ContentLicense', 'ParentId',
       'OwnerDisplayName', 'LastEditorDisplayName', 'FavoriteCount'],
      dtype='object')

The column ```Body``` is a raw HTML code. Before applying any model, we need to clean it from redundant tags.

For this, we will use a library ```BeautifulSoup```

In [6]:
posts = posts.dropna(subset=["Body"], axis=0) # next cell won't work otherwise

In [7]:
def html_to_str(row_html: str) -> str:
    soup = BeautifulSoup(row_html, "html.parser")
    return soup.get_text(separator=' ')

posts["Body"] = posts["Body"].apply(html_to_str)

Most columns are of no value. We will keep only those that are potentilly useful.

In [8]:
columns_to_keep = ["Body", "ViewCount", "CreationDate", "PostTypeId", "Score", "Tags", "Title"]
posts = posts[columns_to_keep]

In [9]:
# posts = posts[posts.CreationDate >= ...] # may be useful
questions = posts[posts.PostTypeId == 1]
answers = posts[posts.PostTypeId == 2]

### Predict question ```Score``` based on the ```Body```'s embedding

In [10]:
questions_subset = questions.copy()[:10_000]

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sentence_transformers import SentenceTransformer

import numpy as np

from utils.consts import EMBEDDERS

from tqdm import tqdm
tqdm.pandas()

In [12]:
bodies = questions_subset["Body"].tolist()
y = questions_subset["Score"].values
train_bodies, test_bodies, y_train, y_test = train_test_split(bodies, y, test_size=0.2, random_state=1200)

In [13]:
def estimate_embedder(model_name: str) -> float:
    """
    Estimates the performance of a linear regression model using embeddings
    generated by a specified SentenceTransformer model from Hugging Face.

    Args:
        model_name (str): The name of the model to be used for generating embeddings.

    Returns:
        float: The Mean Absolute Error (MAE) of the linear regression model
               on the test set.
    """
    # Download a model from Hugging Face using its name
    selected_model = EMBEDDERS[model_name]
    embedder = SentenceTransformer(selected_model)                

    X_train = [embedder.encode(body) for body in tqdm(train_bodies, desc=f"Encoding posts with {model_name}")]
    X_train = np.array(X_train)

    # Create and fit a simple linear regression model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    X_test = [embedder.encode(body) for body in tqdm(test_bodies, desc=f"Encoding posts with {model_name}")]
    X_test = np.array(X_test)

    y_pred = regressor.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

In [14]:
models_mae = []
for model in EMBEDDERS:
    mae = round(estimate_embedder(model_name=model), 3)
    models_mae.append({"model": model, "mae": mae})
    print(f"{model=} {mae=}")

Encoding posts with Albert: 100%|██████████| 8000/8000 [09:59<00:00, 13.34it/s]
Encoding posts with Albert: 100%|██████████| 2000/2000 [02:12<00:00, 15.10it/s]


model='Albert' mae=np.float64(21.198)


Encoding posts with Roberta: 100%|██████████| 8000/8000 [17:07<00:00,  7.79it/s]
Encoding posts with Roberta: 100%|██████████| 2000/2000 [03:46<00:00,  8.82it/s]


model='Roberta' mae=np.float64(20.627)


Encoding posts with DistilBert: 100%|██████████| 8000/8000 [16:16<00:00,  8.19it/s]
Encoding posts with DistilBert: 100%|██████████| 2000/2000 [04:23<00:00,  7.58it/s]


model='DistilBert' mae=np.float64(20.812)


Encoding posts with MiniLM1: 100%|██████████| 8000/8000 [05:11<00:00, 25.68it/s]
Encoding posts with MiniLM1: 100%|██████████| 2000/2000 [01:08<00:00, 29.26it/s]


model='MiniLM1' mae=np.float64(19.32)


Encoding posts with MiniLM2: 100%|██████████| 8000/8000 [07:39<00:00, 17.40it/s]
Encoding posts with MiniLM2: 100%|██████████| 2000/2000 [01:53<00:00, 17.63it/s]


model='MiniLM2' mae=np.float64(18.792)


Encoding posts with MiniLM3: 100%|██████████| 8000/8000 [02:13<00:00, 60.10it/s]
Encoding posts with MiniLM3: 100%|██████████| 2000/2000 [00:34<00:00, 58.14it/s]

model='MiniLM3' mae=np.float64(19.112)



