In [1]:
import pandas as pd
from lxml.etree import XMLParser, parse
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Parse the XML file
p = XMLParser(huge_tree=True)
tree = parse('../data/Posts.xml', parser=p)

In [3]:
# Extract elements from the XML tree
root = tree.getroot()
data = []

for post in root.findall('row'):
    data.append(post.attrib)

# Conver to a pandas DataFrame
posts = pd.DataFrame(data)

In [4]:
posts.shape

(112485, 22)

In [5]:
posts.head(1)

Unnamed: 0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,ContentLicense,CreationDate,FavoriteCount,Id,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
0,57,2,<p>I've heard that I can get a lower interest ...,,0,,CC BY-SA 2.5,2009-10-06T03:02:46.713,0,2,...,admin,,bms1,,,1,13,|line-of-credit|,"What's better: An unsecured line of credit, or...",837


In [6]:
posts.columns

Index(['AcceptedAnswerId', 'AnswerCount', 'Body', 'ClosedDate', 'CommentCount',
       'CommunityOwnedDate', 'ContentLicense', 'CreationDate', 'FavoriteCount',
       'Id', 'LastActivityDate', 'LastEditDate', 'LastEditorDisplayName',
       'LastEditorUserId', 'OwnerDisplayName', 'OwnerUserId', 'ParentId',
       'PostTypeId', 'Score', 'Tags', 'Title', 'ViewCount'],
      dtype='object')

The column ```Body``` is a raw HTML code. Before applying any model, we need to clean it from redundant tags.

For this, we will use a library ```BeautifulSoup```

In [7]:
from bs4 import BeautifulSoup

def html_to_str(row_html: str) -> str:
    soup = BeautifulSoup(row_html, 'html.parser')
    return soup.get_text(separator=' ')

posts["Body"] = posts["Body"].apply(html_to_str)

Most columns are of no value. We will keep only those that are potentilly useful.

In [8]:
columns_to_keep = ['Body', 'ViewCount', 'Score', 'Tags', 'Title']
posts = posts[columns_to_keep]

### Predict ```Score``` based on the ```Body```'s embedding

In [9]:
posts_subset = posts.copy()
posts_subset = posts_subset[0:10_000]

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sentence_transformers import SentenceTransformer

import numpy as np

In [11]:
from tqdm import tqdm
tqdm.pandas()

In [12]:
# Embedder models
models = {
    'Albert': 'paraphrase-albert-small-v2',
    'Roberta': 'all-distilroberta-v1',
    'DistilBert': 'multi-qa-distilbert-cos-v1',
    'MiniLM1': 'all-MiniLM-L6-v2',
    'MiniLM3': 'all-MiniLM-L12-v2',
    'MiniLM4': 'paraphrase-MiniLM-L3-v2'
}

In [13]:
def estimate_embedder(model_name: str) -> float:
    """
    Estimates the performance of a linear regression model using embeddings
    generated by a specified SentenceTransformer model from Hugging Face.

    Args:
        model_name (str): The name of the model to be used for generating embeddings.

    Returns:
        float: The Mean Absolute Error (MAE) of the linear regression model
               on the test set.
    """
    # Download a model from Hugging Face using its name
    selected_model = models[model_name]
    embedder = SentenceTransformer(selected_model)

    bodies = posts_subset['Body'].tolist()

    X = []
    for body in tqdm(bodies, desc="Encoding posts"):
        # Encode each 'body' and append it to X
        encoded_body = embedder.encode(body)
        X.append(encoded_body)

    X = np.array(X)
    y = posts_subset['Score'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1200)

    # Create and fit a simple linear regression model
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Evaluate model performance
    MAE = mean_absolute_error(y_test, y_pred)
    return MAE

In [14]:
models_mae = []
for model in models.keys():
    mae = round(estimate_embedder(model_name=model), 3)
    models_mae.append({"model": model, "mae": mae})

Encoding posts: 100%|██████████| 10000/10000 [06:01<00:00, 27.69it/s]
Encoding posts: 100%|██████████| 10000/10000 [09:25<00:00, 17.70it/s]
Encoding posts: 100%|██████████| 10000/10000 [09:02<00:00, 18.43it/s]
Encoding posts: 100%|██████████| 10000/10000 [02:45<00:00, 60.30it/s]
Encoding posts: 100%|██████████| 10000/10000 [04:06<00:00, 40.54it/s]
Encoding posts: 100%|██████████| 10000/10000 [01:13<00:00, 136.95it/s]


In [15]:
for model_mae in models_mae:
    print(f"MODEL={model_mae["model"]}, MAE={model_mae["mae"]}")

MODEL=Albert, MAE=5.392
MODEL=Roberta, MAE=5.219
MODEL=DistilBert, MAE=5.2
MODEL=MiniLM1, MAE=5.229
MODEL=MiniLM3, MAE=4.857
MODEL=MiniLM4, MAE=5.053
