In [1]:
import sys

# Put the directory two levels above the working directory on the module search
# path so that the `utils` package imported in the next cell can be found.
sys.path.append("../..")

In [2]:
import pandas as pd
import warnings
from utils.data_loader import load_dataset
from bs4 import BeautifulSoup
from utils.consts import EMBEDDERS

# Blanket filter: every warning raised from here on is suppressed (no category or
# module restriction is given).
# NOTE(review): this also hides deprecation and pandas warnings — consider narrowing.
warnings.filterwarnings("ignore")

In [3]:
# Location of the posts dump, relative to the notebook's working directory.
POSTS_PATH = "../../data/Posts.xml"

posts = load_dataset(filepath=POSTS_PATH)
posts.shape

Data loaded


(10000, 22)

In [4]:
# Peek at one randomly chosen post; no random_state is set, so the row differs per run.
posts.sample(n=1)

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastEditorUserId,LastEditDate,...,Tags,AnswerCount,CommentCount,ClosedDate,CommunityOwnedDate,ContentLicense,ParentId,OwnerDisplayName,LastEditorDisplayName,FavoriteCount
2871,8631,1,8650.0,2010-09-30T20:00:41.227,14,13771.0,<p>When you are defining a function/variable/e...,1785.0,-1.0,2020-06-16T10:01:49.770,...,|naming|variables|functions|,18.0,11,2015-04-05T15:17:02.313,2012-02-10T16:03:21.110,CC BY-SA 3.0,,,,


In [5]:
# Show the labels of all columns in the loaded frame.
posts.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'CreationDate', 'Score',
       'ViewCount', 'Body', 'OwnerUserId', 'LastEditorUserId', 'LastEditDate',
       'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
       'ClosedDate', 'CommunityOwnedDate', 'ContentLicense', 'ParentId',
       'OwnerDisplayName', 'LastEditorDisplayName', 'FavoriteCount'],
      dtype='object')

The ```Body``` column contains raw HTML. Before applying any model, we need to strip the HTML tags from it.

For this, we will use the ```BeautifulSoup``` library.

In [6]:
# Discard rows whose Body is missing: the next cell applies an HTML-to-text
# conversion to every Body value and won't work otherwise.
posts = posts.dropna(axis="index", subset=["Body"])

In [7]:
def html_to_str(row_html: str) -> str:
    """Convert an HTML fragment to plain text.

    Args:
        row_html: Raw HTML markup of a single post body.

    Returns:
        The text content of the markup with the tags removed; the individual
        text pieces are joined with a single space.
    """
    soup = BeautifulSoup(row_html, "html.parser")
    return soup.get_text(separator=" ")


# Rows with a missing Body were already dropped in the previous cell, so no
# missing values reach html_to_str here.
posts["Body"] = posts["Body"].apply(html_to_str)

Most columns are of no value. We will keep only those that are potentially useful.

In [8]:
# Columns retained for the rest of the analysis; every other column is discarded.
columns_to_keep = [
    "Body", "ViewCount", "CreationDate", "PostTypeId",
    "Score", "Tags", "Title",
]
posts = posts[columns_to_keep]

In [9]:
# Split the posts by type: PostTypeId 1 goes to `questions`, 2 goes to `answers`.
# A CreationDate cut-off could be applied to `posts` here before splitting, if needed.
post_type = posts["PostTypeId"]
questions = posts[post_type == 1]
answers = posts[post_type == 2]

### Predict question ```Score``` based on the ```Body```'s embedding

In [10]:
from train import estimate_embedder

In [11]:
# Evaluate on the first 1,000 questions only. Slice first and copy afterwards so
# that only the rows actually used are duplicated, not the whole frame.
questions_subset = questions.head(1_000).copy()

# Per-embedder results for questions live under a dedicated name so they are not
# lost when `models_mae` is rebound by a later benchmark cell. `models_mae` stays
# bound to the same list for backward compatibility.
questions_models_mae = models_mae = []
for model in EMBEDDERS:
    # estimate_embedder returns the value reported below as `mae`
    # (presumably the mean absolute error on its held-out split — confirm in train.py).
    mae = estimate_embedder(data=questions_subset, model_name=model)
    questions_models_mae.append({"model": model, "mae": mae})
    print(f"{model=} {mae=}")

Encoding train data with Albert: 100%|██████████| 800/800 [00:58<00:00, 13.77it/s]
Encoding test data with Albert: 100%|██████████| 200/200 [00:14<00:00, 13.74it/s]


model='Albert' mae=184.9331


Encoding train data with Roberta: 100%|██████████| 800/800 [01:33<00:00,  8.57it/s]
Encoding test data with Roberta: 100%|██████████| 200/200 [00:23<00:00,  8.63it/s]


model='Roberta' mae=140.1873


Encoding train data with DistilBert: 100%|██████████| 800/800 [01:24<00:00,  9.47it/s]
Encoding test data with DistilBert: 100%|██████████| 200/200 [00:22<00:00,  8.84it/s]


model='DistilBert' mae=161.0058


Encoding train data with MiniLM1: 100%|██████████| 800/800 [00:25<00:00, 31.27it/s]
Encoding test data with MiniLM1: 100%|██████████| 200/200 [00:08<00:00, 24.71it/s]


model='MiniLM1' mae=38.5727


Encoding train data with MiniLM2: 100%|██████████| 800/800 [00:39<00:00, 20.27it/s]
Encoding test data with MiniLM2: 100%|██████████| 200/200 [00:10<00:00, 18.55it/s]


model='MiniLM2' mae=35.6215


Encoding train data with MiniLM3: 100%|██████████| 800/800 [00:10<00:00, 74.29it/s]
Encoding test data with MiniLM3: 100%|██████████| 200/200 [00:02<00:00, 72.78it/s]

model='MiniLM3' mae=37.9715





In [12]:
# Evaluate on the first 1,000 answers only. Slice first and copy afterwards so
# that only the rows actually used are duplicated, not the whole frame.
answers_subset = answers.head(1_000).copy()

# Per-embedder results for answers get their own name so they can be told apart
# from any other benchmark run. `models_mae` stays bound to the same list for
# backward compatibility.
answers_models_mae = models_mae = []
for model in EMBEDDERS:
    # estimate_embedder returns the value reported below as `mae`
    # (presumably the mean absolute error on its held-out split — confirm in train.py).
    mae = estimate_embedder(data=answers_subset, model_name=model)
    answers_models_mae.append({"model": model, "mae": mae})
    print(f"{model=} {mae=}")

Encoding train data with Albert: 100%|██████████| 800/800 [00:47<00:00, 16.99it/s]
Encoding test data with Albert: 100%|██████████| 200/200 [00:11<00:00, 17.89it/s]


model='Albert' mae=182.9839


Encoding train data with Roberta: 100%|██████████| 800/800 [01:08<00:00, 11.67it/s]
Encoding test data with Roberta: 100%|██████████| 200/200 [00:16<00:00, 12.23it/s]


model='Roberta' mae=156.9618


Encoding train data with DistilBert: 100%|██████████| 800/800 [01:06<00:00, 11.98it/s]
Encoding test data with DistilBert: 100%|██████████| 200/200 [00:15<00:00, 13.17it/s]


model='DistilBert' mae=154.2511


Encoding train data with MiniLM1: 100%|██████████| 800/800 [00:21<00:00, 37.30it/s]
Encoding test data with MiniLM1: 100%|██████████| 200/200 [00:05<00:00, 38.48it/s]


model='MiniLM1' mae=36.2843


Encoding train data with MiniLM2: 100%|██████████| 800/800 [00:33<00:00, 23.76it/s]
Encoding test data with MiniLM2: 100%|██████████| 200/200 [00:08<00:00, 24.37it/s]


model='MiniLM2' mae=39.3906


Encoding train data with MiniLM3: 100%|██████████| 800/800 [00:09<00:00, 81.57it/s]
Encoding test data with MiniLM3: 100%|██████████| 200/200 [00:02<00:00, 80.26it/s]

model='MiniLM3' mae=35.9594



