## Using sentence Transformer for vector embeddings

## Data preparation and loading

In [1]:
import pandas as pd
import re
from urllib.parse import urlparse

In [2]:
filtered_bookmarks = pd.read_csv('filtered_bookmarks.csv')

In [3]:
df_clean = filtered_bookmarks.copy()

In [4]:
def link_to_tokens(link):
    """Extracts tokens from the URL path and domain."""
    if pd.isna(link):
        return ""
    
    # 1. Parse the URL
    parsed = urlparse(link)
    
    # 2. Get the clean domain (root.com)
    domain = parsed.netloc.replace('www.', '')
    
    # 3. Get path tokens (split by '/' and '-')
    path_tokens = re.split(r'[/_.-]', parsed.path)
    
    # Combine domain and path tokens into a list of words
    all_tokens = [domain] + [token for token in path_tokens if token]
    
    return ' '.join(all_tokens)

In [5]:
# Create the new combined feature column
df_clean['link_tokens'] = df_clean['link'].apply(link_to_tokens)

# Combine title and link tokens into one text column per bookmark
df_clean['combined_text'] = df_clean['title'] + ' ' + df_clean['link_tokens']

In [6]:
df_clean.sample(n=5)

Unnamed: 0,title,link,directory,leaf_dir,link_tokens,combined_text
154,The “AI 2027” Scenario: How realistic is it?,https://garymarcus.substack.com/p/the-ai-2027-...,coding > machineLearning > articles,articles,garymarcus.substack.com p the ai 2027 scenario...,The “AI 2027” Scenario: How realistic is it? g...
282,How to add a landing page to a mkdocs doc site...,https://stackoverflow.com/questions/63438788/h...,coding > webDevelopment > selected,selected,stackoverflow.com questions 63438788 how to ad...,How to add a landing page to a mkdocs doc site...
69,"SciPost: SciPost Chem. 1, 005 (2022) - Crystal...",https://scipost.org/10.21468/SciPostChem.1.2.005,work > MS_thesis > MatSciPaper,MatSciPaper,scipost.org 10 21468 SciPostChem 1 2 005,"SciPost: SciPost Chem. 1, 005 (2022) - Crystal..."
374,Dataset - Catalog,https://catalog.data.gov/dataset/?tags=fruits-...,projects > vegDataset,vegDataset,catalog.data.gov dataset,Dataset - Catalog catalog.data.gov dataset
18,Xingang Zhao - Google Scholar,https://scholar.google.com/citations?user=Q0DQ...,work > material_science > scientists,scientists,scholar.google.com citations,Xingang Zhao - Google Scholar scholar.google.c...


## Vector Embeddings with Sentence Transformer

In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')

X_dense = model.encode(df_clean['combined_text'].tolist(), show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
2025-12-07 17:37:18.681756: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Batches: 100%|██████████| 13/13 [00:00<00:00, 18.01it/s]


## Target Variable Encoding (y)

considered the `leaf_dir` as the target variable for classification

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y = label_encoder.fit_transform(df_clean['leaf_dir'])

In [9]:
label_encoder.classes_

array(['AItools', 'C/C++', 'DFTtools', 'DSA', 'MLmaterialsScience',
       'MatSciConcepts', 'MatSciPaper', 'articles',
       'bandStructurePlotCodes', 'finance', 'learn',
       'libraries/tools/models', 'linux / shell', 'materialsDatabase',
       'people/organizations', 'plots', 'scientists', 'selected',
       'vegDataset', 'webDevelopment'], dtype=object)

In [10]:
y

array([ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  0,  0,
        0,  0,  0,  0,  0

## train-test split

In [11]:
from sklearn.model_selection import train_test_split

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_dense, y,
    test_size=0.2,
    random_state=69,
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.125,  # 10% of total for validation
    random_state=69,
    stratify=y_train_val
)


In [12]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(276, 384)
(40, 384)
(80, 384)


In [13]:
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(276,)
(40,)
(80,)


In [17]:
X_train

array([[-0.09973808,  0.02342485, -0.05863123, ...,  0.02494417,
         0.03866109, -0.04173507],
       [-0.10261237,  0.07778404, -0.00564668, ...,  0.04064304,
        -0.04133896,  0.02158372],
       [-0.07807889,  0.05194126, -0.04271359, ...,  0.04606555,
        -0.08535463, -0.03997073],
       ...,
       [-0.07273512, -0.02523563,  0.06813152, ..., -0.0017187 ,
        -0.00446138,  0.02552423],
       [-0.07065745, -0.0018605 ,  0.0004698 , ..., -0.05855493,
         0.02214863,  0.04297204],
       [-0.08604132,  0.04831516,  0.01137779, ...,  0.02129088,
        -0.03249979, -0.09900051]], dtype=float32)

## Model Building and Evaluation

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [15]:
results = {}

In [16]:
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)
y_pred_mnb_val = mnb_model.predict(X_val)
results['MNB'] = accuracy_score(y_val, y_pred_mnb_val)

ValueError: Negative values in data passed to MultinomialNB (input X).

In [18]:
param_grid = {
    'C': [0.1, 1.0, 10, 50, 100]
}

lr_grid = GridSearchCV(
    LogisticRegression(solver='lbfgs', max_iter=1000, random_state=69),
    param_grid,
    cv=3, # use 3-fold cross-validation on the training set
    scoring='f1_weighted', # F1-score for better evaluation
    verbose=1
)

lr_grid.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


0,1,2
,estimator,LogisticRegre...ndom_state=69)
,param_grid,"{'C': [0.1, 1.0, ...]}"
,scoring,'f1_weighted'
,n_jobs,
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,100
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,69
,solver,'lbfgs'
,max_iter,1000


In [19]:
print(f"Best C value: {lr_grid.best_params_}")
print(f"Best F1-Score on Train/CV: {lr_grid.best_score_:.4f}")

Best C value: {'C': 100}
Best F1-Score on Train/CV: 0.5715


In [20]:
best_lr_model = lr_grid.best_estimator_
val_pred = best_lr_model.predict(X_val)

In [21]:
results['LR (C=100)'] = accuracy_score(y_val, val_pred)

In [22]:
param_grid_svc = {'C': [0.1, 1.0, 10, 50]}
svc_grid = GridSearchCV(
    LinearSVC(random_state=42, max_iter=5000),
    param_grid_svc,
    cv=3,
    scoring='f1_weighted',
    verbose=1
)

svc_grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


0,1,2
,estimator,LinearSVC(max...ndom_state=42)
,param_grid,"{'C': [0.1, 1.0, ...]}"
,scoring,'f1_weighted'
,n_jobs,
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,10
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [23]:
print(f"Best C value: {svc_grid.best_params_}")
print(f"Best F1-Score on Train/CV: {svc_grid.best_score_:.4f}")

Best C value: {'C': 10}
Best F1-Score on Train/CV: 0.5836


In [24]:
best_svc_model = svc_grid.best_estimator_
svc_val_pred = best_svc_model.predict(X_val)

In [25]:
results['LinearSVC (C=10)'] = accuracy_score(y_val, svc_val_pred)

In [26]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    random_state=69,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

y_pred_rf_val = rf_model.predict(X_val)

results['RF'] = accuracy_score(y_val, y_pred_rf_val)

In [27]:
results

{'LR (C=100)': 0.525, 'LinearSVC (C=10)': 0.575, 'RF': 0.475}

Linear SVC got the best accuracy among all models.

## More Feature Engineering needed

### Incorporate explicit domain signal (High Impact)

In [28]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [29]:
# 1. Extract Root Domain
def extract_root_domain(link):
    if pd.isna(link):
        return 'unknown'
    # Extracts netloc, removes 'www.', then gets the root domain (e.g., 'google.com')
    netloc = urlparse(link).netloc.replace('www.', '').split(':')[0]
    return '.'.join(netloc.split('.')[-2:])

In [30]:
df_clean['root_domain'] = df_clean['link'].apply(extract_root_domain)

In [31]:
df_clean.sample(n=5)

Unnamed: 0,title,link,directory,leaf_dir,link_tokens,combined_text,root_domain
244,Stroustrup: C++ Glossary,https://www.stroustrup.com/glossary.html,coding > C/C++,C/C++,stroustrup.com glossary html,Stroustrup: C++ Glossary stroustrup.com glossa...,stroustrup.com
329,seaborn: statistical data visualization — seab...,https://seaborn.pydata.org/index.html,coding > plots,plots,seaborn.pydata.org index html,seaborn: statistical data visualization — seab...,pydata.org
367,Vegetables Object Detection Model by Vegetables,https://universe.roboflow.com/vegetables/veget...,projects > vegDataset,vegDataset,universe.roboflow.com vegetables vegetables el4g6,Vegetables Object Detection Model by Vegetable...,roboflow.com
37,QijingZheng/pyband: band plot using python mat...,https://github.com/QijingZheng/pyband,work > MS_thesis > bandStructurePlotCodes,bandStructurePlotCodes,github.com QijingZheng pyband,QijingZheng/pyband: band plot using python mat...,github.com
62,tobycrisford.github.io/bravais-lattice-fermi-s...,https://tobycrisford.github.io/bravais-lattice...,work > MS_thesis > DFTtools,DFTtools,tobycrisford.github.io bravais lattice fermi s...,tobycrisford.github.io/bravais-lattice-fermi-s...,github.io


### One Hot encode top domains

In [32]:
top_domains = df_clean['root_domain'].value_counts().nlargest(15).index

In [33]:
df_clean.sample(n=5)

Unnamed: 0,title,link,directory,leaf_dir,link_tokens,combined_text,root_domain
175,[blog] on machine learning concepts,https://colah.github.io/,coding > machineLearning > learn,learn,colah.github.io,[blog] on machine learning concepts colah.gith...,github.io
178,Physics: a survey of machine learning applicat...,https://www.frontiersin.org/journals/physics/a...,coding > machineLearning > learn,learn,frontiersin.org journals physics articles 10 3...,Physics: a survey of machine learning applicat...,frontiersin.org
227,Perplexity,https://www.perplexity.ai/,coding > machineLearning > AItools,AItools,perplexity.ai,Perplexity perplexity.ai,perplexity.ai
382,Vegetable Classification Using Transfer Learning,https://www.kaggle.com/code/theeyeschico/veget...,projects > vegDataset,vegDataset,kaggle.com code theeyeschico vegetable classif...,Vegetable Classification Using Transfer Learni...,kaggle.com
107,jack dorsey - Google Search,https://www.google.com/search?gs_ssp=eJzj4tLP1...,coding > machineLearning > people/organizations,people/organizations,google.com search,jack dorsey - Google Search google.com search,google.com


In [34]:
df_clean.shape

(396, 7)

7 features. Serial no/index is not included.

In [35]:
df_clean.columns

Index(['title', 'link', 'directory', 'leaf_dir', 'link_tokens',
       'combined_text', 'root_domain'],
      dtype='object')

In [36]:
# Prepare data for OHE: replace non-top domains with 'other'
domain_series = df_clean['root_domain'].where(
    df_clean['root_domain'].isin(top_domains), 
    'other_domain'
)

In [37]:
domain_categories = domain_series.values.reshape(-1, 1) 

# Create sparse domain feature matrix
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
X_domain_ohe = ohe.fit_transform(domain_categories)

In [38]:
X_combined = hstack([X_dense, X_domain_ohe])
X_final = X_combined.toarray()

### Re-splitting train-test

In [39]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_final, y,
    test_size=0.2,
    random_state=69,
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.125,  # 10% of total for validation
    random_state=69,
    stratify=y_train_val
)


### Re-evaluation of models after adding domain features

In [40]:
param_grid = {
    'C': [0.1, 1.0, 10, 50, 100]
}

lr_grid = GridSearchCV(
    LogisticRegression(solver='lbfgs', max_iter=1000, random_state=69),
    param_grid,
    cv=3, # use 3-fold cross-validation on the training set
    scoring='f1_weighted', # F1-score for better evaluation
    verbose=1
)

lr_grid.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


0,1,2
,estimator,LogisticRegre...ndom_state=69)
,param_grid,"{'C': [0.1, 1.0, ...]}"
,scoring,'f1_weighted'
,n_jobs,
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,50
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,69
,solver,'lbfgs'
,max_iter,1000


In [41]:
print(f"Best C value: {lr_grid.best_params_}")
print(f"Best F1-Score on Train/CV: {lr_grid.best_score_:.4f}")

Best C value: {'C': 50}
Best F1-Score on Train/CV: 0.5729


In [46]:
best_lr_model = lr_grid.best_estimator_
val_pred = best_lr_model.predict(X_val)

results['LR (ST+domain) (C=50)'] = accuracy_score(y_val, val_pred)

In [47]:
results

{'LR (C=100)': 0.525,
 'LinearSVC (C=10)': 0.575,
 'RF': 0.475,
 'LR (ST+domain) (C=50)': 0.625}

In [55]:
param_grid_svc = {'C': [0.1, 1.0, 5, 10, 50, 100, 500, 1000]}
svc_grid = GridSearchCV(
    LinearSVC(random_state=69, max_iter=10000),
    param_grid_svc,
    cv=3,
    scoring='f1_weighted',
    verbose=1
)

svc_grid.fit(X_train, y_train)

print(f"Best C value: {svc_grid.best_params_}")
print(f"Best F1-Score on Train/CV: {svc_grid.best_score_:.4f}")

best_svc_model = svc_grid.best_estimator_
y_pred_svc_val = best_svc_model.predict(X_val)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




Best C value: {'C': 5}
Best F1-Score on Train/CV: 0.5992


In [56]:
results['LinearSVC (ST+domain) (C=5)'] = accuracy_score(y_val, y_pred_svc_val)

In [57]:
results

{'LR (C=100)': 0.525,
 'LinearSVC (C=10)': 0.575,
 'RF': 0.475,
 'LR (ST+domain) (C=50)': 0.625,
 'LinearSVC (ST+domain) (C=5)': 0.55}

In [58]:
rf_model.fit(X_train, y_train)

y_pred_rf_val = rf_model.predict(X_val)

results['RF (ST+domain)'] = accuracy_score(y_val, y_pred_rf_val)

In [59]:
results

{'LR (C=100)': 0.525,
 'LinearSVC (C=10)': 0.575,
 'RF': 0.475,
 'LR (ST+domain) (C=50)': 0.625,
 'LinearSVC (ST+domain) (C=5)': 0.55,
 'RF (ST+domain)': 0.375}