In [3]:
import json 

# Read the whole corpus into memory; the file is decoded as UTF-8 and
# parsed as JSON.
with open("../save/dataset.json", encoding="utf-8") as dataset_file:
    raw_json = dataset_file.read()
data = json.loads(raw_json)

In [4]:
import re
import unicodedata 

def preprocess(s: str) -> str:
    """Normalise raw text to a lowercase, ASCII-only, single-spaced string.

    Steps, in order:
      1. NFC-compose and lowercase the input.
      2. Replace every run of hyphen/dash characters with one space, so the
         joined parts remain separate words.
      3. NFKD-decompose and drop every code point that is not ASCII
         (accent folding: "é" -> "e"). Characters with no ASCII
         decomposition are removed entirely.
      4. Lowercase again (see the comment on that step).
      5. Collapse whitespace runs to a single space and strip both ends.

    Args:
        s: The text to normalise.

    Returns:
        The normalised text; may be empty.
    """
    # NFC merges base letters with their combining marks so that equivalent
    # spellings share one representation before lowercasing.
    s = unicodedata.normalize("NFC", s).lower()
    # Hyphens and dashes act as word separators: replace, don't delete.
    s = re.sub(r"[-‐-‒–—]+", " ", s)
    # Accent folding: decompose, then discard everything outside ASCII.
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # NFKD can expand caseless compatibility symbols into uppercase ASCII
    # (e.g. the trade mark sign becomes "TM"), which the first lower() could
    # not see, so lowercase once more.
    s = s.lower()
    # Collapse any whitespace run (including the spaces introduced above).
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [6]:
# One normalised string per record: the cleaned title followed by the
# cleaned body text, separated by a single space.
docs = [
    " ".join((preprocess(record['title']), preprocess(record['text'])))
    for record in data
]

In [7]:
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 

vec = TfidfVectorizer(
    # Text normalisation applied by the vectorizer itself.
    lowercase=True,
    strip_accents='unicode',
    # Tokens are runs of letters (no digits or underscores), optionally
    # followed by an apostrophe and more letters, e.g. "women's".
    token_pattern=r"(?u)\b[^\W\d_]+(?:'[^\W\d_]+)?\b",
    # Unigrams and bigrams.
    ngram_range=(1, 2),
    # Document-frequency bounds for keeping a term in the vocabulary.
    min_df=2,
    max_df=0.9,
    # L2-normalised rows, stored as float32.
    norm='l2',
    dtype=np.float32,
)

# Learn the vocabulary and build the document-term matrix in one pass.
X = vec.fit_transform(docs)

In [None]:
from sklearn.preprocessing import normalize

def search_tfidf(doc_ind: int, k: int = 20):
    """Print and return the documents most similar to ``data[doc_ind]``.

    Similarity is the cosine between L2-normalised rows of the module-level
    matrix ``X``. The query document itself takes part in the ranking, so it
    normally occupies one of the ``k`` slots; it is left out of the printed
    list but kept in the returned one.

    Args:
        doc_ind: Row of ``X`` (and index into ``data``) used as the query.
        k: Number of top-scoring documents to keep, query included. Values
            larger than the number of documents are clamped; values <= 0
            give an empty result.

    Returns:
        A list of ``(score, title)`` tuples sorted by descending score.
    """
    n_docs = X.shape[0]
    # Resolve a negative index so the query can be recognised in the ranking.
    query_idx = doc_ind + n_docs if doc_ind < 0 else doc_ind

    q = X[doc_ind]
    # copy=True so that running a query never rescales the shared matrix X.
    X_n = normalize(X, norm="l2", axis=1, copy=True)
    q_n = normalize(q, norm="l2", axis=1, copy=True)
    scores = (X_n @ q_n.T).toarray().ravel()

    # Clamp k: argpartition raises when k exceeds the array length, and a
    # slice of [-0:] would select every document instead of none.
    top_k = min(k, n_docs)
    if top_k > 0:
        idx = np.argpartition(scores, -top_k)[-top_k:]
        order = idx[np.argsort(scores[idx])[::-1]]
    else:
        order = np.empty(0, dtype=np.intp)

    print(f"Query: {data[doc_ind]['title']}\n")
    for j in order:
        # Skip the query by identity rather than by rank: with tied scores
        # (duplicate documents, all-zero rows) it is not guaranteed to be
        # the first hit.
        if j == query_idx:
            continue
        print(f"{data[j]['title']} {scores[j]}")
    return [(scores[i], data[i]['title']) for i in order]

In [9]:
# Example query: the 10 best-scoring documents for record 8. The return value
# is bound to `_` so that only the printed ranking is shown.
_ = search_tfidf(doc_ind=8, k=10)

Query: Mary Caffrey Low

Bella Nixon 0.21529759466648102
Kappa Sigma 0.20706669986248016
Norman L. Bassett 0.19645266234874725
Women's rights 0.18810726702213287
Nellie Quander 0.1855752021074295
College of William & Mary fraternity and sorority system 0.16852104663848877
Pauli Murray 0.15569064021110535
Women in Italy 0.15063662827014923
Ann Bannon 0.1486842781305313


In [11]:
from scipy import sparse
import joblib

import os

# Make sure the output directory exists; on a fresh checkout it may not, and
# both writes below would fail with FileNotFoundError.
os.makedirs("../save/tfidf", exist_ok=True)

# Persist the fitted vectorizer and the document-term matrix so the index
# can be reloaded without refitting.
joblib.dump(vec, "../save/tfidf/vectorizer.joblib")
sparse.save_npz("../save/tfidf/X.npz", X, compressed=True)