In [3]:
import jsonlines
# Location of the JSON Lines transcription file to analyse.
transcription_path = "../../first_tests/output/Debatten12okt.jsonl"

# Read every record of the file into `data`, in file order.
data = []
with jsonlines.open(transcription_path) as reader:
    data.extend(reader)

In [4]:
import numpy as np

def segment_sentences(start, end, txt):
    """Split a transcribed chunk into one segment per sentence.

    Sentences are delimited by a literal "." only. No per-sentence timing is
    computed: every returned segment carries the ``start`` and ``end`` of the
    whole chunk it came from.

    Args:
        start: Start time of the chunk, copied unchanged into each segment.
        end: End time of the chunk, copied unchanged into each segment.
        txt: Text of the chunk.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts. Text without any "."
        yields exactly one segment holding the stripped text (even when that
        text is empty). Otherwise there is one segment per non-empty,
        stripped piece between periods, which can be an empty list.
    """
    if "." not in txt:
        return [{"start": start, "end": end, "text": txt.strip()}]
    # Pieces that are empty after stripping (e.g. after a trailing period or
    # between consecutive periods) are discarded.
    pieces = (piece.strip() for piece in txt.split("."))
    return [
        {"start": start, "end": end, "text": sentence}
        for sentence in pieces
        if sentence
    ]

{'start': 4, 'end': 15, 'text': 'This is a sentence'}
{'start': 4, 'end': 15, 'text': 'This is another sentence'}
{'start': 4, 'end': 15, 'text': 'This is a third sentence'}
{'start': 0, 'end': 3, 'text': 'This is a sentence'}


In [5]:
import numpy as np

# Break each transcription chunk into sentence segments and retain only the
# segments whose text has more than two whitespace-separated words.
parsed = []
for obj in data:
    timestamp, txt = obj["timestamp"], obj["text"]
    start, end = timestamp
    parsed += [
        seg
        for seg in segment_sentences(start, end, txt)
        if len(seg["text"].split()) > 2
    ]

In [6]:
import pandas as pd
# Collect the retained sentence segments into a DataFrame and preview the
# first rows.
df = pd.DataFrame(parsed)
df.head()

Unnamed: 0,start,end,text
0,3.96,7.28,Denne debatten skal handle om dette
1,7.28,15.72,Målet er at flest mulig skal jobbe
2,7.28,15.72,Derfor må det lønne seg å stå opp om morgenen
3,17.52,21.88,Det er litt forstemmende at vi ofte får en dis...
4,21.96,26.96,"nivået på ytelser, og altfor lite om hvordan v..."


In [22]:
# Write the sentence table to debatten.csv, leaving out the DataFrame index.
df.to_csv("debatten.csv", index=False)

In [7]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; the same `model` encodes the transcript sentences
# and, later, the search queries.
model = SentenceTransformer("NbAiLab/nb-sbert-base")

In [8]:
# Encode every sentence in the table, in row order, with a progress bar.
sentences = df["text"].tolist()
embeddings = model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [9]:
from autofaiss import build_index
# Build a nearest-neighbour index over the sentence embeddings and persist it
# (save_on_disk=True).
# NOTE(review): no output path is passed here, yet a later cell reads
# "knn.index" — presumably that is autofaiss's default index path; confirm.
# NOTE(review): the captured output still shows INFO/DEBUG log lines with
# verbose=0, so this value may be a logging level rather than a "quiet"
# switch — confirm against the autofaiss documentation.
index, index_infos = build_index(embeddings, save_on_disk=True, verbose=0)

2023-10-29 22:01:17,272 [INFO]: Using 10 omp threads (processes), consider increasing --nb_cores if you have more
2023-10-29 22:01:17,277 [INFO]: Launching the whole pipeline 10/29/2023, 22:01:17
2023-10-29 22:01:17,277 [INFO]: Reading total number of vectors and dimension 10/29/2023, 22:01:17
2023-10-29 22:01:17,283 [DEBUG]: open file: /var/folders/0k/1bg63zt532nb9d86g5tk_6vh0000gn/T/tmpnnl3xqwr/emb.npy
100%|██████████| 1/1 [00:00<00:00, 34100.03it/s]
2023-10-29 22:01:17,287 [INFO]: There are 661 embeddings of dim 768
2023-10-29 22:01:17,288 [INFO]: >>> Finished "Reading total number of vectors and dimension" in 0.0100 secs
2023-10-29 22:01:17,288 [INFO]: 	Compute estimated construction time of the index 10/29/2023, 22:01:17
2023-10-29 22:01:17,289 [INFO]: 		-> Train: 16.7 minutes
2023-10-29 22:01:17,289 [INFO]: 		-> Add: 0.0 seconds
2023-10-29 22:01:17,291 [INFO]: 		Total: 16.7 minutes
2023-10-29 22:01:17,291 [INFO]: 	>>> Finished "Compute estimated construction time of the index" in

In [20]:
import faiss
# Reload the index from knn.index with the memory-map and read-only I/O flags.
# This rebinds `index`, replacing the object returned by build_index earlier.
index = faiss.read_index("knn.index", faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
# Bare expression so the notebook displays the loaded index object.
index

<faiss.swigfaiss.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x315627900> >

# Usage

In [25]:
def query(q, K=10):
    """Return the time spans of the transcript sentences matching a query.

    Relies on the notebook-level ``model`` (encoder), ``index`` (vector
    index) and ``df`` (sentence table). The ids returned by the index are
    used as row positions in ``df``.
    NOTE(review): this assumes the index was built from embeddings in the
    same row order as ``df`` — confirm if either is rebuilt separately.

    Args:
        q: Free-text query string.
        K: Maximum number of matches to request from the index.

    Returns:
        A list of ``[start, end]`` pairs, one per match, in the order the
        index returned them. Holds fewer than ``K`` pairs when the index
        yields fewer than ``K`` valid ids.
    """
    emb = model.encode([q], show_progress_bar=False)
    _, matches = index.search(emb, K)
    ids = matches[0]
    # FAISS pads the id row with -1 when it cannot supply K neighbours;
    # df.iloc[-1] would silently return the last row as a bogus hit, so the
    # padding is dropped before the positional lookup.
    ids = ids[ids >= 0]
    res = df.iloc[ids]
    return res[["start", "end"]].values.tolist()

query("utfordringer rundt arbeidsledighet", K=10)

[[1372.76, 1377.68],
 [423.88, 426.04],
 [1510.88, 1517.92],
 [1385.6, 1391.04],
 [39.28, 46.44],
 [480.32, 484.64],
 [484.72, 485.04],
 [426.12, 431.72],
 [1518.0, 1523.64],
 [2555.78, 2561.26]]