In [17]:
%pip install -qq llama_index=="0.8.22" pydantic nltk sentence_transformers huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [2]:
from llama_index import SimpleWebPageReader

# Download the essay and convert its HTML to plain text. One URL is requested,
# so exactly one document is expected back; the assert guards that assumption.
urls = ["http://paulgraham.com/greatwork.html"]
documents = SimpleWebPageReader(html_to_text=True).load_data(urls)
assert len(documents) == 1

# TODO: replace/strip the essay's note markers (e.g. "[1]") in the document text.
# documents[0].text.find("[1]")

In [3]:
from llama_index import ServiceContext, LLMPredictor
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.evaluation import DatasetGenerator


def generate_questions(model, num=None):
    """Generate questions about the loaded essay with an OpenAI model.

    Reads the module-level ``documents`` list, splits it with a
    ``SimpleNodeParser`` (``chunk_size=512``, ``chunk_overlap=128``) and asks
    the LLM for 5 questions per chunk via ``DatasetGenerator``.

    Args:
        model: OpenAI model identifier, e.g. ``"gpt-3.5-turbo"``.
        num: Optional value forwarded unchanged to
            ``DatasetGenerator.generate_questions_from_nodes``; ``None`` (the
            default) passes no limit.

    Returns:
        The questions produced by
        ``DatasetGenerator.generate_questions_from_nodes``.
    """
    # llama_index's OpenAI wrapper names this parameter `model`; `model_name`
    # is the LangChain spelling and is not a field of this class, so with the
    # old keyword the requested model was not the one actually configured.
    llm = OpenAI(temperature=0, model=model)
    llm_predictor = LLMPredictor(llm=llm)
    node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=128)

    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, node_parser=node_parser
    )

    data_generator = DatasetGenerator.from_documents(
        documents,
        service_context=service_context,
        num_questions_per_chunk=5,
    )
    return data_generator.generate_questions_from_nodes(num)

In [8]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

# Sentence-embedding model, created once at module level so that cluster()
# can reuse the same instance on every call.
embedder = SentenceTransformer("all-mpnet-base-v2")


def cluster(questions, distance_threshold=2):
    """Group similar questions using agglomerative clustering of embeddings.

    Each question is embedded with the module-level ``embedder``; every
    embedding row is scaled to unit length and the rows are clustered with
    ``AgglomerativeClustering(n_clusters=None, distance_threshold=...)``.

    Args:
        questions: Sequence of question strings.
        distance_threshold: Threshold passed to ``AgglomerativeClustering``.
            Defaults to 2, the value this notebook has always used.

    Returns:
        dict mapping an integer cluster id to the list of questions assigned
        to that cluster, in their original order. An empty input yields ``{}``
        and a single question yields ``{0: [question]}``.
    """
    # Guard the degenerate sizes up front: an empty batch cannot be normalised
    # along axis 1, and AgglomerativeClustering needs at least two samples.
    if len(questions) == 0:
        return {}
    if len(questions) == 1:
        return {0: [questions[0]]}

    embeddings = embedder.encode(questions)
    # Scale every row to unit L2 norm before clustering.
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    clustering_model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold
    )
    clustering_model.fit(embeddings)

    clustered_sentences = {}
    for question, cluster_id in zip(questions, clustering_model.labels_):
        # labels_ holds numpy integers; use plain ints so keys have the same
        # type as in the single-question shortcut above.
        clustered_sentences.setdefault(int(cluster_id), []).append(question)

    return clustered_sentences

In [5]:
# Generate questions with gpt-3.5-turbo, then bucket them by embedding similarity.
questions = generate_questions(model="gpt-3.5-turbo")
# c maps cluster id -> list of the questions assigned to that cluster.
c = cluster(questions)

In [9]:
# One line per cluster: its id followed by how many questions it holds.
for k, members in c.items():
    print(k, len(members))

37 2
55 3
19 3
16 2
1 3
3 5
69 1
9 3
61 2
12 4
8 4
5 2
20 2
56 1
41 2
86 1
40 2
80 1
2 3
67 1
22 3
54 2
4 3
26 2
35 2
28 2
14 2
11 3
68 1
87 1
45 2
7 2
77 1
42 2
65 1
70 1
51 1
39 3
63 1
52 2
76 2
23 2
21 2
71 1
57 1
18 3
30 2
29 2
62 2
0 3
66 1
72 1
47 2
75 1
53 2
25 2
43 1
10 3
17 2
85 1
73 1
44 1
33 1
27 2
81 1
84 2
64 1
34 1
15 3
60 1
13 4
48 1
82 1
50 2
24 4
46 1
32 1
79 1
49 1
78 1
83 1
59 1
74 1
38 1
58 1
36 1
6 2
31 1


In [23]:
from IPython.display import clear_output
from huggingface_hub import interpreter_login

# Authenticate with the Hugging Face Hub, then clear this cell's output
# (presumably so the login prompt is not kept in the saved notebook).
interpreter_login()
clear_output()

In [43]:
from datasets import Dataset, DatasetDict

# Build a two-split dataset ("balanced" and "raw"), each with one "question" column.
# NOTE(review): both splits are created from empty lists here; the `questions`
# and `c` values computed in earlier cells are not used. Confirm whether this is
# an intentional placeholder or whether the splits should be populated first.
raw = Dataset.from_dict({"question": []})
balanced = Dataset.from_dict({"question": []})

d = DatasetDict({"balanced": balanced, "raw": raw})
# Bare expression so the notebook displays the DatasetDict summary.
d

DatasetDict({
    balanced: Dataset({
        features: ['question'],
        num_rows: 0
    })
    raw: Dataset({
        features: ['question'],
        num_rows: 0
    })
})

In [25]:
# Upload every split of `d` to this Hugging Face Hub dataset repository.
# Needs Hub credentials (see the login cell).
d.push_to_hub("fastrepl/questions_pg_how_to_do_great_work")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]