In [None]:
%pip uninstall llama_index -y
%pip install -qq "git+https://github.com/jerryjliu/llama_index.git@424fdb689f8a8f87ca3ce1013ed8552777b4298b"
%pip install -qq pydantic nltk sentence_transformers huggingface_hub

In [1]:
from llama_index import SimpleWebPageReader

# Download the essay and let the reader convert the HTML to text.
# A single URL goes in, so exactly one document is expected back.
urls = ["http://paulgraham.com/greatwork.html"]
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(urls)
assert len(documents) == 1

In [2]:
# Show where the essay body, the "Notes" section and the "Thanks" paragraph start;
# these markers are what separate the essay text from its footnotes.
text = documents[0].text

tuple(
    text.index(marker)
    for marker in ("![How to Do Great Work]", "Notes", "**Thanks** ")
)

(192, 60764, 68292)

In [3]:
import re

# Extract the footnote bodies from the "Notes" section of the essay.
# The section boundaries are looked up at runtime instead of being hard-coded: the page
# is fetched live, so frozen character offsets would silently slice the wrong text as
# soon as the page changes. A missing marker now fails loudly with ValueError.
page_text = documents[0].text
notes_start = page_text.index("Notes")
notes_end = page_text.index("**Thanks** ")

# Each note starts with "[n]" and is captured lazily (across newlines, because of
# DOTALL) up to the next "[n]", a "\n  \n" separator line, or the end of the slice.
notes = re.findall(
    r"\[\d+\]\s+(.*?)(?=(?:\[\d+\]|\n  \n|\Z))",
    page_text[notes_start:notes_end],
    re.DOTALL,
)

In [4]:
# Rebuild the essay without its "Notes" section and inline every footnote next to
# its "[n]" reference in the body.
# The section boundaries are located at runtime rather than hard-coded, so the cell
# stays correct if the fetched page shifts; a missing marker raises ValueError.
full_text = documents[0].text
body_start = full_text.index("![How to Do Great Work]")
notes_start = full_text.index("Notes")
thanks_start = full_text.index("**Thanks** ")

# Keep everything from the title image up to "Notes", then resume at "Thanks".
doc = full_text[body_start:notes_start] + full_text[thanks_start:]

# Notes are assumed to be numbered consecutively from 1, in the order they were extracted.
for i, note in enumerate(notes, 1):
    note = note.strip()
    doc = doc.replace(f"[{i}]", f"({note})")

doc

'![How to Do Great Work](https://s.turbifycdn.com/aah/paulgraham/how-to-do-\ngreat-work-1.gif)  \n  \nJuly 2023  \n  \nIf you collected lists of techniques for doing great work in a lot of\ndifferent fields, what would the intersection look like? I decided to find out\nby making it.  \n  \nPartly my goal was to create a guide that could be used by someone working in\nany field. But I was also curious about the shape of the intersection. And one\nthing this exercise shows is that it does have a definite shape; it\'s not just\na point labelled "work hard."  \n  \nThe following recipe assumes you\'re very ambitious.  \n  \n  \n  \n  \n  \nThe first step is to decide what to work on. The work you choose needs to have\nthree qualities: it has to be something you have a natural aptitude for, that\nyou have a deep interest in, and that offers scope to do great work.  \n  \nIn practice you don\'t have to worry much about the third criterion. Ambitious\npeople are if anything already too conser

In [6]:
from llama_index import ServiceContext, LLMPredictor
from llama_index.llms import LiteLLM
from llama_index.node_parser import SimpleNodeParser
from llama_index.evaluation import DatasetGenerator


def generate_questions(model, num=None):
    """Generate questions about the loaded essay with an LLM.

    Args:
        model: Model identifier handed to ``LiteLLM`` as ``model_name``.
        num: Forwarded unchanged to
            ``DatasetGenerator.generate_questions_from_nodes``; defaults to ``None``.

    Returns:
        Whatever ``generate_questions_from_nodes`` returns (presumably a list of
        question strings - confirm against the pinned llama_index commit).

    NOTE(review): this reads the module-level ``documents`` (the raw fetched page),
    not the cleaned ``doc`` string built in an earlier cell - confirm that is intended.
    """
    # LLM reached through the LiteLLM proxy endpoint.
    llm = LiteLLM(
        temperature=0.5,
        model_name=model,
        max_tokens=100,
        api_base="https://proxy.litellm.ai",
        custom_llm_provider="openai",
    )
    predictor = LLMPredictor(llm=llm)

    # Documents are split into 1024-sized chunks with an overlap of 64.
    parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=64)
    context = ServiceContext.from_defaults(predictor, node_parser=parser)

    generator = DatasetGenerator.from_documents(
        documents,
        context,
        num_questions_per_chunk=5,
    )
    return generator.generate_questions_from_nodes(num)

In [7]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by `cluster` below; created once at module level so
# that repeated `cluster` calls reuse the same instance.
embedder = SentenceTransformer("all-mpnet-base-v2")


def cluster(questions, distance_threshold=2):
    """Group questions by embedding similarity using agglomerative clustering.

    Args:
        questions: Iterable of question strings.
        distance_threshold: Passed to ``AgglomerativeClustering`` (with
            ``n_clusters=None``), so it controls how many clusters are produced.
            Defaults to 2, the value this notebook originally hard-coded.

    Returns:
        dict mapping an integer cluster id to the list of questions assigned to it,
        in input order. Empty input gives ``{}``; a single question gives
        ``{0: [question]}``.
    """
    questions = list(questions)

    # AgglomerativeClustering refuses to fit fewer than two samples, so the trivial
    # cases are answered directly instead of crashing inside the embedder / sklearn.
    if not questions:
        return {}
    if len(questions) == 1:
        return {0: questions}

    embeddings = embedder.encode(questions)
    # Scale every embedding row to unit L2 norm before clustering.
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    clustering_model = AgglomerativeClustering(
        n_clusters=None, distance_threshold=distance_threshold
    )
    clustering_model.fit(embeddings)

    clustered_sentences = {}
    for cluster_id, question in zip(clustering_model.labels_, questions):
        # Plain int keys (equal to and hashing like the numpy labels) keep the
        # result consistent with the single-question shortcut above.
        clustered_sentences.setdefault(int(cluster_id), []).append(question)

    return clustered_sentences

In [None]:
# Generate questions with the "command-nightly" model (no explicit `num` limit), then
# group them into clusters; `c` maps cluster id -> list of questions.
questions = generate_questions(model="command-nightly")
c = cluster(questions)

In [5]:
# Print one "<cluster id> <number of questions>" line per cluster.
for cluster_id, members in c.items():
    print(cluster_id, len(members))

0 58
1 23
2 12


In [23]:
from IPython.display import clear_output
from huggingface_hub import interpreter_login

# Log in to the Hugging Face Hub from the running interpreter, then wipe this cell's
# output - presumably so the login prompt is not saved with the notebook.
interpreter_login()
clear_output()

In [43]:
from datasets import Dataset, DatasetDict

# Build two empty splits that share the same single-column schema ("question"),
# each from its own fresh mapping, and bundle them into one DatasetDict.
raw, balanced = (Dataset.from_dict({"question": []}) for _ in range(2))

d = DatasetDict({"balanced": balanced, "raw": raw})
d

DatasetDict({
    balanced: Dataset({
        features: ['question'],
        num_rows: 0
    })
    raw: Dataset({
        features: ['question'],
        num_rows: 0
    })
})

In [25]:
# Upload both splits of `d` to the Hub repository named below (requires the login above).
d.push_to_hub("fastrepl/questions_pg_how_to_do_great_work")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]