In [None]:
# pip install transformers datasets huggingface_hub

In [None]:
from datasets import load_dataset
# Load the "Abirate/english_quotes" dataset through `datasets.load_dataset`.
ds = load_dataset(path="Abirate/english_quotes")

In [None]:
# Convert the "train" split into a pandas DataFrame for the cleaning steps.
data_f = ds["train"].to_pandas()

In [None]:
# Preview the first five rows of the quotes table.
data_f.head(n=5)

In [None]:
import nltk
from nltk.corpus import stopwords

# English stop words used by the cleaning cell below.
# The corpus is not bundled with the nltk package: on a fresh environment the
# lookup raises LookupError, so fetch it once and retry instead of failing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [None]:
import string

# Characters that may cling to a token's edges ("the," / "“the"). They are
# trimmed only for the stop-word lookup; kept tokens are emitted unchanged.
_EDGE_PUNCT = string.punctuation + "“”‘’"


def _drop_stop_words(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace. Each token is looked up in `stop_words`
    after trimming surrounding punctuation and normalising a typographic
    apostrophe to "'", so that "the," or "don’t" are recognised as stop words.
    Tokens that are kept are joined back with single spaces, untouched.
    """
    kept = [
        token
        for token in str(text).split()
        if token.strip(_EDGE_PUNCT).replace("’", "'") not in stop_words
    ]
    return ' '.join(kept)


# Lower-case first so the lookup matches the lower-case stop-word list.
data_f['clean_quote'] = data_f['quote'].str.lower().apply(_drop_stop_words)

In [None]:
import string
import unicodedata

# ASCII punctuation/symbol characters (the full `string.punctuation` set).
_ASCII_PUNCTUATION = frozenset(string.punctuation)


def remove_punctuation(text):
    """Return `text` with punctuation characters removed.

    Removes every character in `string.punctuation` and, in addition, every
    character whose Unicode general category starts with "P" (typographic
    quotes, dashes, ellipsis, ...), which an ASCII-only table leaves behind.

    Args:
        text: The string to clean.

    Returns:
        str: `text` without the removed characters; other characters,
        including whitespace, are preserved in order.
    """
    return ''.join(
        ch
        for ch in text
        if ch not in _ASCII_PUNCTUATION
        and not unicodedata.category(ch).startswith('P')
    )

# Strip punctuation from every cleaned quote, element by element.
data_f['clean_quote'] = data_f['clean_quote'].map(remove_punctuation)

In [None]:
import random

def generate_queries(row, rng=None):
    """Build synthetic search queries for one quote record.

    Args:
        row: Record indexable by "author" and "tags" (for example a pandas
            row or a dict). "tags" may be None or an empty sequence.
        rng: Optional object exposing ``choice(seq)``, such as a
            ``random.Random(seed)`` instance, used to pick the tag. When
            omitted, the module-level ``random`` functions are used.

    Returns:
        list[str]: Two author-only queries, followed by four queries built
        around one randomly chosen tag when the record has any tags.
    """
    chooser = random if rng is None else rng
    author = row["author"]
    tags = row["tags"]

    # Basic templates
    queries = [
        f"{author} quotes",
        f"Quotes by {author}",
    ]

    # Explicit None/len checks: the truth value of an array-like is ambiguous.
    if tags is not None and len(tags) > 0:
        tag = chooser.choice(tags)
        queries.extend([
            f"{author} quotes about {tag}",
            f"Quotes tagged with {tag}",
            f"Famous quotes on {tag}",
            f"Quotes related to {tag} by {author}",
        ])

    return queries

# Generate a list of (query, quote) pairs.
# generate_queries picks tags with the module-level `random` functions, so
# seed them here: re-running this cell then yields the same training pairs.
random.seed(42)

training_data = [
    (query, row['clean_quote'])
    for _, row in data_f.iterrows()
    for query in generate_queries(row)
]

In [None]:
# training_data[::5]

In [None]:
# pip install sentence-transformers

In [None]:
from sentence_transformers import InputExample

# Wrap each (query, quote) tuple as a two-text InputExample.
train_examples = [InputExample(texts=list(pair)) for pair in training_data]

In [None]:
from torch.utils.data import DataLoader

# Shuffled batches of 16 examples for fine-tuning.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

In [None]:
from sentence_transformers import SentenceTransformer, losses

# Base checkpoint to fine-tune (small & fast).
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


In [None]:
# Ranking loss attached to the model being fine-tuned.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# pip install --upgrade accelerate

In [None]:
# pip install "accelerate>=0.26.0"  # quote the specifier: an unquoted ">" is a shell redirect

In [None]:
# !pip install transformers[torch]
# Training schedule for the prototype run.
# Warm-up is derived from the total step count: with warmup_steps equal to the
# total (100 of 100) the learning rate would still be ramping up when training
# stops, so only the first 10% of steps are used for warm-up.
NUM_EPOCHS = 1
STEPS_PER_EPOCH = 100
WARMUP_STEPS = int(0.1 * NUM_EPOCHS * STEPS_PER_EPOCH)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    warmup_steps=WARMUP_STEPS,
)

In [None]:
# Assuming `model` is the same object you trained
from sentence_transformers import util

query = "Show some quotes about Imagination by Einstein"
top_k = 5

# The original text is kept for display, but the corpus is embedded from the
# cleaned text: the model was fine-tuned on (query, clean_quote) pairs, so the
# quotes should be encoded in the same form they had during training.
all_quotes = data_f["quote"].tolist()
clean_quotes = data_f["clean_quote"].tolist()

quote_embeddings = model.encode(clean_quotes, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# cos_sim returns a (1, n_quotes) matrix here; take the single query row.
cosine_scores = util.cos_sim(query_embedding, quote_embeddings)[0]

# Positions of the best-scoring quotes, highest similarity first.
top_results = cosine_scores.argsort(descending=True)[:top_k]

print(f"\n🔍 Query: {query}\n")
for idx in top_results.tolist():  # convert tensor to list of ints
    print(f"✅ Score: {cosine_scores[idx]:.4f}")
    print(f"🧠 Author: {data_f.iloc[idx]['author']}")
    print(f"📜 Quote: {all_quotes[idx]}")
    print("---")

In [None]:
# Persist the fine-tuned model to the "proto/" directory.
model.save(path="proto/")