In [1]:
# pip install transformers datasets huggingface_hub

In [2]:
from datasets import load_dataset
# Load the "Abirate/english_quotes" dataset through the Hugging Face
# `datasets` library; the result is indexed by split name further down.
ds = load_dataset("Abirate/english_quotes")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Take the "train" split of the loaded dataset and convert it to a pandas
# DataFrame so the text columns can be cleaned with pandas operations.
train_split = ds["train"]
data_f = train_split.to_pandas()

In [4]:
# Preview the first rows to check the available columns (quote, author, tags).
data_f.head()

Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“So many books, so little time.”",Frank Zappa,"[books, humor]"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"


In [5]:
import nltk
from nltk.corpus import stopwords

# NLTK distributes its corpora separately from the package itself, so on a
# fresh environment the stop-word list may not be installed yet. Fetch it once
# on demand instead of letting the notebook fail with a LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [6]:
def _drop_stop_words(text):
    """Return ``text`` without the whitespace-separated words found in ``stop_words``."""
    kept = []
    for word in str(text).split():
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return ' '.join(kept)


# Lower-case every quote, then filter out stop words word by word.
# Words are split on whitespace only, so a stop word with punctuation attached
# (e.g. a leading quotation mark) is not recognised at this stage.
data_f['clean_quote'] = data_f['quote'].str.lower().apply(_drop_stop_words)

In [7]:
import spacy

# Load spaCy's 'en_core_web_sm' English pipeline; it is used below to
# lemmatize the quotes. The model package must already be installed.
nlp = spacy.load('en_core_web_sm')

def lemmatize_with_spacy(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# Lemmatize every cleaned quote, overwriting `clean_quote`. The spaCy pipeline
# is invoked once per row here; spaCy's `nlp.pipe` is the batched alternative
# if this step becomes slow.
data_f['clean_quote'] = data_f['clean_quote'].apply(lemmatize_with_spacy)

In [8]:
import string
import unicodedata

def remove_punctuation(text):
    """Return ``text`` with punctuation characters removed.

    Every character in ``string.punctuation`` is dropped (the original ASCII
    behaviour). In addition, any character whose Unicode general category is
    a punctuation class (``P*``) is dropped, so typographic marks such as the
    curly quotes that wrap each quote in the dataset are removed as well; an
    ASCII-only table leaves those behind.

    All other characters, including whitespace, are kept unchanged, so
    removing a stand-alone punctuation token can leave two adjacent spaces.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` without ASCII or Unicode punctuation.
    """
    ascii_punctuation = set(string.punctuation)
    return "".join(
        ch for ch in text
        if ch not in ascii_punctuation
        and not unicodedata.category(ch).startswith("P")
    )

# Strip punctuation from the lemmatized text, overwriting `clean_quote`.
data_f['clean_quote'] = data_f['clean_quote'].apply(remove_punctuation)

In [9]:
import random

def generate_queries(row, rng=None):
    """Build synthetic search queries that should retrieve ``row``'s quote.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing an
            ``"author"`` entry and a ``"tags"`` entry. ``tags`` may be
            ``None`` or an empty sequence.
        rng: Optional object exposing a ``choice(sequence)`` method, such as
            a seeded ``random.Random`` instance, used to pick the tag. When
            omitted, the global ``random`` module is used, which matches the
            behaviour before this parameter existed.

    Returns:
        list[str]: Two author-only queries, followed by four tag-based
        queries built from one randomly chosen tag when ``tags`` is
        non-empty.
    """
    chooser = random if rng is None else rng
    author = row["author"]
    tags = row["tags"]

    # Author-only queries are always produced.
    queries = [
        f"{author} quotes",
        f"Quotes by {author}",
    ]

    # Test the length explicitly rather than relying on truthiness: tags is
    # presumably a NumPy array when the row comes from `to_pandas()` (TODO
    # confirm), and the truth value of a multi-element array is ambiguous.
    if tags is not None and len(tags) > 0:
        tag = chooser.choice(tags)
        queries.extend([
            f"{author} quotes about {tag}",
            f"Quotes tagged with {tag}",
            f"Famous quotes on {tag}",
            f"Quotes related to {tag} by {author}",
        ])

    return queries

# Build the (query, quote) pairs used for training.
# generate_queries picks a tag at random, so seed the global RNG first; without
# a seed the training pairs differ on every Restart & Run All.
QUERY_SEED = 42
random.seed(QUERY_SEED)

# Every query generated for a row is paired with that row's cleaned quote.
training_data = [
    (query, row['clean_quote'])
    for _, row in data_f.iterrows()
    for query in generate_queries(row)
]

In [10]:
# training_data[::5]

In [11]:
# pip install sentence-transformers

In [12]:
from sentence_transformers import InputExample

# Wrap each (query, quote) pair in an InputExample whose `texts` list holds
# the query first and the quote second.
train_examples = [
    InputExample(texts=[query_text, quote_text])
    for query_text, quote_text in training_data
]

In [13]:
from torch.utils.data import DataLoader

# Serve the training examples in shuffled batches of 16.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [14]:
from sentence_transformers import SentenceTransformer, losses
# Start from the pretrained 'all-MiniLM-L6-v2' checkpoint; it is fine-tuned
# on the query/quote pairs below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [15]:
# MultipleNegativesRankingLoss trains on positive (query, quote) pairs only;
# per the sentence-transformers documentation, the other pairs in the same
# batch act as negatives.
# NOTE(review): author-less queries such as "Quotes tagged with {tag}" can be
# identical for different quotes, so some in-batch "negatives" may really be
# valid matches -- confirm this is acceptable for the retrieval task.
train_loss = losses.MultipleNegativesRankingLoss(model)

In [16]:
# pip install --upgrade accelerate

In [17]:
# %pip install "accelerate>=0.26.0"  # keep the quotes: an unquoted >= is treated by the shell as output redirection

In [18]:
# !pip install transformers[torch]
# Fine-tune the model on the query/quote pairs: a single pass over the
# dataloader with 100 warm-up steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)



Step,Training Loss
500,2.0609


In [19]:
# Persist the fine-tuned model to a local directory.
# NOTE(review): the directory name is spelled "retrievel" (not "retrieval").
# It is left unchanged because other code may load the model from this exact
# path -- rename both sides together if the spelling is corrected.
model.save("quote-retrievel-model/")