In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By

from tqdm import tqdm

import time

In [9]:
driver = webdriver.Chrome()

In [10]:
all_quotes = []

In [11]:
for i in tqdm(range(1, 101)):
    url = f"https://www.goodreads.com/author/quotes/656983.J_R_R_Tolkien?page={i}"

    driver.get(url)

    time.sleep(0.1)

    quotes_per_page = driver.find_elements(By.CLASS_NAME, "quoteText")
    all_quotes.extend([quote.text for quote in quotes_per_page])
    # print(quotes_per_page[-1].text)
    quotes_per_page.clear()


# scrape page 2 manually
url = "https://www.goodreads.com/author/quotes/656983.J_R_R_Tolkien?page=2"
driver.get(url)

time.sleep(0.1)

quotes_per_page = driver.find_elements(By.CLASS_NAME, "quoteText")
all_quotes.extend([quote.text for quote in quotes_per_page])
quotes_per_page.clear()

driver.quit()

100%|██████████| 100/100 [03:06<00:00,  1.87s/it]


In [12]:
print(len(all_quotes))

3030


In [13]:
print(all_quotes[0])

“All that is gold does not glitter,
Not all those who wander are lost;
The old that is strong does not wither,
Deep roots are not reached by the frost.

From the ashes a fire shall be woken,
A light from the shadows shall spring;
Renewed shall be blade that was broken,
The crownless again shall be king.”
― J.R.R. Tolkien, The Fellowship of the Ring


#### Manually extracting data from page 2

In [14]:
for i in all_quotes[31:60]:
    print(i)
    print("*"*50)

“A man that flies from his fear may find that he has only taken a short cut to meet it.”
― J.R.R. Tolkien, The Children of Húrin
**************************************************
“Many that live deserve death. And some that die deserve life. Can you give it to them? Then do not be too eager to deal out death in judgement.”
― J.R.R. Tolkien, The Fellowship of the Ring
**************************************************
“So comes snow after fire, and even dragons have their endings.”
― J.R.R. Tolkien, The Hobbit, or There and Back Again
**************************************************
“May the wind under your wings bear you where the sun sails and the moon walks.”
― J.R.R. Tolkien, The Hobbit, or There and Back Again
**************************************************
“Where there's life there's hope.”
― J.R.R. Tolkien, The Hobbit, or There and Back Again
**************************************************
“It does not do to leave a live dragon out of your calculations, if you live near 

In [54]:
import pandas as pd

raw_quotes_df = pd.DataFrame(all_quotes, columns=['quote'])
raw_quotes_df

Unnamed: 0,quote
0,"“All that is gold does not glitter,\nNot all t..."
1,“Not all those who wander are lost.”\n― J.R.R....
2,"“I wish it need not have happened in my time,""..."
3,“I don't know half of you half as well as I sh...
4,“All we have to decide is what to do with the ...
...,...
3025,"“Home is behind, the world ahead,\nAnd there a..."
3026,"“It is not despair, for despair is only for th..."
3027,"“There is more in you of good than you know, c..."
3028,"“Good Morning!"" said Bilbo, and he meant it. T..."


In [55]:
raw_quotes_df["quote"][0]

'“All that is gold does not glitter,\nNot all those who wander are lost;\nThe old that is strong does not wither,\nDeep roots are not reached by the frost.\n\nFrom the ashes a fire shall be woken,\nA light from the shadows shall spring;\nRenewed shall be blade that was broken,\nThe crownless again shall be king.”\n― J.R.R. Tolkien, The Fellowship of the Ring'

In [56]:
cleaned_quotes_df = raw_quotes_df["quote"].str.replace('[“”]', '', regex=True)

In [57]:
cleaned_quotes_df

0       All that is gold does not glitter,\nNot all th...
1       Not all those who wander are lost.\n― J.R.R. T...
2       I wish it need not have happened in my time," ...
3       I don't know half of you half as well as I sho...
4       All we have to decide is what to do with the t...
                              ...                        
3025    Home is behind, the world ahead,\nAnd there ar...
3026    It is not despair, for despair is only for tho...
3027    There is more in you of good than you know, ch...
3028    Good Morning!" said Bilbo, and he meant it. Th...
3029    In sorrow we must go, but not in despair. Beho...
Name: quote, Length: 3030, dtype: object

In [58]:
# split the rows by "―" and assign the first partition to cleaned_quotes_df
cleaned_quotes_df = cleaned_quotes_df.str.rpartition("\n―")[0]

In [59]:
cleaned_quotes_df[10]

"It's a dangerous business, Frodo, going out your door. You step onto the road, and if you don't keep your feet, there's no knowing where you might be swept off to."

In [60]:
type(cleaned_quotes_df)

pandas.core.series.Series

In [61]:
cleaned_quotes_df

0       All that is gold does not glitter,\nNot all th...
1                      Not all those who wander are lost.
2       I wish it need not have happened in my time," ...
3       I don't know half of you half as well as I sho...
4       All we have to decide is what to do with the t...
                              ...                        
3025    Home is behind, the world ahead,\nAnd there ar...
3026    It is not despair, for despair is only for tho...
3027    There is more in you of good than you know, ch...
3028    Good Morning!" said Bilbo, and he meant it. Th...
3029    In sorrow we must go, but not in despair. Beho...
Name: 0, Length: 3030, dtype: object

In [62]:
# remove empty rows
cleaned_quotes_df.drop(cleaned_quotes_df.index[31:60], axis=0, inplace=True)

In [63]:
# reset index
cleaned_quotes_df = cleaned_quotes_df.reset_index(drop=True).squeeze()
cleaned_quotes_df

0       All that is gold does not glitter,\nNot all th...
1                      Not all those who wander are lost.
2       I wish it need not have happened in my time," ...
3       I don't know half of you half as well as I sho...
4       All we have to decide is what to do with the t...
                              ...                        
2996    Home is behind, the world ahead,\nAnd there ar...
2997    It is not despair, for despair is only for tho...
2998    There is more in you of good than you know, ch...
2999    Good Morning!" said Bilbo, and he meant it. Th...
3000    In sorrow we must go, but not in despair. Beho...
Name: 0, Length: 3001, dtype: object

In [64]:
# remove duplicates
cleaned_quotes_df.drop_duplicates(inplace=True, ignore_index=True)
cleaned_quotes_df

0       All that is gold does not glitter,\nNot all th...
1                      Not all those who wander are lost.
2       I wish it need not have happened in my time," ...
3       I don't know half of you half as well as I sho...
4       All we have to decide is what to do with the t...
                              ...                        
2992    Home is behind, the world ahead,\nAnd there ar...
2993    It is not despair, for despair is only for tho...
2994    There is more in you of good than you know, ch...
2995    Good Morning!" said Bilbo, and he meant it. Th...
2996    In sorrow we must go, but not in despair. Beho...
Name: 0, Length: 2997, dtype: object

In [65]:
# add a new line at the end
cleaned_quotes_df = cleaned_quotes_df + '\n'

In [66]:
cleaned_quotes_df.to_csv("quotes.txt", index=False, header=False, encoding="utf-8", lineterminator="\n")