# Part 1: Web Scraping (Using Python)

In [1]:
# Importing necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Root address of the quotes website; the scraper appends the page path to it
base_url = "http://quotes.toscrape.com/"

In [3]:
def _parse_quote_block(quote):
    """Turn one ``div.quote`` element into a record.

    Returns a dict with the keys 'Author', 'Quote' and 'TagNames', or None
    when the block has no author or no text element (so a malformed block is
    skipped instead of raising AttributeError on ``.text``).
    """
    author_el = quote.find('small', {'class': 'author'})
    text_el = quote.find('span', {'class': 'text'})
    if author_el is None or text_el is None:
        return None

    # Remove straight and curly double quotes wherever they occur in the text
    text = text_el.text.strip()
    for ch in ('"', '“', '”'):
        text = text.replace(ch, '')

    # Collapse the tag links into a single comma-separated string
    tags = ",".join(t.text.strip() for t in quote.find_all('a', {'class': 'tag'}))

    return {'Author': author_el.text.strip(), 'Quote': text, 'TagNames': tags}


def fetch_data(base_url, max_pages=10, timeout=10):
    """Scrape quotes, authors and tags from the paginated quotes website.

    Pages ``1..max_pages`` are requested from ``<base_url>/page/<n>/``. A page
    that does not answer with HTTP 200 is reported with ``print`` and skipped;
    the remaining pages are still fetched.

    Parameters:
        base_url: Root URL of the site, with or without a trailing slash.
        max_pages: Number of pages to request, starting at page 1. Defaults
            to 10, the page count the original notebook scraped.
        timeout: Seconds to wait for each request before ``requests`` gives
            up, so a stalled server cannot hang the notebook cell.

    Returns:
        A pandas DataFrame with one row per quote and the columns:
          - Author
          - Quote
          - TagNames (comma-separated tags for each quote)
        The columns are present even when no quote was collected.

    Raises:
        Network errors and timeouts raised by ``requests`` are not caught
        here and propagate to the caller.
    """
    # Normalise the root so a missing trailing slash cannot produce '...compage/1/'
    root = base_url.rstrip('/')
    records = []

    # One session for all pages so the underlying connection can be reused
    with requests.Session() as session:
        for page in range(1, max_pages + 1):
            url = f"{root}/page/{page}/"
            response = session.get(url, timeout=timeout)

            if response.status_code != 200:
                print(f"Failed to fetch the data.{response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Every quote on the page lives in its own <div class="quote">
            for quote in soup.find_all('div', {'class': 'quote'}):
                record = _parse_quote_block(quote)
                if record is not None:
                    records.append(record)

    # Build the frame once from the collected records
    return pd.DataFrame(records, columns=['Author', 'Quote', 'TagNames'])



In [4]:
# Run the scraper and keep the resulting DataFrame in 'data'
data = fetch_data(base_url)
# Preview the first five rows
data.head(n=5)

Unnamed: 0,Author,Quote,TagNames
0,Albert Einstein,The world as we have created it is a process o...,"change,deep-thoughts,thinking,world"
1,J.K. Rowling,"It is our choices, Harry, that show what we tr...","abilities,choices"
2,Albert Einstein,There are only two ways to live your life. One...,"inspirational,life,live,miracle,miracles"
3,Jane Austen,"The person, be it gentleman or lady, who has n...","aliteracy,books,classic,humor"
4,Marilyn Monroe,"Imperfection is beauty, madness is genius and ...","be-yourself,inspirational"


In [6]:
def save_data(data, file_name):
    """Save the given DataFrame to a CSV file without the index column.

    The file is written with the ``utf-8-sig`` codec (UTF-8 with a leading
    byte-order mark), which is intended to avoid encoding issues when the
    file is opened on Windows or imported into MySQL.

    Parameters:
        data: DataFrame to save.
        file_name: Name of the output file. The ``.csv`` extension is
            appended when missing; a name that already ends in ``.csv`` is
            used as-is instead of becoming ``name.csv.csv``.

    Returns:
        'Saved the file successfully!' on success, otherwise a message that
        starts with 'Failed to save the file' and includes the error. Errors
        are reported through the return value, not raised.
    """
    # Resolve the target path once so the write and the error message agree
    file_name = str(file_name)
    path = file_name if file_name.lower().endswith('.csv') else f'{file_name}.csv'

    try:
        # Saves DataFrame to CSV with no index column, using UTF-8 (with BOM) encoding
        data.to_csv(path, index=False, encoding="utf-8-sig")
    except Exception as e:
        return f"Failed to save the file {path} :{e}"

    return 'Saved the file successfully!'

In [7]:
# Write the scraped quotes to 'quotes_data.csv'; the returned status string is the cell output
save_data(data, file_name='quotes_data')

'Saved the file successfully!'