In [2]:
import re
import requests
from bs4 import BeautifulSoup
import sqlite3
import json
import datetime

def get_authors(text, string=True):
  try:
    return text.split('by: ')[1].split(', ')
  except:
    if string: return text.join(', ')
    return  [None]

# Send a GET request to the URL of this page
def generate_link(page=1, audible_programs="20956260011", author_author="", keywords="", narrator="full-cast", publisher="", sort="review-rank", title="", pageSize=50):
  base_url = "https://www.audible.com/search?"
  params = {
    "audible_programs": audible_programs,
    "author_author": author_author,
    "keywords": keywords,
    "narrator": narrator,
    "pageSize": pageSize,
    "publisher": publisher,
    "sort": sort,
    "title": title,
    "ref": "a_search_l1_audible_programs_0",
    "pf_rd_p": "daf0f1c8-2865-4989-87fb-15115ba5a6d2",
    "pf_rd_r": "3CSM3Q3AG46QRQ0TVK0F",
    "pageLoadId": "dELu6hUurPGV8fAu",
    "creativeId": "9648f6bf-4f29-4fb4-9489-33163c0bb63e"
  }
  if page > 1:
    params["page"] = page
  query = "&".join([f"{key}={value}" for key, value in params.items()])
  return base_url + query



# convert string to date object
def string_to_date(text):
    '''
    Convert string to date object
    datetime.date
        year in float
        ex: 2013.2993150684931
    '''
    if text == None:
        return None
    elif 'Release date: ' in text:
        month, day, year = text.split('Release date: ')[1].split('-')
        year = "20"+year
        # month, day, year = map(int, text.split('-'))
        date =  datetime.date(int(year), int(month), int(day))
    # check if text is float or int
    elif text.isnumeric():
        return text
    return date.year+ date.month/12 + date.day/365

# convert string to date object
def extract_rating(string):
    if string == "Not rated yet" or string == None:
        return None, None
    string = string.split(' out of 5 stars ')
    rating = float(string[0])
    votes = int(string[1].split(' rating')[0].replace(',',''))
    return rating, votes


# convert string to date object
def extract_rating(string):
    if string == "Not rated yet" or string == None:
        return None, None
    string = string.split(' out of 5 stars ')
    rating = float(string[0])
    votes = int(string[1].split(' rating')[0].replace(',',''))
    return rating, votes


# hour and min to min
def hour_min_to_min(tim):
    if tim == None:
        return None
    elif 'min' not in tim:
        return int(tim.split('Length: ')[1].split(' hr')[0])*60
    elif 'hr' not in tim:
        return int(tim.split('Length: ')[1].split(' min')[0])
    else:
        hr = tim.split('Length: ')[1].split(' hr')[0]
        minute = tim.split("and ")[1].split(' min')[0]
    return int(hr)*60 + int(minute)


# Define a function to scrape all details from a page
def scrape_all_details(page):
  """Scrape product details from an Amazon page.

  Args:
    page (str): The URL of the page to scrape.

  Returns:
    list: A list of dictionaries containing product details.
  """
  # Send a GET request to the page and parse the HTML content
  # response = requests.get(page)
  # soup = BeautifulSoup(response.content, "html.parser")

  # Find all the elements that contain the product details
  products = soup.find_all("div", class_="bc-col-responsive bc-col-6")

  # Find all the image links
  img_tags = soup.find_all("img")
  # list of image
  urls = []
  # Loop through the img tags and get the src attribute of each one
  for i, img_tag in enumerate(img_tags):
    try:
      src = img_tag["src"]
      # print(src) # Print the image URL
      urls.append(src)

    except:
      src = None
      urls.append(src)
  # print(src) # Print the image URL
  cover_image = []
  for image_link in urls:
    if "https://m.media-amazon.com/images/I" in image_link or ".jpg" in image_link:
      # print(image_link)
      cover_image.append(image_link)
  if len(cover_image) % 10 != 0:
    print(f"Error: {len(cover_image)} images found")
    return None
  else:
    print(f"Success: {len(cover_image)} images found")


  # Create an empty list to store the details
  details_list = []

  # Loop through each product element and extract the details
  for product in products:
    # Initialize an empty dictionary to store the product details
    details_dict = {}

    # Try to find the title element and handle the exception if not found
    try:
      title = product.find("h3", class_="bc-heading").text.strip()
      details_dict["title"] = title
    except :
      # Assign None if title is not found
      details_dict["title"] = None

    # Try to find the subtitle element and handle the exception if not found
    try:
      subtitle = product.find("li", class_="bc-list-item subtitle").text.strip()
      details_dict["subtitle"] = subtitle
    except :
      # Assign None if subtitle is not found
      details_dict["subtitle"] = None

    # Try to find the author element and handle the exception if not found
    try:
      author = product.find("li", class_="authorLabel").text.strip()
      details_dict["author"] = author.split("By: ")[1]
    except :
      # Assign None if author is not found
      details_dict["author"] = None

    # Try to find the narrator element and handle the exception if not found
    try:
      narrator = product.find("li", class_="narratorLabel").text.strip()
      details_dict["narrator"] = narrator.split("Narrated by: ")[1]
    except :
      # Assign None if narrator is not found
      details_dict["narrator"] = None

    # Try to find the series element and handle the exception if not found
    try:
      series = product.find("li", class_="seriesLabel").text.strip()
      details_dict["series"] = series.split("Series: ")[1]
    except :
      # Assign None if series is not found
      details_dict["series"] = None

    # Try to find the length element and handle the exception if not found
    try:
      length = product.find("li", class_="runtimeLabel").text.strip()
      details_dict["length"] = hour_min_to_min(length)
    except :
      # Assign None if length is not found
      details_dict["length"] = None

    # Try to find the release date element and handle the exception if not found
    try:
      release_date = product.find("li", class_="releaseDateLabel").text.strip()
      details_dict["release_date"] = release_date.split("Release date: ")[1]
    except :
      # Assign None if release date is not found
      details_dict["release_date"] = None

    # Try to find the language element and handle the exception if not found
    try:
      language = product.find("li", class_="languageLabel").text.strip()
      details_dict["language"] = language.split("Language: ")[1]
    except :
      # Assign None if language is not found
      details_dict["language"] = None

    # Try to find the summary element and handle the exception if not found
    try:
      summary = product.find("p", class_="bc-text").text.strip()
      details_dict["summary"] = summary
    except :
      # Assign None if summary is not found
      details_dict["summary"] = None

    # Try to find the image element and handle the exception if not found
    try:
      image = product.find("img").get("src")
      details_dict["image"] = image
    except :
      # Assign None if image is not found
      details_dict["image"] = None

    # Try to find the link element and handle the exception if not found
    try:
      link = product.find("a", class_="bc-link bc-color-link").get("href")
      details_dict["link"] = link
    except :
      # Assign None if link is not found
      details_dict["link"] = None
    
    # Try to find the ratings element and handle the exception if not found
    try:
      ratings = product.find("li", class_="ratingsLabel").text.strip()
      details_dict["ratings"] = ratings
    except :
      # Assign None if ratings is not found
      details_dict["ratings"] = None

    # Format the values using strip and replace methods
    for key, value in details_dict.items():
      # Remove leading and trailing whitespaces
      try:
        value = value.strip()
              # Replace multiple whitespaces with a single space using re.sub
        value = re.sub("\s+", " ", value)
        # Update the dictionary with the formatted value
        details_dict[key] = value
      except :
        pass

    # Append the dictionary to the list
    if details_dict["title"] is not None:
      details_dict["ratings"], details_dict["votes"] = extract_rating(details_dict["ratings"])
      details_list.append(details_dict)

  # Add the image link to the dictionary
  for i in range(len(details_list)):
    details_list[i]["image"] = cover_image[i]

  # Return the list with all the details
  return details_list

request = requests.get(generate_link())
soup = BeautifulSoup(request.text, "html.parser")

data = scrape_all_details(generate_link())

Success: 50 images found


In [49]:

# Define a class for the database operations
class AudibleDB:

    # Define a method to create the database and table
    def create_db(self):

        # Connect to the database file or create it if it does not exist
        self.conn = sqlite3.connect("audible.db")

        # Create a cursor object to execute SQL commands
        self.cur = self.conn.cursor()

        # Create a table called audiobooks with the following columns and data types
        # Create a table called audiobooks with the following columns and data types
        self.cur.execute("""CREATE TABLE IF NOT EXISTS audiobooks (
                        title TEXT,
                        subtitle TEXT,
                        author TEXT,
                        narrator TEXT,
                        series TEXT,
                        length INTEGER,
                        release_date TEXT,
                        language TEXT,
                        summary TEXT,
                        image TEXT,
                        link TEXT PRIMARY KEY,
                        ratings REAL,
                        votes INTEGER
                    )
                    """)


        # Commit the changes to the database
        self.conn.commit()

    # Define a method to insert data into the table
    def insert_data(self, data):

        # Loop through each item in the data list
        for item in data:

            # Extract the values from the dictionary
            title = item["title"]
            subtitle = item["subtitle"]
            author = item["author"]
            narrator = item["narrator"]
            series = item["series"]
            length = item["length"]
            release_date = item["release_date"]
            language = item["language"]
            summary = item["summary"]
            image = item["image"]
            link = item["link"]
            ratings = item["ratings"]
            votes = item.get("votes", 0) # This will return 0 if "votes" is not in item
            # Insert the values into the table using placeholders and a tuple if it already doesn't exist
            self.cur.execute("""INSERT OR IGNORE INTO audiobooks VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)
                                ON CONFLICT(link) DO NOTHING;
                                """,
                             (title, subtitle, author, narrator, series, length, release_date, language, summary, image, link, ratings, votes))
                             # print if data is inserted
            print(f"Success: {title} inserted")

        # Commit the changes to the database
        self.conn.commit()
    
    def read_data(self, **kwargs):

        # Create a query string to filter the data
        query_string = ""
        for key, value in kwargs.items():
            if value is not None:
                query_string += f" AND {key} = '{value}'"

        # Execute the query and get the results
        self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
        results = self.cur.fetchall()

        # Return the results
        return results
        
    # Define a method to read the database
    # def read_data(self, author=None, narrator=None, series=None, language=None, min_length=None, min_rating=None, min_votes=None, search=None, sort_by=None, sort_order=None):

    #     # Create a query string to filter the data
    #     query_string = ""
    #     for key, value in locals().items():
    #         if value is not None:
    #             if key == "min_length":
    #                 query_string += f" AND length >= '{value}'"
    #             elif key == "min_rating":
    #                 query_string += f" AND ratings >= {value}"
    #             elif key == "min_votes":
    #                 query_string += f" AND votes >= {value}"
    #             elif key == "search":
    #                 query_string += f" AND (title LIKE '%{value}%' OR subtitle LIKE '%{value}%' OR summary LIKE '%{value}%')"
    #             else:
    #                 query_string += f" AND {key} = '{value}'"

    #     # Add the sorting option if specified
    #     if sort_by is not None and sort_order is not None:
    #         query_string += f" ORDER BY {sort_by} {sort_order}"

    #     # Execute the query and get the results
    #     self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
    #     results = self.cur.fetchall()

    #     # Return the results
    #     return results

    # def read_data(self, **kwargs):
    #     # Construct the query string based on the provided arguments
    #     query_string = ""
    #     if kwargs.get("author"):
    #         query_string += f" AND author='{kwargs['author']}'"
    #     if kwargs.get("narrator"):
    #         query_string += f" AND narrator='{kwargs['narrator']}'"
    #     if kwargs.get("series"):
    #         query_string += f" AND series='{kwargs['series']}'"
    #     if kwargs.get("language"):
    #         query_string += f" AND language='{kwargs['language']}'"
    #     if kwargs.get("min_length"):
    #         query_string += f" AND length>={kwargs['min_length']}"
    #     if kwargs.get("min_rating"):
    #         query_string += f" AND ratings>={kwargs['min_rating']}"
    #     if kwargs.get("min_votes"):
    #         query_string += f" AND votes>={kwargs['min_votes']}"
    #     if kwargs.get("search"):
    #         search_terms = kwargs["search"].split()
    #         for term in search_terms:
    #             query_string += f" AND (title LIKE '%{term}%' OR subtitle LIKE '%{term}%' OR author LIKE '%{term}%' OR narrator LIKE '%{term}%' OR summary LIKE '%{term}%')"
    #     if kwargs.get("sort_by"):
    #         sort_by = kwargs["sort_by"]
    #         if sort_by not in ["title", "author", "narrator", "series", "length", "release_date", "language", "ratings", "votes"]:
    #             sort_by = "title"
    #     else:
    #         sort_by = "title"
    #     if kwargs.get("sort_order"):
    #         sort_order = kwargs["sort_order"]
    #         if sort_order not in ["ASC", "DESC"]:
    #             sort_order = "ASC"
    #     else:
    #         sort_order = "ASC"
    #     query_string += f" ORDER BY {sort_by} {sort_order}"
    #     # Execute the query and get the results
    #     self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
    #     results = self.cur.fetchall()
    #     # Return the results
    #     print(results)
    #     return results

    def read_data(self, **kwargs):
        # Define the base query
        query = "SELECT * FROM audiobooks WHERE 1=1"

        # Define the parameters for the query
        params = []

        # Add the filters to the query and parameters
        if kwargs.get("author"):
            query += " AND author=?"
            params.append(kwargs["author"])
        if kwargs.get("narrator"):
            query += " AND narrator=?"
            params.append(kwargs["narrator"])
        if kwargs.get("series"):
            query += " AND series=?"
            params.append(kwargs["series"])
        if kwargs.get("language"):
            query += " AND language=?"
            params.append(kwargs["language"])
        if kwargs.get("min_length"):
            query += " AND length>=?"
            params.append(kwargs["min_length"])
        if kwargs.get("min_rating"):
            query += " AND ratings>=?"
            params.append(kwargs["min_rating"])
        if kwargs.get("min_votes"):
            query += " AND votes>=?"
            params.append(kwargs["min_votes"])
        if kwargs.get("search"):
            search_terms = kwargs["search"].split()
            for term in search_terms:
                query += " AND (title LIKE ? OR subtitle LIKE ? OR author LIKE ? OR narrator LIKE ? OR summary LIKE ?)"
                params.extend(["%{}%".format(term)] * 5)

        # Add the sorting to the query
        sort_by = kwargs.get("sort_by", "title")
        sort_order = kwargs.get("sort_order", "ASC")
        query += " ORDER BY {} {}".format(sort_by, sort_order)

        # Execute the query and get the results
        self.cur.execute(query, params)
        results = self.cur.fetchall()

        # Return the results
        return results

    # Define a method to close the connection to the database
    def close_db(self):
        self.conn.close()


# Create an instance of the class
db = AudibleDB()

# Call the create_db method to create the database and table
db.create_db()

# Call the insert_data method to insert the data into the table
# db.insert_data(data)

da = db.read_data(min_votes=100, sort_by="ratings")

# 
db.close_db()


In [50]:
da

[('Have a Nice Day',
  None,
  None,
  None,
  None,
  106,
  None,
  None,
  None,
  'https://m.media-amazon.com/images/I/51NGkd6hOLL._SL500_.jpg',
  '/pd/Have-a-Nice-Day-Audiobook/B07GXSPKSR',
  4.5,
  25560),
 ('Dragon Planet',
  None,
  None,
  None,
  None,
  225,
  None,
  None,
  None,
  'https://m.media-amazon.com/images/I/51uh6OvizhL._SL500_.jpg',
  '/pd/Dragon-Planet-Audiobook/B07X7J8YJ3',
  4.5,
  3607),
 ('Hit Job',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  'https://m.media-amazon.com/images/I/51Aay40lkCS._SL500_.jpg',
  '/pd/Hit-Job-Podcast/B091ZH3RCM',
  4.5,
  1964),
 ('Oracle 2: The Dreamland Murders',
  'Oracle',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  'https://m.media-amazon.com/images/I/51rhjUWCxjL._SL500_.jpg',
  '/pd/Oracle-The-Dreamland-Murders-Podcast/B09CLMJPJ4',
  4.5,
  1552),
 ('Hot White Heist',
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  'https://m.media-amazon.com/images/I/516yPwKkg8S._SL500_.

In [9]:
```python
# Define a class for the database operations
class AudibleDB:

    # Define a method to create the database and table
    def create_db(self):

        # Connect to the database file or create it if it does not exist
        self.conn = sqlite3.connect("audible.db")

        # Create a cursor object to execute SQL commands
        self.cur = self.conn.cursor()

        # Create a table called audiobooks with the following columns and data types
        # Create a table called audiobooks with the following columns and data types
        self.cur.execute("""CREATE TABLE IF NOT EXISTS audiobooks (
                        title TEXT,
                        subtitle TEXT,
                        author TEXT,
                        narrator TEXT,
                        series TEXT,
                        length INTEGER,
                        release_date TEXT,
                        language TEXT,
                        summary TEXT,
                        image TEXT,
                        link TEXT PRIMARY KEY,
                        ratings REAL,
                        votes INTEGER
                    )
                    """)


        # Commit the changes to the database
        self.conn.commit()

    # Define a method to insert data into the table
    def insert_data(self, data):

        # Loop through each item in the data list
        for item in data:

            # Extract the values from the dictionary
            title = item["title"]
            subtitle = item["subtitle"]
            author = item["author"]
            narrator = item["narrator"]
            series = item["series"]
            length = item["length"]
            release_date = item["release_date"]
            language = item["language"]
            summary = item["summary"]
            image = item["image"]
            link = item["link"]
            ratings = item["ratings"]
            votes = item.get("votes", 0) # This will return 0 if "votes" is not in item
            # Insert the values into the table using placeholders and a tuple if it already doesn't exist
            self.cur.execute("""INSERT OR IGNORE INTO audiobooks VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)
                                ON CONFLICT(link) DO NOTHING;
                                """,
                             (title, subtitle, author, narrator, series, length, release_date, language, summary, image, link, ratings, votes))
                             # print if data is inserted
            print(f"Success: {title} inserted")

        # Commit the changes to the database
        self.conn.commit()
    
    def read_data(self, **kwargs):

        # Create a query string to filter the data
        query_string = ""
        for key, value in kwargs.items():
            if value is not None:
                query_string += f" AND {key} = '{value}'"

        # Execute the query and get the results
        self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
        results = self.cur.fetchall()

        # Return the results
        return results
        
    # Define a method to read the database
    def read_data(self, author=None, narrator=None, series=None, language=None, min_length=None, min_rating=None, min_votes=None, search=None, sort_by=None, sort_order=None):

        # Create a query string to filter the data
        query_string = ""
        for key, value in locals().items():
            if value is not None:
                if key == "min_length":
                    query_string += f" AND length >= '{value}'"
                elif key == "min_rating":
                    query_string += f" AND ratings >= {value}"
                elif key == "min_votes":
                    query_string += f" AND votes >= {value}"
                elif key == "search":
                    query_string += f" AND (title LIKE '%{value}%' OR subtitle LIKE '%{value}%' OR summary LIKE '%{value}%')"
                else:
                    query_string += f" AND {key} = '{value}'"

        # Add the sorting option if specified
        if sort_by is not None and sort_order is not None:
            query_string += f" ORDER BY {sort_by} {sort_order}"

        # Execute the query and get the results
        self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
        results = self.cur.fetchall()

        # Return the results
        return results
```

OperationalError                          Traceback (most recent call last)
c:\Users\saket\Documents\GitHub\Pyhton\web scraping\test.ipynb Cell 2 in ()
    120 db.create_db()
    122 # Call the insert_data method to insert the data into the table
    123 # db.insert_data(data)
--> 125 db.read_data(min_votes=1000, sort_by="ratings")
    127 # 
    128 db.close_db()

c:\Users\saket\Documents\GitHub\Pyhton\web scraping\test.ipynb Cell 2 in AudibleDB.read_data(self, author, narrator, series, language, min_length, min_rating, min_votes, search, sort_by, sort_order)
    102     query_string += f" ORDER BY {sort_by} {sort_order}"
    104 # Execute the query and get the results
--> 105 self.cur.execute("""SELECT * FROM audiobooks WHERE 1 {query_string}""".format(query_string=query_string))
    106 results = self.cur.fetchall()
    108 # Return the results

OperationalError: no such column: self

SyntaxError: invalid syntax (294752858.py, line 1)

In [74]:

def scrape_all_details(page):
# Send a GET request to the page and parse the HTML content
  # response = requests.get(page)
  # soup = BeautifulSoup(response.content, "html.parser")

  # Find all the elements that contain the product details
  products = soup.find_all("div", class_="bc-col-responsive bc-col-6")

  # Create an empty list to store the details
  details_list = []

  img_tags = soup.find_all("img")
  # list of image
  urls = []
  # Loop through the img tags and get the src attribute of each one
  for i, img_tag in enumerate(img_tags):
    try:
      src = img_tag["src"]
      # print(src) # Print the image URL
      urls.append(src)

    except:
      src = None
      urls.append(src)
      # print(src) # Print the image URL
  cover_image = []
  for image_link in urls:
    if "https://m.media-amazon.com/images/I" in image_link or ".jpg" in image_link:
      # print(image_link)
      cover_image.append(image_link)
  if len(cover_image) % 10 != 0:
    print(f"Error: {len(cover_image)} images found")
    return None
  else:
    print(f"Success: {len(cover_image)} images found")

# Loop through each product element and extract the details
  for product in products:
    # Try to find the title element and handle the exception if not found
    try:
      title = product.find("h3", class_="bc-heading").text.strip()
    except AttributeError:
      title = None
      continue
    # Try to find the subtitle element and handle the exception if not found
    try:
      # get the li element with class subtitle
      subtitle = product.find("li", class_="bc-list-item subtitle").text.strip()
    except AttributeError:
      subtitle = None

    # Try to find the author element and handle the exception if not found
    try:
      author = product.find("li", class_="authorLabel").text.strip()
    except AttributeError:
      author = None
    # Try to find the narrator element and handle the exception if not found
    try:
      narrator = product.find("li", class_="narratorLabel").text.strip()
    except AttributeError:
      narrator = None
    try:
      series = product.find("li", class_="seriesLabel").text.strip()
    except AttributeError:
      series = None
    try:
      length = product.find("li", class_="runtimeLabel").text.strip()
    except AttributeError:
      length = None
    try:
      release_date = product.find("li", class_="releaseDateLabel").text.strip() 
    except AttributeError:
      release_date = None
    try:
      language = product.find("li", class_="languageLabel").text.strip()
    except AttributeError:
      language = None

    try:
      ratings = product.find("li", class_="ratingsLabel").text.strip()
    except AttributeError:
      ratings = None

    # Try to find the summary element and handle the exception if not found
    try:
      summary = product.find("p", class_="bc-text").text.strip()
    except AttributeError:
      summary = None

    image = None

    # Try to find the link element and handle the exception if not found
    try:
      link = product.find("a", class_="bc-link bc-color-link").get("href")
    except AttributeError:
      link = None

    # Create a dictionary with the product details
    details_dict = {
      "title"        : title,
      "subtitle"     : subtitle,
      "author"       : author,
      "narrator"     : narrator,
      "series"       : series,
      "length"       : length,
      "release_date" : release_date,
      "language"     : language,
      "ratings"      : ratings,
      "vote"         : None,
      "summary"      : summary,
      "image"        : image, # Add this line
      "link"         : link # Add this line
    }
    # Format the values using strip and replace methods
    for key, value in details_dict.items():
      # Remove leading and trailing whitespaces
      if value is None: continue
      value = value.strip()
      # Replace multiple whitespaces with a single space using re.sub
      value = re.sub("\s+", " ", value)
      # Update the dictionary with the formatted value
      details_dict[key] = value
    
    # print(details_dict)
    # break
    # series
    
    # Append the dictionary to the list
    details_list.append(details_dict)
    try:
      details_dict['series'] = details_dict['series'].split('Series: ')[1]
    except:
      details_dict['series'] = None
    # details_dict['author'] = get_authors(details_dict['author'])

    # narrator
    try:
      details_dict['narrator'] = get_authors(details_dict['narrator'])
    except:
      details_dict['narrator'] = None
    # modify length
    details_dict['length'] = hour_min_to_min(details_dict['length'])
    # language
    try:
      details_dict['language'] = details_dict['language'].split('Language: ')[1]
    except:
      details_dict['language'] = None
    # add vote
    details_dict['vote'] = extract_rating(details_dict['ratings'])[1]
    # modify ratings
    details_dict['ratings'] = extract_rating(details_dict['ratings'])[0]
    # modify release date
    details_dict['release_date'] = string_to_date(details_dict['release_date'])
    
    try:
      details_dict['author'] = ", ".join(details_dict['author'])
    except:
        details_dict['author'] = None

    try:
        details_dict['narrator'] = ", ".join(details_dict['narrator'])
    except:
        details_dict['narrator'] = None

  # add cover image to the dictionary in the list
  for i in range(len(details_list)):
    details_list[i]["image"] = cover_image[i]

  # Return the list with all the details
  return details_list

import sqlite3

class StoreData:
    def __init__(self):
        self.conn = sqlite3.connect('mydatabase.db')
        self.cursor = self.conn.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS mytable(
                            title TEXT,
                            subtitle TEXT, 
                            author TEXT, 
                            narrator TEXT,
                            series TEXT,
                            length INTEGER, 
                            release_date REAL,
                            language TEXT, 
                            ratings REAL, 
                            vote INTEGER,
                            summary TEXT, 
                            image TEXT, 
                            link TEXT)
                            ''')
        self.conn.commit()

    def store_data(self, data):
        # print(data)
        self.cursor.execute(f"INSERT INTO mytable VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                            (data['title'],
                            print(data['title']),
                            data['subtitle'],
                            data['author'],
                            data['narrator'],
                            data['series'],
                            data['length'],
                            data['release_date'],
                            data['language'],
                            data['ratings'],
                            data['vote'],
                            data['summary'],
                            data['image'],
                            data['link']))
        self.conn.commit()
# db = StoreData()

# db.store_data(scrape_all_details(generate_link()))
scrape_all_details(generate_link())



In [75]:
# def scrape_all_details(page):
#   # Send a GET request to the page and parse the HTML content
# #   response = requests.get(page)
#   # Check if the response is successful
# #   response.raise_for_status()
# #   soup = BeautifulSoup(response.content, "html.parser")

#   # Find all the elements that contain the product details
#   products = soup.find_all("div", class_="bc-col-responsive bc-col-6 bc-list-item")

#   # Create an empty list to store the details
#   details_list = []

#   # Find all the img tags that have a src attribute ending with .jpg
#   img_tags = soup.find_all("img", attrs={"src": re.compile("\.jpg$")})
#   # Create a list of image URLs from the src attribute of each img tag
#   cover_image = [img_tag["src"] for img_tag in img_tags]

#   # Loop through each product element and extract the details
#   for product in products:
#     # Try to find the title element and handle the exception if not found
#     try:
#       title = product.find("h3", class_="bc-heading").text.strip().title()
#     except AttributeError:
#       title = None
#       continue
#     # Try to find the subtitle element and handle the exception if not found
#     try:
#       # get the li element with class subtitle
#       subtitle = product.find("li", class_="bc-list-item subtitle").text.strip().title()
#     except AttributeError:
#       subtitle = None

#     # Try to find the author element and handle the exception if not found
#     try:
#       author = product.find("li", class_="authorLabel").text.strip().title()
#     except AttributeError:
#       author = None
#     # Try to find the narrator element and handle the exception if not found
#     try:
#       narrator = product.find("li", class_="narratorLabel").text.strip().title()
#     except AttributeError:
#       narrator = None
#     try:
#       series = product.find("li", class_="seriesLabel").text.strip().title()
#     except AttributeError:
#       series = None
#     try:
#       length = product.find("li", class_="runtimeLabel").text.strip()
#     except AttributeError:
#       length = None
#     try:
#       release_date = product.find("li", class_="releaseDateLabel").text.strip() 
#     except AttributeError:
#       release_date = None
#     try:
#       language = product.find("li", class_="languageLabel").text.strip().title()
#     except AttributeError:
#       language = None

#     try:
#       ratings = product.find("li", class_="ratingsLabel").text.strip()
#     except AttributeError:
#       ratings = None

#     # Try to find the summary element and handle the exception if not found
#     try:
#       summary = product.find("p", class_="bc-text").text.strip()
#     except AttributeError:
#       summary = None

#     image = None

#     # Try to find the link element and handle the exception if not found
#     try:
#       link = product.find("a", class_="bc-link bc-color-link").get("href")
#     except AttributeError:
#       link = None

#     # Create a dictionary with the product details
#     details_dict = {
#       "title"        : title,
#       "subtitle"     : subtitle,
#       "author"       : author,
#       "narrator"     : narrator,
#       "series"       : series,
#       "length"       : length,
#       "release_date" : release_date,
#       "language"     : language,
#       "ratings"      : ratings,
#       "vote"         : None,
#       "summary"      : summary,
#       "image"        : image, 
#       "link"         : link 
#     }
    
#     # Split the series field by Series: and get the second element if it exists
#     details_dict['series'] = details_dict['series'].split('Series: ')[-1] if details_dict['series'] else None
    
#     # Split the author and narrator fields by commas and get a list of names
#     details_dict['author'] = details_dict['author'].split(', ') if details_dict['author'] else None
#     details_dict['narrator'] = details_dict['narrator'].split(', ') if details_dict['narrator'] else None
    
#     # Convert the length field from hours and minutes to minutes using a helper function
#     details_dict['length'] = hour_min_to_min(details_dict['length']) if details_dict['length'] else None
    
#     # Split the language field by Language: and get the second element if it exists
#     details_dict['language'] = details_dict['language'].split('Language: ')[-1] if details_dict['language'] else None
    
#     # Extract the ratings and vote fields from the ratings field using a helper function
#     details_dict['ratings'], details_dict['vote'] = extract_rating(details_dict['ratings']) if details_dict['ratings'] else (None, None)
    
#     # Convert the release date field from string to date using a helper function
#     details_dict['release_date'] = string_to_date(details_dict['release_date']) if details_dict['release_date'] else None
    
#     # Join the author and narrator fields with commas if they are not None
#     details_dict['author'] = ", ".join(details_dict['author']) if details_dict['author'] else None
#     details_dict['narrator'] = ", ".join(details_dict['narrator']) if details_dict['narrator'] else None

#     # Append the dictionary to the list
#     details_list.append(details_dict)

#   # Loop through the details_list and cover_image lists using zip and assign the image URLs to the details_dict
#   for details_dict, image_url in zip(details_list, cover_image):
#     details_dict["image"] = image_url

#   # Return the list with all the details
#   return details_list
# scrape_all_details(generate_link())

In [76]:
# def scrape_all_details(page):
#   # Send a GET request to the page and parse the HTML content
# #   response = requests.get(page)
# #   # Check if the response is successful
# #   response.raise_for_status()
# #   soup = BeautifulSoup(response.content, "html.parser")

#   # Find all the elements that contain the product details
#   products = soup.find_all("div", class_="bc-col-responsive bc-col-6 bc-list-item")

#   # Find all the img tags that have a src attribute ending with .jpg
#   img_tags = soup.find_all("img", attrs={"src": re.compile("\.jpg$")})
#   # Create a list of image URLs from the src attribute of each img tag
#   cover_image = [img_tag["src"] for img_tag in img_tags]

#   # Define a helper function to get the text from an element and format it
#   def get_text(element):
#     if element:
#       return element.text.strip().title()
#     else:
#       return None

#   # Define a helper function to get the text from a list item with a specific class
#   def get_list_item(product, class_name):
#     return get_text(product.find("li", class_=class_name))

#   # Define a helper function to split the text by a separator and get the second element if it exists
#   def split_text(text, separator):
#     if text:
#       return text.split(separator)[-1]
#     else:
#       return None

#   # Define a helper function to convert the length from hours and minutes to minutes
#   def hour_min_to_min(length):
#     if length:
#       hours, minutes = map(int, re.findall("\d+", length))
#       return hours * 60 + minutes
#     else:
#       return None

#   # Define a helper function to extract the ratings and vote from the ratings field
#   def extract_rating(ratings):
#     if ratings:
#       rating, vote = re.findall("\d+\.\d+|\d+", ratings)
#       return float(rating), int(vote)
#     else:
#       return None, None

#   # Define a helper function to convert the release date from string to date
#   def string_to_date(release_date):
#     if release_date:
#       date_str = re.search("\w+ \d+, \d+", release_date).group()
#       return datetime.strptime(date_str, "%B %d, %Y").date()
#     else:
#       return None

#   # Create an empty list to store the details
#   details_list = []

#   # Loop through each product element and extract the details using list comprehensions and lambda functions
#   for product in products:
#     details_dict = {
#       "title"        : get_text(product.find("h3", class_="bc-heading")),
#       "subtitle"     : get_list_item(product, "bc-list-item subtitle"),
#       "author"       : ", ".join(get_list_item(product, "authorLabel").split(", ")) if get_list_item(product, "authorLabel") else None,
#       "narrator"     : ", ".join(get_list_item(product, "narratorLabel").split(", ")) if get_list_item(product, "narratorLabel") else None,
#       "series"       : split_text(get_list_item(product, "seriesLabel"), "Series: "),
#       "length"       : hour_min_to_min(get_list_item(product, "runtimeLabel")),
#       "release_date" : string_to_date(get_list_item(product, "releaseDateLabel")),
#       "language"     : split_text(get_list_item(product, "languageLabel"), "Language: "),
#       "ratings"      : extract_rating(get_list_item(product, "ratingsLabel"))[0],
#       "vote"         : extract_rating(get_list_item(product, "ratingsLabel"))[1],
#       "summary"      : get_text(product.find("p", class_="bc-text")),
#       "image"        : None,
#       "link"         : product.find("a", class_="bc-link bc-color-link").get("href") if product.find("a", class_="bc-link bc-color-link") else None
#     }
#     # Append the dictionary to the list
#     details_list.append(details_dict)

#   # Loop through the details_list and cover_image lists using zip and assign the image URLs to the details_dict using f-strings
#   for details_dict, image_url in zip(details_list, cover_image):
#     details_dict["image"] = f"https:{image_url}"

#   # Return the list with all the details
#   return details_list


In [77]:
# scrape_all_details(generate_link())

In [6]:
# print(generate_link())
response = requests.get(generate_link())
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# import datetime

# def get_authors(text):
#   try:
#     return text.split('by: ')[1].split(', ')
#   except:
#     return  None

# # Send a GET request to the URL of this page
# def generate_link(page=1, audible_programs="20956260011", author_author="", keywords="", narrator="full-cast", publisher="", sort="review-rank", title="", pageSize=50):
#   base_url = "https://www.audible.com/search?"
#   params = {
#     "audible_programs": audible_programs,
#     "author_author": author_author,
#     "keywords": keywords,
#     "narrator": narrator,
#     "pageSize": pageSize,
#     "publisher": publisher,
#     "sort": sort,
#     "title": title,
#     "ref": "a_search_l1_audible_programs_0",
#     "pf_rd_p": "daf0f1c8-2865-4989-87fb-15115ba5a6d2",
#     "pf_rd_r": "3CSM3Q3AG46QRQ0TVK0F",
#     "pageLoadId": "dELu6hUurPGV8fAu",
#     "creativeId": "9648f6bf-4f29-4fb4-9489-33163c0bb63e"
#   }
#   if page > 1:
#     params["page"] = page
#   query = "&".join([f"{key}={value}" for key, value in params.items()])
#   return base_url + query


# def scrape_all_details(page):
# # Send a GET request to the page and parse the HTML content
#   # response = requests.get(page)
#   # soup = BeautifulSoup(response.content, "html.parser")

#   # Find all the elements that contain the product details
#   products = soup.find_all("div", class_="bc-col-responsive bc-col-6")

#   # Create an empty list to store the details
#   details_list = []

#   img_tags = soup.find_all("img")
#   # list of image
#   urls = []
#   # Loop through the img tags and get the src attribute of each one
#   for i, img_tag in enumerate(img_tags):
#     try:
#       src = img_tag["src"]
#       # print(src) # Print the image URL
#       urls.append(src)

#     except:
#       src = None
#       urls.append(src)
#       # print(src) # Print the image URL
#   cover_image = []
#   for image_link in urls:
#     if "https://m.media-amazon.com/images/I" in image_link or ".jpg" in image_link:
#       # print(image_link)
#       cover_image.append(image_link)
#   if len(cover_image) % 10 != 0:
#     print(f"Error: {len(cover_image)} images found")
#     return None
#   else:
#     print(f"Success: {len(cover_image)} images found")

# # Loop through each product element and extract the details
#   for product in products:
#     # Try to find the title element and handle the exception if not found
#     try:
#       title = product.find("h3", class_="bc-heading").text.strip()
#     except AttributeError:
#       title = None
#       continue
#     # Try to find the subtitle element and handle the exception if not found
#     try:
#       # get the li element with class subtitle
#       subtitle = product.find("li", class_="bc-list-item subtitle").text.strip()
#     except AttributeError:
#       subtitle = None

#     # Try to find the author element and handle the exception if not found
#     try:
#       author = product.find("li", class_="authorLabel").text.strip()
#     except AttributeError:
#       author = None
#     # Try to find the narrator element and handle the exception if not found
#     try:
#       narrator = product.find("li", class_="narratorLabel").text.strip()
#     except AttributeError:
#       narrator = None
#     try:
#       series = product.find("li", class_="seriesLabel").text.strip()
#     except AttributeError:
#       series = None
#     try:
#       length = product.find("li", class_="runtimeLabel").text.strip()
#     except AttributeError:
#       length = None
#     try:
#       release_date = product.find("li", class_="releaseDateLabel").text.strip() 
#     except AttributeError:
#       release_date = None
#     try:
#       language = product.find("li", class_="languageLabel").text.strip()
#     except AttributeError:
#       language = None

#     try:
#       ratings = product.find("li", class_="ratingsLabel").text.strip()
#     except AttributeError:
#       ratings = None

#     # Try to find the summary element and handle the exception if not found
#     try:
#       summary = product.find("p", class_="bc-text").text.strip()
#     except AttributeError:
#       summary = None

#     image = None

#     # Try to find the link element and handle the exception if not found
#     try:
#       link = product.find("a", class_="bc-link bc-color-link").get("href")
#     except AttributeError:
#       link = None

#     # Create a dictionary with the product details
#     details_dict = {
#       "title"        : title,
#       "subtitle"     : subtitle,
#       "author"       : author,
#       "narrator"     : narrator,
#       "series"       : series,
#       "length"       : length,
#       "release_date" : release_date,
#       "language"     : language,
#       "ratings"      : ratings,
#       "vote"         : None,
#       "summary"      : summary,
#       "image"        : image, # Add this line
#       "link"         : link # Add this line
#     }
#     # Format the values using strip and replace methods
#     for key, value in details_dict.items():
#       # Remove leading and trailing whitespaces
#       if value is None: continue
#       value = value.strip()
#       # Replace multiple whitespaces with a single space using re.sub
#       value = re.sub("\s+", " ", value)
#       # Update the dictionary with the formatted value
#       details_dict[key] = value
    
#     # print(details_dict)
#     # break
#     # series
    
#     # Append the dictionary to the list
#     details_list.append(details_dict)
#     try:
#       details_dict['series'] = details_dict['series'].split('Series: ')[1]
#     except:
#       details_dict['series'] = None
#     details_dict['author'] = get_authors(details_dict['author'])

#     # narrator
#     try:
#       details_dict['narrator'] = get_authors(details_dict['narrator'])
#     except:
#       details_dict['narrator'] = None
#     # modify length
#     details_dict['length'] = hour_min_to_min(details_dict['length'])
#     # language
#     try:
#       details_dict['language'] = details_dict['language'].split('Language: ')[1]
#     except:
#       details_dict['language'] = None
#     # add vote
#     details_dict['vote'] = extract_rating(details_dict['ratings'])[1]
#     # modify ratings
#     details_dict['ratings'] = extract_rating(details_dict['ratings'])[0]
#     # modify release date
#     details_dict['release_date'] = string_to_date(details_dict['release_date'])
    
#     try:
#       details_dict['author'] = ", ".join(details_dict['author'])
#     except:
#         details_dict['author'] = None

#     try:
#         details_dict['narrator'] = ", ".join(details_dict['narrator'])
#     except:
#         details_dict['narrator'] = None

#   # add cover image to the dictionary in the list
#   for i in range(len(details_list)):
#     details_list[i]["image"] = cover_image[i]

#   # Return the list with all the details
#   return details_list

# data = scrape_all_details(generate_link())

Success: 50 images found


In [4]:
# 


NameError: name 'data' is not defined

None


In [2]:
# nerrated = "Narrated by: full cast, Mort Shelby, James Lewis, Elizabeth Jernigan, Terence Aselford, Richard Rohan, Tim Getman, Steven Carpenter, David Coyne, Tim Carlin, Thomas Penny, Lily Beacon"

# get_authors(nerrated)

# for i in data:
#   if get_authors(i['narrator'])[0] != None:
#     print(get_authors(i['narrator']))


In [1]:
# for i in data:
#   print(i)
#   print(get_authors(i['author']))
# # authors = [i['author'] for i in data if i['author'] != None and "," in i['author']]

# # print(authors)


# # for i in data:
# #   get_authors(i['author'])
# # len(data)

In [26]:
```python
dic = {'title': 'Revenge of the Mountain Man [Dramatized Adaptation]',
     'subtitle': 'Smoke Jensen: The Mountain Man, Book 4',
     'author': [None],
     'narrator': ['full cast', 'Mort Shelby', 'James Lewis', 'Joe Brack', 'Elizabeth Jernigan', 'Dylan Lynch', 'David Coyne', 'Tim Getman', 'Richard Rohan', 'Terence Aselford', 'Michael Glenn', 'Christopher Graybill'],
     'series': 'Mountain Man (Johnstone), Book 4, Dramatized Adaptation',
     'length': 364,
     'release_date': 2020.9550228310502,
     'language': 'English',
     'ratings': 5.0,
     'vote': 35,
     'summary': None,
     'image': 'https://m.media-amazon.com/images/I/61CyC23FFwL._SL500_.jpg',
     'link': '/pd/Revenge-of-the-Mountain-Man-Dramatized-Adaptation-Audiobook/1648805698'}
```

this is the dictionary file that I want to store to a sqlite3 database using python
write a class
    create the table
    create a method to store the data to the database

title
subtitle
author
narrator
series
length
release_date
language
ratings
vote
summary
image
link


NameError: name 'soup' is not defined

In [19]:

# Define a function to create a database and a table
def create_db_table():
  # Connect to the database file or create one if it does not exist
  conn = sqlite3.connect('audiobooks.db')
  # Create a cursor object to execute SQL commands
  c = conn.cursor()
  # Create a table with the columns as the keys of the scraped data
  # Use JSON data type for author and narrator columns
  c.execute('''CREATE TABLE IF NOT EXISTS audiobooks (
    title TEXT,
    subtitle TEXT,
    author JSON,
    narrator JSON,
    series TEXT,
    length INTEGER,
    release_date INTEGER,
    language TEXT,
    ratings INTEGER,
    vote INTEGER,
    summary TEXT,
    image TEXT,
    link TEXT PRIMARY KEY)''')
  # Commit the changes and close the connection
  conn.commit()
  conn.close()


# Define a function to insert scraped data into the table
def insert_data(data):
  # Connect to the database file
  conn = sqlite3.connect('audiobooks.db')
  # Create a cursor object to execute SQL commands
  c = conn.cursor()
  # Loop through the data list and insert each item as a row
  for item in data:
    # Use a try-except block to handle duplicates
    try:
      # Insert the values of the item into the table
      c.execute('''INSERT INTO audiobooks VALUES (
        :title,
        :subtitle,
        :author,
        :narrator,
        :series,
        :length,
        :release_date,
        :language,
        :ratings,
        :vote,
        :summary,
        :image,
        :link)''', item)
      # Print a success message
      print(f"Inserted {item['title']} into the table")
    except sqlite3.IntegrityError:
      # Print an error message if the link already exists in the table
      print(f"Duplicate link: {item['link']}")
  # Commit the changes and close the connection
  conn.commit()
  conn.close()


1
Scraping page 1...


In [9]:
# read the data from the audiobooks.db
conn = sqlite3.connect('audiobooks.db')
c = conn.cursor()
c.execute("SELECT * FROM audiobooks")
data = c.fetchall()
conn.close()

In [10]:
data

[]

In [9]:
# Import sqlite3 and datetime modules
import sqlite3
import datetime



# Define a class to get the data from the database
class DataGetter:

  # Define a constructor that takes a database name as an argument
  def __init__(self, db_name):
    # Connect to the database and create a cursor object
    self.conn = sqlite3.connect(db_name)
    self.cur = self.conn.cursor()

  # Define a function that converts the data from the database into a list of dictionaries
  def get_data(self):
    # Execute a query to select all columns except subtitle from the table
    self.cur.execute("SELECT title, author, narrator, series, length, release_date, language, ratings, summary, image, link FROM audiobooks")
    # Fetch all the rows from the query result
    rows = self.cur.fetchall()
    # Create an empty list to store the converted data
    data = []
    # Loop through each row
    for row in rows:
      # Create an empty dictionary to store the row data
      item = {}
      # Assign the values of each column to the corresponding keys in the dictionary
      item['title'] = row[0]
      item['author'] = row[1]
      item['narrator'] = row[2]
      item['series'] = row[3]
      item['length'] = hour_min_to_min(row[4])
      item['release_date'] = string_to_date(row[5])
      item['language'] = row[6]
      item['rating'], item["votes"] = extract_rating(row[7])
      item['summary'] = row[8]
      item['image'] = row[9]
      item['link'] = row[10]
      # Append the dictionary to the data list
      data.append(item)
    # Return the data list
    return data

  # Define a function that closes the database connection
  def close(self):
    # Close the cursor and the connection objects
    self.cur.close()
    self.conn.close()
audible = DataGetter('audiobooks.db')



[]

In [38]:
db = AudiobookDB()
data = db.get_all_data()
db.close_connection()

In [6]:
data

[]

In [None]:

import requests
from bs4 import BeautifulSoup
from PIL import Image

for i in data:
    image_link = i[10]
    title = i[0]
    star = i[8]
    
    # conver the title to a valid file name
    title = title.replace(":", "")
    # download the image to a folder
    image = Image.open(requests.get(image_link, stream=True).raw)
    # add metadata to the image

    file_loc = f"image/{title}.jpg"
    # there a image folder in the same directory
    # check if the image is already downloaded
    try:
        with open(file_loc) as f:
            pass
    except FileNotFoundError:
        # if not downloaded, download it
        image.save(file_loc)
        print(f"{i[0]}.jpg downloaded")
    break

In [37]:
# # Import sqlite3 and datetime modules
# import sqlite3
# import datetime
# # Import re module for regular expressions
# import re

# # Define a class to get the data from the database
# class DataGetter:

#   # Define a constructor that takes a database name as an argument
#   def __init__(self, db_name):
#     # Connect to the database and create a cursor object
#     self.conn = sqlite3.connect(db_name)
#     self.cur = self.conn.cursor()

#   # Define a function that converts the data from the database into a list of dictionaries
#   def get_data(self):
#     # Execute a query to select all columns except subtitle from the table
#     self.cur.execute("SELECT title, author, narrator, series, length, release_date, language, ratings, summary, image, link FROM audiobooks")
#     # Fetch all the rows from the query result
#     rows = self.cur.fetchall()
#     # Create an empty list to store the converted data
#     data = []
#     # Loop through each row
#     for row in rows:
#       # Create an empty dictionary to store the row data
#       item = {}
#       # Assign the values of each column to the corresponding keys in the dictionary
#       item['title'] = row[0]
#       item['author'] = row[1]
#       item['narrator'] = row[2]
#       item['series'] = row[3]
#       # Check if the length column is None and assign None if so, otherwise convert it from string to integer in minutes
#       if row[4] == None:
#         item['length'] = None
#       else:
#         hours, minutes = map(int, row[4].split(' hrs and '))
#         item['length'] = hours * 60 + minutes
#       # Check if the release date column is None and assign None if so, otherwise convert it from string to datetime object
#       if row[5] == None:
#         item['release_date'] = None
#       else:
#         month, day, year = map(int, row[5].split('-'))
#         item['release_date'] = datetime.date(year, month, day)
#       item['language'] = row[6]
#       # Check if the ratings column is None and assign None if so, otherwise split it by space and get the first and last elements as rating and votes
#       if row[7] == None:
#         item['rating'] = None
#         item['votes'] = None
#       else:
#         try:
#             rating, votes = re.findall(r'\d[\d.,]*', row[7])
#         except:
#             print(row[7])
#       item['summary'] = row[8]
#       item['image'] = row[9]
#       item['link'] = row[10]
#       # Append the dictionary to the data list
#       data.append(item)
#     # Return the data list
#     return data

#   # Define a function that closes the database connection
#   def close(self):
#     # Close the cursor and the connection objects
#     self.cur.close()
#     self.conn.close()


# # Define a function to get the data from the database
# def get_data(db_name):
#     # Create an instance of the DataGetter class
#     getter = DataGetter(db_name)
#     # Call the get_data method
#     data = getter.get_data()
#     # Close the database connection
#     getter.close()
#     # Return the data
#     return data
# get_data('audiobooks.db')

In [38]:
# for i in data:
#     print(i[])

In [39]:
# string = "4.5 out of 5 stars 2,010 ratings"
# string = "5 out of 5 stars 1 rating"

# for i in data:
#     try:
#         print(i[8], '-->', extract_rating(i[8]))
#         extract_rating(i[8])
#     except:
#         print(i[8])
#         break

None


In [27]:
# for i in data[4:]:
#     for ii in i:
#         print(ii)
#     print(i)
#     break

We're Alive: A Story of Survival, the Third Season
N/A
By: Kc Wayland
Narrated by: full cast
Series: We’re Alive: A Story of Survival, Book 3
Length: 11 hrs and 31 mins
Release date: 03-18-13
Language: English
5 out of 5 stars 1,432 ratings
N/A
https://m.media-amazon.com/images/I/51Xt2BYA5vL._SL500_.jpg
/pd/Were-Alive-A-Story-of-Survival-the-Third-Season-Audiobook/B00BUTFLGS
("We're Alive: A Story of Survival, the Third Season", 'N/A', 'By: Kc Wayland', 'Narrated by: full cast', 'Series: We’re Alive: A Story of Survival, Book 3', 'Length: 11 hrs and 31 mins', 'Release date: 03-18-13', 'Language: English', '5 out of 5 stars 1,432 ratings', 'N/A', 'https://m.media-amazon.com/images/I/51Xt2BYA5vL._SL500_.jpg', '/pd/Were-Alive-A-Story-of-Survival-the-Third-Season-Audiobook/B00BUTFLGS')


ValueError: too many values to unpack (expected 5)

imageKOZ.jpg downloaded


In [19]:
# Test the function with different parameters
print(get_custom_data(title='KOZ'))
print(get_custom_data(author='By: William W. Johnstone'))
print(get_custom_data(language='English'))


[('KOZ', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '5 out of 5 stars 2,010 ratings', 'N/A', 'https://m.media-amazon.com/images/I/517I-u-1NGL._SL500_.jpg', '/pd/KOZ-Podcast/B0B4F665LR')]
[('Revenge of the Mountain Man [Dramatized Adaptation]', 'N/A', 'By: William W. Johnstone', 'Narrated by: full cast, Mort Shelby, James Lewis, Joe Brack, Elizabeth Jernigan, Dylan Lynch, David Coyne, Tim Getman, Richard Rohan, Terence Aselford, Michael Glenn, Christopher Graybill', 'Series: Mountain Man (Johnstone), Book 4, Dramatized Adaptation', 'Length: 6 hrs and 4 mins', 'Release date: 11-14-20', 'Language: English', '5 out of 5 stars 35 ratings', 'N/A', 'https://m.media-amazon.com/images/I/61CyC23FFwL._SL500_.jpg', '/pd/Revenge-of-the-Mountain-Man-Dramatized-Adaptation-Audiobook/1648805698'), ('Journey of the Mountain Man [Dramatized Adaptation]', 'N/A', 'By: William W. Johnstone', 'Narrated by: full cast, Mort Shelby, James Lewis, Andy Clemence, Steven Carpenter, Tim Getman, Terence Aselfor

In [7]:
# no = 1
# # print the database
# conn = sqlite3.connect("audiobooks.db")
# cur = conn.cursor()
# cur.execute("SELECT * FROM audiobooks")
# results = cur.fetchall()
# for row in results:
#     # print(row)
#     no += 1
# conn.close()
# print(no)

301


In [27]:
# images = ("https://m.media-amazon.com/images/I/517I-u-1NGL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51POjQXrnVL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51ywcR6OqkL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51-D+0blRnS._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51Xt2BYA5vL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/61CyC23FFwL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51g3AinAJNL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51pCOQAUu4L._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51NyNt9PePS._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51qO2LV-ilL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/515rqFN7PJL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/61i0TYaZ9pL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/614otPUQ5bL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51HAoKblnpL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51TjnQD6ILL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51FvcRRvyUL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/61tqfwb3YML._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/61h58kY9lsL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/51B+tOzmoAL._SL500_.jpg",
#         "https://m.media-amazon.com/images/I/61ltix+WxRL._SL500_.jpg",
#     )

# # download images
# for image_link in images:
#     image_name = image_link.split("/")[-1]
#     with open(image_name, "wb") as f:
#         f.write(requests.get(image_link).content)

In [6]:
# Import the libraries
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Define the URL of the website
# url = generate_link # Replace this with your desired URL

# # Make a request to the website and get the HTML content
# response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
# html = response.text

# # Parse the HTML content using BeautifulSoup
# soup = BeautifulSoup(html, "html.parser")


# response = requests.get(generate_link())
# soup = BeautifulSoup(response.content, "html.parser")


# Find all the img tags in the HTML content

    # Open the image URL using requests and Pillow
    # image = Image.open(requests.get(src, stream=True).raw)
    
    # Save the image to a folder with a unique name
    # image.save(f"image{i}.jpg") # You can change the folder and file name as you wish


In [10]:
# save the soup object to a file
with open("soup.html", "w") as file:
    file.write(str(soup))


In [42]:
# # Import scrapy
# import scrapy

# # Import the CrawlerProcess: for running the spider
# from scrapy.crawler import CrawlerProcess

# # Import the Twisted reactor
# from twisted.internet import reactor

# # Define a class for your spider
# class GithubSpider(scrapy.Spider):
#   # Give your spider a name
#   name = "github_spider"
#   # Define a list of URLs to start scraping from
#   start_urls = [generate_link()]

#   # Define a method to parse the response from each URL
#   def parse(self, response):
#     # Find all the elements that contain the product details
#     products = response.xpath("//div[@class='bc-col-responsive bc-col-6']")

#     # Loop through each product element and extract the details
#     for product in products:
#       # Try to find the title element and handle the exception if not found
#       try:
#         title = product.xpath(".//h3/text()").get().strip()
#       except AttributeError:
#         title = None
#       # Try to find the subtitle element and handle the exception if not found
#       try:
#         subtitle = product.xpath(".//span[@class='subtitle']/text()").get().strip()
#       except AttributeError:
#         subtitle = None
#       # Try to find the author element and handle the exception if not found
#       try:
#         author = product.xpath(".//li[@class='authorLabel']/text()").get().strip()
#       except AttributeError:
#         author = None
#       # Try to find the narrator element and handle the exception if not found
#       try:
#         narrator = product.xpath(".//li[@class='narratorLabel']/text()").get().strip()
#       except AttributeError:
#         narrator = None
#       try:
#         series = product.xpath(".//li[@class='seriesLabel']/text()").get().strip()
#       except AttributeError:
#         series = None
#       try:
#         length = product.xpath(".//li[@class='runtimeLabel']/text()").get().strip()
#       except AttributeError:
#         length = None
#       try:
#         release_date = product.xpath(".//li[@class='releaseDateLabel']/text()").get().strip()
#       except AttributeError:
#         release_date = None
#       try:
#         language = product.xpath(".//li[@class='languageLabel']/text()").get().strip()
#       except AttributeError:
#         language = None

#       try:
#         ratings = product.xpath(".//li[@class='ratingsLabel']/text()").get().strip()
#       except AttributeError:
#         ratings = None

#       # Try to find the summary element and handle the exception if not found
#       try:
#         summary = product.xpath(".//p/text()").get().strip()
#       except AttributeError:
#         summary = None
      
#       # Try to find the image element and handle the exception if not found
#       try:
#         # Get the src attribute of the img tag
#         image = product.xpath(".//img/@src").get()
#       except AttributeError:
#         image = None

#       # Try to find the link element and handle the exception if not found
#       try:
#         link = product.xpath(".//a/@href").get()
#       except AttributeError:
#         link = None

#       # Create a dictionary with the product details
#       details_dict = {
#         "title": title,
#         "subtitle": subtitle,
#         "author": author,
#         "narrator": narrator,
#         "series": series,
#         "length": length,
#         "release_date": release_date,
#         "language": language,
#         "ratings": ratings,
#         "summary": summary,
#         "image": image, # Add this line
#         "link": link # Add this line
#       }

#       # Yield or return the dictionary
#       yield details_dict



# # Create an instance of the CrawlerProcess: process
# process = CrawlerProcess()

# # Run the spider
# process.crawl(GithubSpider)
# process.start()


In [43]:
# # Import the libraries
# import requests
# from bs4 import BeautifulSoup
# from PIL import Image

# # Define the URL of the website
# # url = generate_link # Replace this with your desired URL

# # # Make a request to the website and get the HTML content
# # response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
# # html = response.text

# # # Parse the HTML content using BeautifulSoup
# # soup = BeautifulSoup(html, "html.parser")


# response = requests.get(generate_link(), headers={"User-Agent": "Mozilla/5.0"})
# soup = BeautifulSoup(response.content, "html.parser")


# # Find all the img tags in the HTML content

#     # Open the image URL using requests and Pillow
#     # image = Image.open(requests.get(src, stream=True).raw)
    
#     # Save the image to a folder with a unique name
#     # image.save(f"image{i}.jpg") # You can change the folder and file name as you wish


In [60]:
# for audiobook in data[5:]:
#     for key, value in audiobook.items():
#         # print(key, value)
#         pass

# data5 = {'title': "We're Alive: A Story of Survival, the Third Season",
#  'subtitle': 'N/A',
#  'author': 'By:\n                                    Kc Wayland',
#  'narrator': 'Narrated by:\n                                      full cast',
#  'series': 'Series:\n                                      \n\n\n\n\n\n\n\n\n\n\n    \n        \n            \n                \n                \n            \n            \n        \n        \n            \n        \n        We’re Alive: A Story of Survival, Book 3',
#  'length': 'Length: 11 hrs and 31 mins',
#  'release_date': 'Release date:\n                                    03-18-13',
#  'language': 'Language:\n                                      English',
#  'ratings': '5 out of 5 stars\n1,432 ratings',
#  'summary': 'N/A'}

# # Loop through the values in the dictionary
# for key, value in data5.items():
#   # Remove leading and trailing whitespaces
#   value = value.strip()
#   # Replace multiple \n with a single space
#   value = value.replace("\n", " ")
#   # Update the dictionary with the formatted value
#   data5[key] = value

# # Import the re module
# import re

# # Loop through the values in the dictionary
# for key, value in data5.items():
#   # Remove leading and trailing whitespaces
#   value = value.strip()
#   # Replace multiple whitespaces with a single space using re.sub
#   value = re.sub("\s+", " ", value)
#   # Update the dictionary with the formatted value
#   data5[key] = value

# # Print the formatted dictionary
# for i in data5.items():
#   print(i)


('title', "We're Alive: A Story of Survival, the Third Season")
('subtitle', 'N/A')
('author', 'By: Kc Wayland')
('narrator', 'Narrated by: full cast')
('series', 'Series: We’re Alive: A Story of Survival, Book 3')
('length', 'Length: 11 hrs and 31 mins')
('release_date', 'Release date: 03-18-13')
('language', 'Language: English')
('ratings', '5 out of 5 stars 1,432 ratings')
('summary', 'N/A')


In [31]:
# import re
# import requests
# from bs4 import BeautifulSoup

# # read a html page with beautiful soup
# file = open('test.html')
# soup = BeautifulSoup(file, 'html.parser')
# print(soup.prettify())

In [5]:
# def generate_link(page=1, audible_programs="20956260011", author_author="", keywords="", narrator="full-cast", publisher="", sort="review-rank", title=""):
#   base_url = "https://www.audible.com/search?"
#   params = {
#     "audible_programs": audible_programs,
#     "author_author": author_author,
#     "keywords": keywords,
#     "narrator": narrator,
#     "publisher": publisher,
#     "sort": sort,
#     "title": title,
#     "ref": "a_search_l1_audible_programs_0",
#     "pf_rd_p": "daf0f1c8-2865-4989-87fb-15115ba5a6d2",
#     "pf_rd_r": "3CSM3Q3AG46QRQ0TVK0F",
#     "pageLoadId": "dELu6hUurPGV8fAu",
#     "creativeId": "9648f6bf-4f29-4fb4-9489-33163c0bb63e"
#   }
#   if page > 1:
#     params["page"] = page
#   query = "&".join([f"{key}={value}" for key, value in params.items()])
#   return base_url + query

# generate_link()