In [94]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep

In [95]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [96]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [97]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    book_urls.remove(page_url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [98]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [99]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [100]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [101]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [102]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [103]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link = publisher_tag.get('href')
        publisher_id = publisher_link.split('/')[2].split('-')[0]
        publisher_name = publisher_tag.text.strip()
    except Exception:
        publisher_link = -1
        publisher_id = -1
        publisher_name = -1
    return {'id': publisher_id, 'name': publisher_name, 'link': publisher_link}

In [104]:
def get_author(soup):
    authors_list = []
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if (len(authors_a_tag)) == 0:
            return authors_list
        for author_a_tag in authors_a_tag:
            author_link = author_a_tag.get('href')
            author_id = author_link.split('/')[2].split('-')[0]
            author_name = author_a_tag.text.strip()
            authors_list.append({'id': author_id, 'name': author_name, 'link': author_link})
    except Exception:
        authors_list = -1
    return authors_list

In [105]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [106]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'فارسی'
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0

        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'زبان کتاب' in text:
            language_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1

    return [code, isbn, size, pages, per_cal, ad_cal, material, language, series, send_time]

In [107]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [108]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [109]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_language, book_print_series, book_earliest_send_time] = get_book_attribute(book_soup)

    book_data = [site_index, int(book_code), book_isbn, book_fa_title, book_en_title, book_price,
                 int(book_discount_percent), book_score, book_publisher, book_author, int(book_pages),
                 int(book_publication_per_date), int(book_publication_ad_date), book_size, book_cover_material,
                 book_language, int(book_print_series), book_earliest_send_time, book_author_presence]
    return book_data

In [110]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [111]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [112]:
def get_site_awards(soup, site_index):
    awards_list = []
    awards = soup.select('book_soup, site_index')

    for award in awards:
        print(award.text)
        awards += [award.text]

In [113]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [114]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [115]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                writers_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                publishers_data_list.append(data[8])  #8th column is dict of publisher
                data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                data.pop(9)                           #remove 9th column from data(9th column was writers)
                for w_id in writers_list_of_dict:
                     books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [None]:
links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

page_response = []
books_data_list = []
site_tags_data_list = []
site_summary_data_list = []

site_index = 1
sleep_time = 0.5
max_threads = 20
book_count_request = 20  #number of requests per time

lock = threading.Lock()
book_urls = links.copy()

while len(book_urls):
    sleep(sleep_time)  #sleep so that the site does not ban us
    request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        future_list = executor.map(get_response, request_list)
        for future in future_list:
            try:
                data = future
                page_response.append(data)
            except Exception as exc:
                continue
        for item in page_response:
            if item.status_code == 200:
                page_url = item.url
                if page_url in request_list:
                    page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
                    scrape(page_soup)
                    book_urls.remove(page_url)

<h1>Fast Scraper</h1>

In [127]:
links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
writers_data_list = []
publishers_data_list = []
books_writers_data_list = []

site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:An error occurred
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\3854377785.py", line 3, in fast_scrape
    [site_links, site_soup] = get_soup(link, site_links)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\291745038.py", line 10, in get_soup
    book_links.remove(page_url)
AttributeError: 'str' object has no attribute 'remove'
ERROR:root:An error occurred
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\3854377785.py", line 3, in fast_scrape
    [site_links, site_soup] = get_soup(link, site_links)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\291745038.py", line 10, in get_soup
    book_links.remove(page_url)
AttributeError: 'str' object has no attribute 'remove'
ERROR:root:An error occurred
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_5524\3854377785.py", line 3, in fast_scrape
    [site_links, site_soup] = get_soup(link, s

<h1>Check Completnes</h1>

In [128]:
if len(book_urls) == 0:
    print('All links scraped!')
else:
    print('Something wrong happened!',len(book_urls),)

Something wrong happened! Missed  11 links while scraping.


<h1>Make Dataframes</h1>

In [None]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'price', 'discount', 'score',
                                    'publisher', 'author', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series', 'language', 'earliest_send_time',
                                    'presence'])
tableOfData

In [None]:
file_path = "bookData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [None]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary'])
tableOfSummaryData

In [None]:
file_path = "BookSummaryData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [None]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])
tableOfSiteTagsData

In [None]:
file_path = "bookTagsData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [None]:
table_of_publishers = pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id', 'name', 'link'])
table_of_publishers.to_csv('./publishers.csv', index=False)
table_of_publishers

In [None]:
table_of_writers = pd.DataFrame(writers_data_list).drop_duplicates(subset=['id', 'name', 'link'])
table_of_writers.to_csv('./writers.csv', index=False)
table_of_writers