In [30]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import zipfile
import logging
import concurrent.futures
import math
from time import sleep
from concurrent.futures import ThreadPoolExecutor
import threading

In [31]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [32]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [33]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [34]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [35]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))  #TODO make the price integer

In [74]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price  

In [75]:
def get_score(soup):
    soup = soup.find('div', {'class':'col-md-7'}).find('li', {'class':'pull-left'}).find('div', {'class':'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating
    

In [76]:
def get_publisher(soup):
    try:
        publisher = soup.select('.prodoct-attribute-items:nth-child(1) a .prodoct-attribute-item')[0].text.strip()
        publisher_href = soup.find('div', {'class': 'row clearfix'}).find('a')['href']
    except Exception:
        publisher = None
        logging.exception("This book has no publisher!")
    return publisher

In [77]:
def get_author(soup):
    try:
        author = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items a .prodoct-attribute-item')[0].text.strip()
    except Exception:
        author = None
        logging.exception("This book has no author!")    
    return author

In [78]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = None
    isbn = None
    size = None
    pages = None
    per_cal = None
    ad_cal = None
    material = None
    series = None
    send_time = None
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    series_flag = 0
    send_time_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0

        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1
        
    return [code, isbn, size, pages, per_cal, ad_cal, material, series, send_time]

In [79]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [80]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [87]:
 def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series, book_earliest_send_time] = get_book_attribute(book_soup)

    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title, book_price, int(book_discount_percent),
                 book_score, book_publisher, book_author,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size, book_cover_material,
                 int(book_print_series), book_earliest_send_time]
    return book_data

In [82]:
 def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [83]:
 def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index,tag]]
    return book_tags_list

In [89]:
links = get_links()[:100] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']
site_index = 1
books_data_list = []
site_summary_data_list = []
site_tags_data_list = []
lock = threading.Lock()

def process_link(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

with ThreadPoolExecutor(max_workers=10) as executor:
    executor.map(process_link, links)


ERROR:root:An error occurred
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_21760\382316889.py", line 17, in process_link
    data = get_book_detail(site_page_books[book_index], site_index)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_21760\1694710275.py", line 15, in get_book_detail
    int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size, book_cover_material,
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_21760\709879376.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_21760\2869572784.py", line 

In [90]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'price', 'discount', 'score',
                                    'publisher', 'author', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series', 'earliest_send_time'])
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,price,discount,score,publisher,author,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,earliest_send_time
0,1,116421,978-6009313884,کتاب تانیا,Tanya,75000,20,3.62,پژواک فرزان,آلکسی آربوزوف,159,1396,1939,جیبی,شومیز,1,6 مهر
1,2,64447,978-6229736944,کتاب فناوری در سال 2021,"The Year in Tech, 2021",41500,0,3.01,راه پرداخت,هاروارد بیزینس ریویو,116,1399,2020,رقعی,شومیز,1,---
2,3,49138,978-9644458071,کتاب گوزن شاخدار فایده اش چیه؟,What Use Is A Moose?,70000,15,3.58,علمی و فرهنگی,مارتین وادل,32,1400,1996,وزیری,شومیز,5,4 مهر
3,4,51563,978-9641940289,کتاب بانوان عمارت میسالونگی,The Ladies of Missalonghi,30000,15,3.17,روشنگران و مطالعات زنان,کالین مک کالو,184,1388,1987,رقعی,شومیز,1,4 مهر
4,5,83697,978-6007106440,کتاب اعتماد کارساز است!,Trust works!,100000,20,3.74,دنیای اقتصاد,کنت بلانچارد,128,1397,2013,پالتویی,شومیز,2,4 مهر
5,6,536,978-600-8812-16-6,کتاب دختری با کت آبی,Girl in the Blue Coat,155000,15,4.03,میلکان,مونیکا هسی,256,1402,2016,رقعی,شومیز,11,6 مهر
6,7,87845,978-6005590555,کتاب واژه نامه روان کاوی لکانی,An Introductory Dictionary of Lacanian Psycho...,200000,15,3.41,اتاق آبی,دیلان اوانس,368,1401,1996,رقعی,شومیز,2,4 مهر
7,8,79714,978-6007033661,کتاب تکنولوژی های انقلابی,Radical Technologies,150000,20,3.02,کتاب آمه,آدام گرینفیلد,487,1401,2017,رقعی,شومیز,1,6 مهر
8,9,88169,978-6227342611,کتاب مثل حرفه ای ها یاد بگیر,Learn like a pro,72000,15,3.97,چترنگ,باربارا اوکلی,179,1401,2021,رقعی,شومیز,2,4 مهر
9,10,7267,9789643379353,کتاب کتاب بی نام اعترافات,Anonymous Confession Book,27000,15,3.38,نیستان,داوود غفارزادگان,348,1396,2009,رقعی,شومیز,3,4 مهر


In [26]:
file_path = "bookData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [91]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary'])
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,طب چینی یا سوزنی یکی از دو رشته درمانی طبی دنی...
1,1,نمایش‌نامه «تانیا» داستان زندگی زنی جوان است ک...
2,2,امروزه هنگامی که کســب‌وکارها بــه فناوری فکر ...
3,3,
4,3,
...,...,...
96,47,شاهنامه قصه همدلی ها، همراهی ها و مبارزه با نا...
97,47,
98,47,این کتاب حاوی مجموعه ای از مطالب و اطلاعات ارا...
99,47,کتاب نهج البلاغه، ترجمه و شرح واژگان که بر اسا...


In [92]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,پزشکی
1,1,ادبیات ایران
2,1,کتاب مصور
3,1,بهداشت
4,1,ادبیات کلاسیک
...,...,...
547,47,فهرست برترین رمان های تاریخی
548,47,برترین آثار داستانی با شخصیت اصلی زن
549,47,برترین آثار تبدیل شده به فیلم و سریال
550,47,فهرست برترین رمان های عاشقانه
