In [89]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import zipfile
import logging
import concurrent.futures
import math
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

In [90]:
def get_links():
    """Return the values of the ``link`` column of ``books_url.csv`` as a list.

    The CSV is read from the current working directory.
    """
    link_column = pd.read_csv('books_url.csv')['link']
    return list(link_column)

In [91]:
def get_soup(input_url, timeout=30):
    """Download ``input_url`` and parse the response body into a BeautifulSoup tree.

    Args:
        input_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze the whole scraping loop.

    Returns:
        A ``bs4.BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend. A non-200 response is reported on stdout, but
        its body is still parsed and returned.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    return bs4.BeautifulSoup(response.text, 'html.parser')

In [92]:
def get_fa_title(soup):
    """Return the text of the first element matching ``.product-name strong``.

    Raises:
        IndexError: if the selector matches nothing.
    """
    return soup.select('.product-name strong')[0].text


In [93]:
def get_en_title(soup):
    """Return the text of the first element matching ``.product-name-englishname``.

    Raises:
        IndexError: if the selector matches nothing.
    """
    return soup.select('.product-name-englishname')[0].text

In [94]:
def get_price(soup):
    """Return the listed price as an ``int``.

    Takes the text of the first element matching either ``.price-broken`` or
    ``.col-md-7 .price:nth-child(1)`` and removes the comma separators before
    converting.

    Raises:
        IndexError: if no element matches.
        ValueError: if the remaining text is not an integer.
    """
    price_text = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    digits_only = price_text.replace(',', '')
    return int(digits_only)

In [95]:
def get_discount(soup):
    """Return the discount as a percentage of the listed price.

    The special price is read from the first element matching
    ``.col-md-12+ .clearfix .price-special``. Any exception raised while
    reading or computing it (including a missing element) is logged with its
    traceback and the function returns ``0``.
    """
    selector = '.col-md-12+ .clearfix .price-special'
    try:
        special_text = soup.select(selector)[0].text
        special_price = int(special_text.replace(',', ''))
        percent_off = (get_price(soup) - special_price) / get_price(soup) * 100
    except Exception:
        percent_off = 0
        logging.exception("This book has no discount!")
    return percent_off

In [96]:
def get_score(soup):
    """Return the book's rating as a string, or ``None`` when it is unavailable.

    Walks ``div.col-md-7`` -> ``li.pull-left`` -> ``div.my-rating`` and reads
    the ``data-rating`` attribute from the rendered markup of that element.

    Returns:
        The rating exactly as written in the attribute (e.g. ``'3.87'`` or
        ``'4'``), or ``None`` if any element on the path is missing or the
        attribute is absent.
    """
    path = (('div', 'col-md-7'), ('li', 'pull-left'), ('div', 'my-rating'))
    node = soup
    for tag_name, css_class in path:
        node = node.find(tag_name, {'class': css_class})
        if node is None:
            # A missing container means "no rating", not a parsing failure.
            return None

    # The fractional part is optional so whole-number ratings are not dropped.
    match = re.search(r'data-rating="(\d+(?:\.\d+)?)"', str(node))
    return match.group(1) if match else None
    

In [97]:
def get_publisher(soup):
    """Return the stripped text of the first publisher element, or ``None``.

    The element is the first match of
    ``.prodoct-attribute-items:nth-child(1) a .prodoct-attribute-item``.
    Any exception while reading it is logged with its traceback and ``None``
    is returned.
    """
    selector = '.prodoct-attribute-items:nth-child(1) a .prodoct-attribute-item'
    try:
        return soup.select(selector)[0].text.strip()
    except Exception:
        logging.exception("This book has no publisher!")
        return None

In [98]:
def get_author(soup):
    """Return the stripped text of the first author element, or ``None``.

    The element is the first match of
    ``.prodoct-attribute-items+ .prodoct-attribute-items a .prodoct-attribute-item``.
    Any exception while reading it is logged with its traceback and ``None``
    is returned.
    """
    selector = '.prodoct-attribute-items+ .prodoct-attribute-items a .prodoct-attribute-item'
    try:
        return soup.select(selector)[0].text.strip()
    except Exception:
        logging.exception("This book has no author!")
        return None

In [99]:
def get_code(soup):
    """Return the text of the first element matching ``tr:nth-child(1) td+ td``.

    The text is returned as-is (not stripped).

    Raises:
        IndexError: if the selector matches nothing.
    """
    return soup.select('tr:nth-child(1) td+ td')[0].text

In [100]:
def get_isbn(soup):
    """Return the ISBN reduced to ASCII digits and hyphens.

    Reads the first element matching ``.product-table .ltr`` and removes every
    character other than ``0-9`` and ``-`` from its text.

    Raises:
        IndexError: if the selector matches nothing.
    """
    raw_text = soup.select('.product-table .ltr')[0].text.strip()
    return re.sub('[^0-9-]', '', raw_text)

In [101]:
def get_pages(soup):
    """Return the page count as a string of ASCII digits (hyphens are kept too).

    Reads the first element matching ``tr:nth-child(5) .rtl`` and removes every
    character other than ``0-9`` and ``-`` from its text.

    NOTE(review): the selector is positional (fifth table row). The saved
    output of this notebook shows year-like values such as 1402 in the
    ``pages`` column for some books, so the row position presumably differs
    between pages -- confirm against the page markup.

    Raises:
        IndexError: if the selector matches nothing.
    """
    raw_text = soup.select('tr:nth-child(5) .rtl')[0].text.strip()
    return re.sub('[^0-9-]', '', raw_text)

In [102]:
def get_per_date(soup):
    """Return the stripped text of the publication-date cell, or ``None``.

    The cell is the first match of
    ``.product-table > tbody:nth-child(1) > tr:nth-child(6) > td:nth-child(2)``.

    NOTE(review): the selector is positional (sixth table row). The saved
    output of this notebook shows non-date text in this column for some books,
    so the row position presumably differs between pages -- confirm against
    the page markup.

    Returns:
        The cell text with surrounding whitespace removed, or ``None`` when
        the cell cannot be read (the failure is logged with its traceback).
    """
    selector = '.product-table > tbody:nth-child(1) > tr:nth-child(6) > td:nth-child(2)'
    try:
        date = soup.select(selector)[0].text
    except Exception:
        logging.exception("This book has no persian calendar date!")
        # Return here: calling .strip() on None would raise AttributeError.
        return None
    return date.strip()

In [103]:
def get_ad_date(soup):
    """Placeholder: always returns ``None`` and ignores ``soup``."""
    # TODO: not implemented yet.
    return None

In [104]:
def get_size(soup):
    """Return the stripped text of the first ``td.rtl`` cell in the product table.

    The cell is located with a fully qualified descendant selector starting at
    ``html body``.

    Raises:
        IndexError: if the selector matches nothing.
    """
    selector = 'html body div.container div div.product-container.well.clearfix div.clearfix div.col-md-9.col-sm-9 div.row div.col-md-5 table.product-table tbody tr td.rtl'
    first_cell = soup.select(selector)[0]
    return first_cell.text.strip()


In [105]:
def get_material(soup):
    """Placeholder: always returns ``None`` and ignores ``soup``."""
    # TODO: not implemented yet.
    return None


In [106]:
def get_series(soup):
    """Placeholder: always returns ``None`` and ignores ``soup``."""
    # TODO: not implemented yet.
    return None


In [107]:
def get_send_time(soup):
    """Placeholder: always returns ``None`` and ignores ``soup``."""
    return None


In [108]:
def get_summary(soup):
    """Return the stripped text of the first ``.product-description`` element.

    Any exception while reading it (including a missing element) is logged
    with its traceback and ``None`` is returned.
    """
    try:
        return soup.select('.product-description')[0].text.strip()
    except Exception:
        logging.exception("This book has no summary!")
        return None

In [109]:
def get_tags(soup):
    """Return the stripped text of every ``.product-tags-item`` element.

    The list follows the order in which ``soup.select`` returns the elements
    and is empty when nothing matches.
    """
    return [item.text.strip() for item in soup.select('.product-tags-item')]

In [124]:
 def get_book_detail(book_soup, site_index):
     """Collect every per-book field into one flat record.

     Args:
         book_soup: Parsed markup passed unchanged to each ``get_*`` extractor.
         site_index: Identifier placed in the first position of the record.

     Returns:
         A 17-element list: ``site_index``, code, ISBN, fa title, en title,
         price, discount percent, score, publisher, author, pages, per date,
         ad date, size, cover material, print series, earliest send time.
     """
     # The dict is filled in the order the extractors must run, so the first
     # extractor that raises is the one that aborts the record.
     extracted = {
         'fa_title': get_fa_title(book_soup),
         'en_title': get_en_title(book_soup),
         'price': get_price(book_soup),
         'discount_percent': get_discount(book_soup),
         'score': get_score(book_soup),
         'publisher': get_publisher(book_soup),
         'author': get_author(book_soup),
         'code': get_code(book_soup),
         'isbn': get_isbn(book_soup),
         'pages': get_pages(book_soup),
         'per_date': get_per_date(book_soup),
         'ad_date': get_ad_date(book_soup),
         'size': get_size(book_soup),
         'cover_material': get_material(book_soup),
         'print_series': get_series(book_soup),
         'earliest_send_time': get_send_time(book_soup),
     }
     # Position of each field in the returned record (after site_index).
     record_order = (
         'code', 'isbn', 'fa_title', 'en_title', 'price', 'discount_percent',
         'score', 'publisher', 'author', 'pages', 'per_date', 'ad_date',
         'size', 'cover_material', 'print_series', 'earliest_send_time',
     )
     return [site_index] + [extracted[field] for field in record_order]

In [125]:
 def get_book_site_summary(book_soup, site_index):
     """Return ``[site_index, summary]`` where summary comes from ``get_summary``."""
     return [site_index, get_summary(book_soup)]

In [137]:
 def get_book_site_tags(book_soup, site_index):
     """Return one ``[site_index, tag]`` pair per tag found by ``get_tags``."""
     return [[site_index, tag] for tag in get_tags(book_soup)]

In [138]:
# Accumulators for the three output tables; site_index numbers the pages that
# were scraped successfully, starting at 1.
books_data_list = []
site_index = 1
site_summary_data_list = []
site_tags_data_list = []
# Only the first 10 links from the CSV plus one fixed page are scraped here.
links = get_links()[:10] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

for link in links:
    try:
        site_soup = get_soup(link)
        page_summary = get_book_site_summary(site_soup, site_index)
        page_tags = get_book_site_tags(site_soup, site_index)
        site_page_books = site_soup.select('.clearfix .clearfix .row')
        # Every second matched row is parsed as a book.
        page_books = [get_book_detail(book_row, site_index)
                      for book_row in site_page_books[::2]]
    except Exception:
        logging.exception("An error occurred while scraping %s", link)
        continue

    # Commit a page's rows only once all of it parsed. Appending earlier would
    # leave summary/tag rows behind for a failed page, and because site_index
    # only advances on success the next page would reuse the same index.
    site_summary_data_list.append(page_summary)
    site_tags_data_list.extend(page_tags)
    books_data_list.extend(page_books)
    site_index += 1

ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_4956\1115907846.py", line 3, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_4956\1115907846.py", line 3, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_4956\1115907846.py", line 3, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_4956\1115907846.py", line 3, in get_summary
    summary = soup.sel

In [134]:
# Build the book table; the column order matches the records in books_data_list.
tableOfData = pd.DataFrame(
    data=books_data_list,
    columns=[
        'site_index', 'code', 'Isbn', 'fa_title', 'en_title',
        'price', 'discount', 'score', 'publisher', 'author',
        'pages', 'publication_per_date', 'publication_ad_date',
        'size', 'cover_material', 'print_series', 'earliest_send_time',
    ],
)
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,price,discount,score,publisher,author,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,earliest_send_time
0,1,70554,978-9647553261,کتاب کلیات پزشکی سنتی چینی و طب سوزنی,Pezeshki Sonnati,300000,15.0,3.87,المعی,حسن رضوانی,1402,شومیز,,وزیری,,,
1,2,51563,978-9641940289,کتاب بانوان عمارت میسالونگی,The Ladies of Missalonghi,30000,15.0,3.17,روشنگران و مطالعات زنان,کالین مک کالو,184,1388,,رقعی,,,
2,3,49138,978-9644458071,کتاب گوزن شاخدار فایده اش چیه؟,What Use Is A Moose?,70000,15.0,3.58,علمی و فرهنگی,مارتین وادل,32,1400,,وزیری,,,
3,4,116421,978-6009313884,کتاب تانیا,Tanya,75000,20.0,3.62,پژواک فرزان,آلکسی آربوزوف,159,1396,,جیبی,,,
4,5,76016,978-9642683864,کتاب دانه های روشنایی,Flakes of light,9000,30.0,3.67,اشک,عباس مهرپویا,1388,شومیز,,خشتی,,,
5,6,73891,978-9640669687,کتاب رازهای ناگفته ی بازاریابی,Marketing,20000,15.0,3.67,نیما عربشاهی,جو ویتالی,96,1384,,خشتی,,,
6,7,41857,978-9644239441,کتاب سلام بر عاشورا,Salam bar Ashoora,5000,15.0,3.63,اطلاعات,رضا اسماعیلی,1394,شومیز,,رقعی,,,
7,8,107977,978-9641721932,کتاب قاصدک ها در هوا ایستاده اند,Dandelions are standing in the air,69000,20.0,3.18,دنیای نو,مهدی حسینی (مهد),1401,شومیز,,رقعی,,,
8,9,27986,978-9643911140,کتاب سقف تالار آیینه عمارت نارنجستان شیراز,Saghf-e Talar-e Ayieneh,1600,0.0,3.35,کانون پرورش فکری کودکان و نوجوان,هادی سیف,1390,شومیز,,رحلی,,,
9,10,64447,978-6229736944,کتاب فناوری در سال 2021,"The Year in Tech, 2021",41500,0.0,3.01,راه پرداخت,هاروارد بیزینس ریویو,116,1399,,رقعی,,,


In [133]:
# One row per scraped page: its index and the summary text (may be missing).
tableOfSummaryData = pd.DataFrame(
    data=site_summary_data_list,
    columns=['site_index', 'summary'],
)
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,طب چینی یا سوزنی یکی از دو رشته درمانی طبی دنی...
1,2,
2,3,
3,4,نمایش‌نامه «تانیا» داستان زندگی زنی جوان است ک...
4,5,
5,6,این کتابچه متضمن چهل و پنج راهکار و نکته در با...
6,7,شاعر گرانمایه جناب آقای رضا اسماعیلی که از چهر...
7,8,
8,9,"کتاب حاضر، از مجموعه ی ""چرا ندیدیم؟""، معرفی دق..."
9,10,امروزه هنگامی که کســب‌وکارها بــه فناوری فکر ...


In [139]:
# One row per (page index, tag) pair.
tableOfSiteTagsData = pd.DataFrame(
    data=site_tags_data_list,
    columns=['site_index', 'tag'],
)
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,پزشکی
1,1,ادبیات ایران
2,1,کتاب مصور
3,1,بهداشت
4,2,ادبیات استرالیا
...,...,...
66,11,فهرست برترین رمان های تاریخی
67,11,برترین آثار داستانی با شخصیت اصلی زن
68,11,برترین آثار تبدیل شده به فیلم و سریال
69,11,فهرست برترین رمان های عاشقانه


In [26]:
# Write the book table to a UTF-8 CSV in the working directory, without the
# DataFrame index column.
file_path = "data.csv"
tableOfData.to_csv(path_or_buf=file_path, index=False, encoding='utf-8')