In [50]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [51]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [52]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [53]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [54]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [55]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [56]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [57]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [58]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [59]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name}

In [60]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
         authors_list=[]         
    return authors_list

In [61]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [62]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'فارسی'
    translators=[]
    paper_type=-1
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0
    translators_flag=0
    paper_type_flag=0
    
    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0
        elif paper_type_flag == 1:
            paper_type = text
            paper_type_flag = 0
        elif translators_flag == 1:
            translators_a_tag = row.select('a')
            translators_flag=0
            for a_tag in translators_a_tag:
                translators.append({'id':a_tag.get('href').split('/')[2].split('-')[0],'name':a_tag.text.strip(),'link':a_tag.get('href')})


        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'زبان کتاب' in text:
            language_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1
        elif 'مترجم' in text:
            translators_flag = 1
        elif   'نوع کاغذ' in text:
            paper_type_flag=1


    return [code, isbn, size, pages, per_cal, ad_cal, material, series,language, send_time,translators,paper_type]

In [63]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [64]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [65]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series,book_language, book_earliest_send_time,book_translators,paper_type] = get_book_attribute(book_soup)
    price_history_data_list.append({'book_id':int(book_code),'price':book_price,'discount':int(book_discount_percent),'date':str(datetime.today())})
  
    publishers_data_list.append(book_publisher)  #8th column is dict of publisher
    

    #writer
    writer_page_data_list.extend(book_author)     #9th index is the writer column which is a list of writers
    writers_list_of_dict=book_author.copy()        #writes list which is a list of dictionary
    if len(writers_list_of_dict)!=0:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id':int(book_code),'writer_id':w_id['id']})

    #translator

    translator_page_data_list.extend(book_translators)
    translators_list_of_dict=book_translators.copy()        #writes list which is a list of dictionary
    if len(translators_list_of_dict)!=0:
        for t_id in translators_list_of_dict:
            books_translators_data_list.append({'book_id':int(book_code),'translator_id':t_id['id']})
    book_publisher=book_publisher['id']               #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                  int(book_print_series),book_language, book_earliest_send_time, book_author_presence,paper_type]
    return book_data

In [66]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs = {'class':'col-md-6 col-xs-12'})
    ven_lst = list()
    try:
        english_bars = div.find_all('div', attrs = {'class':'english-bar ltr'})
        persian_bars = div.find_all('div', attrs = {'class':'persian-bar'})
        prise_writers = div.find_all('div', attrs = {'class':'prise-writer ltr'})
        
        n = len(english_bars)
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'site_index':site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return(ven_lst)
    except:
        return ven_lst

In [67]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [68]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [69]:
def get_book_site_awards(book_soup, site_index):
    awards=[]
    n=len(book_soup.select('.product-features h4'))
    for i in range(0,n):
        awards.append({'site_index':site_index,'award':book_soup.select('.product-features h4')[i].text})
    return awards

In [70]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [71]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [72]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup,site_index))
            site_award_data_list.extend(get_book_site_awards(site_soup,site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [73]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [74]:
links = get_links()[:200]
#+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
site_award_data_list=[]

writer_page_data_list=[]
translator_page_data_list=[]
publishers_data_list=[]
price_history_data_list=[]
book_veneration_data_list=[]
#middle tables
books_writers_data_list=[]
books_translators_data_list=[]


site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_20976\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))


<h1>Check Completnes</h1>

In [75]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [76]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series','language' ,'earliest_send_time', 'presence','paper_type'])
tableOfData=tableOfData[tableOfData['code']!=-1]

In [77]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,115076,978-6001176869,کتاب درس هایی از زندگی هاوکینگ,How to Think Like Stephen Hawking,3.86,42,176,1402,2016,رقعی,شومیز,1,فارسی,5 مهر,موجود,-1
2,3,112266,978-6226712989,کتاب مجاز,Allowed,3.13,1875,119,1402,-1,رقعی,شومیز,1,فارسی,8 مهر,موجود,-1
3,4,105144,978-6225239210,کتاب چگونه حال بهتری و پول بیشتری داشته باشیم,12 Power Principles for Success,3.09,2628,197,1401,2019,رقعی,شومیز,1,فارسی,6 مهر,موجود,-1
4,5,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,رقعی,شومیز,17,فارسی,8 مهر,موجود,-1
5,6,105323,978-6005861280,کتاب من یک احمق هستم شما چطور…؟!,"I'm an idiot, how about you...?!",3.04,1259,144,1402,-1,رقعی,شومیز,3,فارسی,8 مهر,موجود,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,193,57694,978-6001944031,کتاب برادران واویلا 1,"Snarf Attack, Underfoodle, and the Secret of ...",3.05,1186,148,1398,2004,رقعی,شومیز,3,فارسی,8 مهر,موجود,-1
220,194,60815,978-9642739295,کتاب رهایی از شاوشنک,The Shawshank Redemption,3.19,1427,135,1392,1996,رقعی,شومیز,1,فارسی,---,تمام شد ، اما میاریمش 😏,-1
221,195,43592,978-9643499587,کتاب اتل متل این پسرک,Atal matal in pesarak,3.75,30,32,1399,-1,خشتی,زرکوب,7,فارسی,---,تمام شد ، اما میاریمش 😏,-1
222,196,34605,978-6001212055,کتاب زندگی میکل آنژ,Life of Michelangelo,3.15,81,175,1400,1907,رقعی,شومیز,8,فارسی,---,به زودی 🙄,-1


In [78]:
table_of_cover_type=pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type=table_of_cover_type[table_of_cover_type['cover_material']!=-1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv',encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,شومیز
1,سلفونی
2,جلد سخت
3,جلد نرم
4,زرکوب


In [79]:
table_of_format=pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format=table_of_format[table_of_format['size']!=-1].reset_index(drop=True)
table_of_format.to_csv('./format.csv',encoding='utf-8')
table_of_format

Unnamed: 0,size
0,رقعی
1,وزیری
2,رحلی
3,جیبی
4,خشتی
5,پالتویی


In [80]:
def convert_size_to_int(size):
    try:
     return table_of_format.index[table_of_format['size']==str(size)].to_list()[0]
    except:
        return -1

tableOfData['size']=tableOfData['size'].apply(convert_size_to_int)

In [81]:
def convert_cover_type_to_int(material):
    try:
     return table_of_cover_type.index[table_of_cover_type['cover_material']==str(material)].to_list()[0]
    except:
        return -1

tableOfData['cover_material']=tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [82]:

tableOfData.to_csv("bookData.csv", index=False, encoding='utf-8')

In [83]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
0,1,115076,978-6001176869,کتاب درس هایی از زندگی هاوکینگ,How to Think Like Stephen Hawking,3.86,42,176,1402,2016,0,0,1,فارسی,5 مهر,موجود,-1
2,3,112266,978-6226712989,کتاب مجاز,Allowed,3.13,1875,119,1402,-1,0,0,1,فارسی,8 مهر,موجود,-1
3,4,105144,978-6225239210,کتاب چگونه حال بهتری و پول بیشتری داشته باشیم,12 Power Principles for Success,3.09,2628,197,1401,2019,0,0,1,فارسی,6 مهر,موجود,-1
4,5,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,0,0,17,فارسی,8 مهر,موجود,-1
5,6,105323,978-6005861280,کتاب من یک احمق هستم شما چطور…؟!,"I'm an idiot, how about you...?!",3.04,1259,144,1402,-1,0,0,3,فارسی,8 مهر,موجود,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,193,57694,978-6001944031,کتاب برادران واویلا 1,"Snarf Attack, Underfoodle, and the Secret of ...",3.05,1186,148,1398,2004,0,0,3,فارسی,8 مهر,موجود,-1
220,194,60815,978-9642739295,کتاب رهایی از شاوشنک,The Shawshank Redemption,3.19,1427,135,1392,1996,0,0,1,فارسی,---,تمام شد ، اما میاریمش 😏,-1
221,195,43592,978-9643499587,کتاب اتل متل این پسرک,Atal matal in pesarak,3.75,30,32,1399,-1,4,4,7,فارسی,---,تمام شد ، اما میاریمش 😏,-1
222,196,34605,978-6001212055,کتاب زندگی میکل آنژ,Life of Michelangelo,3.15,81,175,1400,1907,0,0,8,فارسی,---,به زودی 🙄,-1


In [84]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(subset=['site_index','summary'])
tableOfSummaryData=tableOfSummaryData[tableOfSummaryData.notnull().all(axis=1)]
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,کتاب «درس‌هایی از زندگی استیون هاوکینگ» بنا دا...
1,2,ساعت خوش نام مجموعه تلویزیونی است که در سال ۱۳...
3,4,تعداد کمی از افراد نسبت به باب پراکتور زمان بی...
4,5,تاریخ دو برابر ترسناک تر از همیشه !جنگ فجیع جه...
5,6,این کتاب به بیان مشکلات گوناگون جامعه ما می پر...
...,...,...
195,193,این کتاب داستان دو برادر کله پوک و بامزه را در...
196,194,کتاب حاضر، متن فیلم‌نامه «رهایی از شاوشنگ» است...
197,195,ترانه های این مجموعه با جلب توجه خردسالان به م...
198,196,زندگی میکل آنژ یکی از بارزترین نمونه های تأثیر...


In [85]:
tableOfSummaryData.to_csv( "BookSummaryData.csv", index=False, encoding='utf-8')

In [86]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])\
    .drop_duplicates(subset=['site_index','tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,ادبیات انگلیس
1,1,ادبیات معاصر
2,1,زندگی نامه
3,1,ادبیات واقع گرایانه
4,1,دهه 2010 میلادی
...,...,...
1006,197,ادبیات داستانی
1007,197,دهه 2000 میلادی
1008,197,ادبیات نمایشی
1009,197,فیلمنامه


In [87]:
table_of_tag=pd.DataFrame(tableOfSiteTagsData['tag']).drop_duplicates(subset=['tag']).reset_index(drop=True)
table_of_tag.to_csv('./tag.csv',encoding='utf-8')
table_of_tag

Unnamed: 0,tag
0,ادبیات انگلیس
1,ادبیات معاصر
2,زندگی نامه
3,ادبیات واقع گرایانه
4,دهه 2010 میلادی
...,...
200,جایزه ی داستان جنایی آلمان
201,ناداستان
202,داستان ماجرایی
203,درام


In [88]:
def convert_tag_to_int(tag):
    try:
     return table_of_tag.index[table_of_tag['tag']==str(tag)].to_list()[0]
    except:
        return -1

tableOfSiteTagsData['tag']=tableOfSiteTagsData['tag'].apply(convert_tag_to_int)

In [89]:
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,0
1,1,1
2,1,2
3,1,3
4,1,4
...,...,...
1006,197,9
1007,197,36
1008,197,94
1009,197,124


In [90]:
tableOfSiteTagsData.to_csv('bookTagsData.csv',index=False,encoding='utf-8')

In [92]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name
0,42,سبزان
1,1470,سروش
2,1875,اریش
3,2628,امید سخن
4,11,افق
...,...,...
217,93,موسسه فرهنگی هنری جهان کتاب
218,1265,حرفه نویسنده
219,1186,حوض نقره
220,1427,فارابی


In [93]:
books_writers_data_list=list(filter(bool, books_writers_data_list))
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
#drop rows which both book_id and writer_id is -1
table_of_writer=table_of_writer[(table_of_writer['book_id']!=-1) & (table_of_writer['writer_id']!=-1)]
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,115076,11871
1,112266,31542
2,105144,26943
3,25277,14319
4,105323,17747
...,...,...
218,57694,31297
219,60815,33432
220,43592,24833
221,34605,85


In [94]:
table_of_writer_page=pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,11871,دانیل اسمیت,/profile/11871-daniel-smith
1,31542,جواد ترشیزی,/profile/31542-%d8%ac%d9%88%d8%a7%d8%af-%d8%aa...
2,26943,باب پراکتور,/profile/26943-bob-proctor
3,14319,تری دیری,/profile/14319-terry-deary
4,17747,حسن چابک,/profile/17747-hasan-chabok
...,...,...,...
218,31297,مری آماتو,/profile/31297-mary-amato
219,33432,فرانک دارابانت,/profile/33432-frank-darabont
220,24833,رها زادمهر,/profile/24833-%d8%b1%d9%87%d8%a7-%d8%b2%d8%a7...
221,85,رومن رولان,/profile/85-romain-rolland


In [95]:
table_of_translator=pd.DataFrame(books_translators_data_list).drop_duplicates(subset=['book_id','translator_id'])
table_of_translator=table_of_translator[(table_of_translator['book_id']!=-1) & (table_of_translator['translator_id']!=-1)]
table_of_translator.to_csv('./translator.csv',index=False,encoding='utf-8')
table_of_translator


Unnamed: 0,book_id,translator_id
0,115076,33130
1,105144,30543
2,25277,14441
3,36714,4702
4,19941,2634
...,...,...
108,60686,8031
109,57694,5736
110,60815,27357
111,34605,4459


In [96]:
table_of_translator_page=pd.DataFrame(translator_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_translator_page.to_csv('translator_page.csv',index=False,encoding='utf-8')
table_of_translator_page

Unnamed: 0,id,name,link
0,33130,فاریا جنیدی,/profile/33130-%d9%81%d8%a7%d8%b1%db%8c%d8%a7-...
1,30543,فاطمه رحیمی,/profile/30543-%d9%81%d8%a7%d8%b7%d9%85%d9%87-...
2,14441,مهرداد تویسرکانی,/profile/14441-mehrdad-tuyserkani
3,4702,مرتضی ثاقب فر,/profile/4702-morteza-sagheb-far
4,2634,کتایون سلطانی,/profile/2634-katayoun-soltani
...,...,...,...
108,8031,پیمان چهرازی,/profile/8031-%d9%be%db%8c%d9%85%d8%a7%d9%86-%...
109,5736,مهناز ایلدرمی,/profile/5736-%d9%85%d9%87%d9%86%d8%a7%d8%b2-%...
110,27357,حمیدرضا گرشاسبی,/profile/27357-%d8%ad%d9%85%db%8c%d8%af%d8%b1%...
111,4459,اسماعیل سعادت,/profile/4459-%d8%a7%d8%b3%d9%85%d8%a7%d8%b9%d...


In [97]:
table_of_price_history=pd.DataFrame(price_history_data_list).drop_duplicates(subset=['book_id','price','discount','date'])
table_of_price_history=table_of_price_history[table_of_price_history.book_id!=-1]
table_of_price_history.to_csv('./price-history.csv',index=False,encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
0,115076,128000,20,2023-09-25 15:20:53.746373
2,112266,100000,20,2023-09-25 15:20:54.226673
3,105144,100000,30,2023-09-25 15:20:54.359188
4,25277,145000,25,2023-09-25 15:20:54.449290
5,105323,69000,25,2023-09-25 15:20:54.783484
...,...,...,...,...
219,57694,140000,25,2023-09-25 15:21:27.445095
220,60815,8000,0,2023-09-25 15:21:27.485091
221,43592,80000,0,2023-09-25 15:21:27.524704
222,34605,32000,0,2023-09-25 15:21:27.565645


In [98]:
book_veneration_data_list=list(filter(bool, book_veneration_data_list))
table_of_book_veneration=pd.DataFrame(book_veneration_data_list).drop_duplicates(subset=['site_index','English_Quote','Persian_Quote','Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv',index=False,encoding='utf-8')
table_of_book_veneration

Unnamed: 0,site_index,English_Quote,Persian_Quote,Prise_Writer
0,25,One of Faulkner’s comic masterpieces.,از شاهکارهای کمیک فاکنر,barnes and noble
1,60,Convincing and compelling.,باورپذیر و مهیج.,School Library Journal
2,60,"A highly imaginative, absolutely terrific firs...",یک رمان نخست فوق العاده خیال پردازانه و شگرف.,Barnes & Noble
3,60,"An exciting, clever read.",داستانی هیجان انگیز و هوشمندانه.,Booktopia
4,94,An engrossing forecast.,یک پیش بینی هیجان انگیز.,Publishers Weekly
5,94,"Original, accessible, and provocative.",بدیع، قابل فهم و برانگیزاننده.,Science
6,94,A compelling guide to the challenges and choic...,راهنمایی جذاب برای چالش ها و انتخاب های پیش رو...,Elon Musk
7,96,This stunning work showcases Krauss's consiste...,این اثر خیره کننده، نشان دهنده ی استعداد همیشگ...,Publishers Weekly
8,96,"Masterful, evocative and moving.",استادانه، احساس برانگیز و تکان دهنده.,NPR
9,96,A meditation on memory and loss.,تأملی بر خاطره و فقدان.,Los Angeles Times


In [99]:
table_of_award=pd.DataFrame(site_award_data_list).drop_duplicates(subset=['site_index','award'])
table_of_award.to_csv('./award.csv',index=False,encoding='utf-8')
table_of_award

Unnamed: 0,site_index,award
0,25,برنده ی جایزه ی پولیتزر سال ۱۹۶۳
1,47,برنده جایزه پولیتزر
2,47,برنده جایزه ی نمایشنامه ی حلقه ی منتقدین نیو...
3,47,برنده جایزه Tony سال 1987
4,60,نامزد جایزه کتاب ناشر مستقل سال 1999
5,60,برنده جایزه خواننده گرند کنیون سال 1998
6,60,نامزد مدال خوانندگان جوان کالیفرنیا سال 1998
7,60,برنده جایزه سکویا اوکلاهما سال 1998
8,94,از پرفروش ترین کتاب های نیویورک تایمز
9,96,برنده ی جایزه ی Anisfield-Wolf سال 2011


for extra points

In [105]:
def get_person_info(soup):
    info=''
    try:
       info= soup.select('h5')[0].text.strip()
    except:
        info=None
    return info

In [106]:
translator_df=pd.read_csv('./translator_page.csv')
translator_url=translator_df['link'].to_list()
translators_info=[]
for i in range(len(translator_url)):
    soup=get_soup('https://www.iranketab.ir'+translator_url[i])
    t_info=get_person_info(soup)
    translators_info.append(t_info)
translator_df['information']=translators_info
translator_df.drop(columns=['link'],inplace=True)
translator_df.to_csv('./translator_page.csv',index=False,encoding='utf-8')

In [112]:
translator_df

Unnamed: 0,id,name,information
0,33130,فاریا جنیدی,فاریا جنیدی مترجم ایرانی متولد سال 1349 می باشد.
1,30543,فاطمه رحیمی,فاطمه رحیمی متولد سال 1366، نویسنده ایرانی می ...
2,14441,مهرداد تویسرکانی,مهرداد تویسرکانی(متولد ۲۴ مرداد ۱۳۴۴ در شهر ته...
3,4702,مرتضی ثاقب فر,مرتضی ثاقب‌فر (۹ مرداد ۱۳۲۱ تهران - ۱۱ دی ۱۳۹۱...
4,2634,کتایون سلطانی,کتایون سلطانی متولد سال 1335، نویسنده و مترجم ...
...,...,...,...
100,8031,پیمان چهرازی,
101,5736,مهناز ایلدرمی,
102,27357,حمیدرضا گرشاسبی,
103,4459,اسماعیل سعادت,


In [107]:
writer_df=pd.read_csv('./writer_page.csv')
writer_url=writer_df['link'].to_list().copy()
writers_info=[]
for i in range(len(writer_url)):
    soup=get_soup('https://www.iranketab.ir'+writer_url[i])
    w_info=get_person_info(soup)
    writers_info.append(w_info)
writer_df['information']=writers_info
writer_df.drop(columns=['link'],inplace=True)
writer_df.to_csv('./writer_page.csv',index=False,encoding='utf-8')

In [111]:
writer_df

Unnamed: 0,id,name,information
0,11871,دانیل اسمیت,دانیل اسمیت نویسنده و ویراستار غیر داستانی است...
1,31542,جواد ترشیزی,جواد ترشیزی متولد سال 1364، نویسنده ایرانی می ...
2,26943,باب پراکتور,باب پراکتور (Bob Proctor) (متولد ۵ ژوئیه ۱۹۳۴)...
3,14319,تری دیری,"ترنس ویلیام ""تری"" دیری (William Terence ""Terry..."
4,17747,حسن چابک,حسن چابک متولد سال 1351، نویسنده ایرانی با حوز...
...,...,...,...
183,31297,مری آماتو,مری آماتو (متولد 3 ژانویه 1961 ، بلویدر ، ایلی...
184,33432,فرانک دارابانت,فرانک دارابونت (Frank Darabont) کارگردان، فیلم...
185,24833,رها زادمهر,رها زادمهر متولد سال 1350 ، نویسنده ی کتاب های...
186,85,رومن رولان,رومن رولان، زاده ی ۲۶ ژانویه ۱۸۶۶، درگذشته ی ۳...
