In [300]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [301]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [302]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [303]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [304]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [305]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [306]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [307]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [308]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [309]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name,'link':publisher_link}

In [310]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
         authors_list=[]         
    return authors_list

In [311]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [312]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'فارسی'
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0

        if 'کد کتاب' in text:
            code_flag = 1
        elif 'شابک' in text:
            isbn_flag = 1
        elif 'قطع' in text:
            size_flag = 1
        elif 'تعداد صفحه' in text:
            pages_flag = 1
        elif 'سال انتشار شمسی' in text:
            per_cal_flag = 1
        elif 'سال انتشار میلادی' in text:
            ad_cal_flag = 1
        elif 'نوع جلد' in text:
            material_flag = 1
        elif 'زبان کتاب' in text:
            language_flag = 1
        elif 'سری چاپ' in text:
            series_flag = 1
        elif 'زودترین زمان ارسال' in text:
            send_time_flag = 1

    return [code, isbn, size, pages, per_cal, ad_cal, material,language, series, send_time]

In [313]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [314]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [315]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material,book_language, book_print_series, book_earliest_send_time] = get_book_attribute(book_soup)

    price_history_data_list.append({'book_id':int(book_code),'price':book_price,'discount':int(book_discount_percent),'date':str(datetime.today())})
    writer_page_data_list.extend(book_author)     #9th index is the writer column which is a list of writers
    publishers_data_list.append(book_publisher)  #8th column is dict of publisher
    writers_list_of_dict=book_author.copy()        #writes list which is a list of dictionary
    if len(writers_list_of_dict)==0:
        books_writers_data_list.append({'book_id':int(book_code),'writer_id':-1})
    else:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id':int(book_code),'writer_id':w_id['id']})
    book_publisher=book_publisher['id']               #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                  book_language,int(book_print_series), book_earliest_send_time, book_author_presence]
    return book_data

In [316]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs = {'class':'col-md-6 col-xs-12'})
    try:
        english_bars = div.find_all('div', attrs = {'class':'english-bar ltr'})
        persian_bars = div.find_all('div', attrs = {'class':'persian-bar'})
        prise_writers = div.find_all('div', attrs = {'class':'prise-writer ltr'})
        
        n = len(english_bars)
        ven_lst = list()
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'id':site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return(ven_lst)
    except:
        return []

In [317]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [318]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [319]:
def get_site_awards(soup, site_index):
    awards_list = []
    awards = soup.select('book_soup, site_index')

    for award in awards:
        print(award.text)
        awards += [award.text]

In [320]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [321]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [322]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup,site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [323]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [324]:
links = get_links()[:500] #+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
writer_page_data_list=[]
publishers_data_list=[]
books_writers_data_list=[]
price_history_data_list=[]
book_veneration_data_list=[]

site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_37076\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_37076\1188541366.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_37076\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_37076\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()


<h1>Check Completnes</h1>

In [325]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [326]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series','language' ,'earliest_send_time', 'presence'])
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence
0,1,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,رقعی,شومیز,فارسی,17,8 مهر,موجود
1,2,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,وزیری,شومیز,فارسی,13,---,به زودی 🙄
2,3,67445,978-9640363584,کتاب رویکردهای یادگیری نظریه و کاربست,Approaches to Learning,3.07,1967,362,1399,2008,وزیری,شومیز,فارسی,5,---,تمام شد ، اما میاریمش 😏
3,4,8108,9786002961136,کتاب یک روز مناسب برای شنا قورباغه,A perfect day to swim a frog,3.06,30,92,1393,2014,رقعی,شومیز,فارسی,1,8 مهر,موجود
4,5,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,رقعی,شومیز,فارسی,2,5 مهر,موجود
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,490,48973,978-6001197413,کتاب آدم و حوا,Adam and Eve,3.11,87,126,1400,1931,رقعی,شومیز,فارسی,3,---,به زودی 🙄
543,491,81245,978-6008557036,کتاب شیری که می خواست جلال باشد,Shiri ke mikhast Jalal bashad,3.64,72,202,1396,-1,رقعی,شومیز,فارسی,1,5 مهر,موجود
544,492,3293,9789643698249,کتاب آوازهای کوچکی برای ماه,Correspondence between George Sand and Gustav...,4.1,11,456,1392,1921,رقعی,شومیز,فارسی,2,---,به زودی 🙄
545,493,73404,978-6004056830,کتاب چگونگی بازنمایی شهر در ادبیات و سینما,How the city is represented in literature and...,3.33,31,267,1400,-1,رقعی,شومیز,فارسی,1,8 مهر,موجود


In [327]:
table_of_cover_type=pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type=table_of_cover_type[table_of_cover_type['cover_material']!=-1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv',encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,شومیز
1,سلفونی
2,جلد سخت
3,جلد نرم
4,زرکوب


In [328]:
table_of_format=pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format=table_of_format[table_of_format['size']!=-1].reset_index(drop=True)
table_of_format.to_csv('./format.csv',encoding='utf-8')
table_of_format

Unnamed: 0,size
0,رقعی
1,وزیری
2,جیبی
3,رحلی
4,خشتی
5,پالتویی


In [329]:
def convert_size_to_int(size):
    try:
     return table_of_format.index[table_of_format['size']==str(size)].to_list()[0]
    except:
        return -1

tableOfData['size']=tableOfData['size'].apply(convert_size_to_int)

In [330]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence
0,1,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,0,شومیز,فارسی,17,8 مهر,موجود
1,2,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,1,شومیز,فارسی,13,---,به زودی 🙄
2,3,67445,978-9640363584,کتاب رویکردهای یادگیری نظریه و کاربست,Approaches to Learning,3.07,1967,362,1399,2008,1,شومیز,فارسی,5,---,تمام شد ، اما میاریمش 😏
3,4,8108,9786002961136,کتاب یک روز مناسب برای شنا قورباغه,A perfect day to swim a frog,3.06,30,92,1393,2014,0,شومیز,فارسی,1,8 مهر,موجود
4,5,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,0,شومیز,فارسی,2,5 مهر,موجود
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,490,48973,978-6001197413,کتاب آدم و حوا,Adam and Eve,3.11,87,126,1400,1931,0,شومیز,فارسی,3,---,به زودی 🙄
543,491,81245,978-6008557036,کتاب شیری که می خواست جلال باشد,Shiri ke mikhast Jalal bashad,3.64,72,202,1396,-1,0,شومیز,فارسی,1,5 مهر,موجود
544,492,3293,9789643698249,کتاب آوازهای کوچکی برای ماه,Correspondence between George Sand and Gustav...,4.1,11,456,1392,1921,0,شومیز,فارسی,2,---,به زودی 🙄
545,493,73404,978-6004056830,کتاب چگونگی بازنمایی شهر در ادبیات و سینما,How the city is represented in literature and...,3.33,31,267,1400,-1,0,شومیز,فارسی,1,8 مهر,موجود


In [331]:
def convert_cover_type_to_int(material):
    try:
     return table_of_cover_type.index[table_of_cover_type['cover_material']==str(material)].to_list()[0]
    except:
        return -1

tableOfData['cover_material']=tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [343]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence
0,1,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,0,0,فارسی,17,8 مهر,موجود
1,2,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,1,0,فارسی,13,---,به زودی 🙄
2,3,67445,978-9640363584,کتاب رویکردهای یادگیری نظریه و کاربست,Approaches to Learning,3.07,1967,362,1399,2008,1,0,فارسی,5,---,تمام شد ، اما میاریمش 😏
3,4,8108,9786002961136,کتاب یک روز مناسب برای شنا قورباغه,A perfect day to swim a frog,3.06,30,92,1393,2014,0,0,فارسی,1,8 مهر,موجود
4,5,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,0,0,فارسی,2,5 مهر,موجود
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,490,48973,978-6001197413,کتاب آدم و حوا,Adam and Eve,3.11,87,126,1400,1931,0,0,فارسی,3,---,به زودی 🙄
543,491,81245,978-6008557036,کتاب شیری که می خواست جلال باشد,Shiri ke mikhast Jalal bashad,3.64,72,202,1396,-1,0,0,فارسی,1,5 مهر,موجود
544,492,3293,9789643698249,کتاب آوازهای کوچکی برای ماه,Correspondence between George Sand and Gustav...,4.1,11,456,1392,1921,0,0,فارسی,2,---,به زودی 🙄
545,493,73404,978-6004056830,کتاب چگونگی بازنمایی شهر در ادبیات و سینما,How the city is represented in literature and...,3.33,31,267,1400,-1,0,0,فارسی,1,8 مهر,موجود


In [344]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence
0,1,25277,978-9643691271,کتاب جنگ فجیع جهانی اول,The Frightful First World War,3.75,11,128,1402,1998,0,0,فارسی,17,8 مهر,موجود
1,2,25671,978-9643124694,کتاب مدیریت منابع انسانی,Human resources management,3.8,66,496,1398,-1,1,0,فارسی,13,---,به زودی 🙄
2,3,67445,978-9640363584,کتاب رویکردهای یادگیری نظریه و کاربست,Approaches to Learning,3.07,1967,362,1399,2008,1,0,فارسی,5,---,تمام شد ، اما میاریمش 😏
3,4,8108,9786002961136,کتاب یک روز مناسب برای شنا قورباغه,A perfect day to swim a frog,3.06,30,92,1393,2014,0,0,فارسی,1,8 مهر,موجود
4,5,19941,978-6006935157,کتاب نوربرت خرگردن,Norberto Nucagorda,3.86,11,56,1398,1987,0,0,فارسی,2,5 مهر,موجود
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,490,48973,978-6001197413,کتاب آدم و حوا,Adam and Eve,3.11,87,126,1400,1931,0,0,فارسی,3,---,به زودی 🙄
543,491,81245,978-6008557036,کتاب شیری که می خواست جلال باشد,Shiri ke mikhast Jalal bashad,3.64,72,202,1396,-1,0,0,فارسی,1,5 مهر,موجود
544,492,3293,9789643698249,کتاب آوازهای کوچکی برای ماه,Correspondence between George Sand and Gustav...,4.1,11,456,1392,1921,0,0,فارسی,2,---,به زودی 🙄
545,493,73404,978-6004056830,کتاب چگونگی بازنمایی شهر در ادبیات و سینما,How the city is represented in literature and...,3.33,31,267,1400,-1,0,0,فارسی,1,8 مهر,موجود


In [333]:
file_path = "bookData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [334]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(subset=['site_index','summary'])
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,تاریخ دو برابر ترسناک تر از همیشه !جنگ فجیع جه...
1,2,توجه به نیروی انسانی طی سال های اخیر بخش مهمی ...
2,3,نظریه‌های یادگیری از دروس مهم در دوره‌های کارش...
3,4,مجموعه داستان «یک روز مناسب برای شنای قورباغه»...
4,5,در دشت های پهناور آفریقا کرگدنی به نام نوربرت ...
...,...,...
494,490,
495,491,سال‌ها است که با تنها دخترم آیلی زندگی می‌کنم ...
496,492,گوستاو فلوبر (1880 - 1821) یکی از نویسندگان تأ...
497,493,نسبت میان شهر و ادبیات، نشانه‌شناسی، ادبیات تط...


In [335]:
file_path = "BookSummaryData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [336]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag']).drop_duplicates(subset=['site_index','tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,ادبیات انگلیس
1,1,داستان تاریخی
2,1,ادبیات داستانی
3,1,ادبیات کودک و نوجوان
4,1,دهه 1990 میلادی
...,...,...
2552,494,ادبیات معاصر
2553,494,ادبیات واقع گرایانه
2554,494,ادبیات ایران
2555,494,زبان تخصصی


In [337]:
file_path = "bookTagsData.csv"
tableOfData.to_csv(file_path, index=False, encoding='utf-8')

In [338]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name,link
0,11,افق,/publisher/11-%d8%a7%d9%81%d9%82
1,66,نشر نی,/publisher/66-%d9%86%d8%b4%d8%b1-%d9%86%db%8c
2,1967,دانشگاه تهران,/publisher/1967-%d8%af%d8%a7%d9%86%d8%b4%da%af...
3,30,پیدایش,/publisher/30-%d9%be%db%8c%d8%af%d8%a7%db%8c%d...
5,1259,چابک اندیش,/publisher/1259-%da%86%d8%a7%d8%a8%da%a9-%d8%a...
...,...,...,...
534,1580,کتاب طه,/publisher/1580-%da%a9%d8%aa%d8%a7%d8%a8-%d8%b...
536,1338,سرزمین اهورایی,/publisher/1338-%d8%b3%d8%b1%d8%b2%d9%85%db%8c...
537,1982,آوای نور,/publisher/1982-%d8%a2%d9%88%d8%a7%db%8c-%d9%8...
538,1531,بازاریابی,/publisher/1531-%d8%a8%d8%a7%d8%b2%d8%a7%d8%b1...


In [339]:
table_of_writer_page=pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,14319,تری دیری,/profile/14319-terry-deary
1,14929,نسرین جزنی,/profile/14929-%d9%86%d8%b3%d8%b1%db%8c%d9%86-...
2,3274,مجموعه ی نویسندگان,/profile/3274-group-of-authors
3,26997,آن جردن,/profile/26997-anne-jordan
4,4943,رضا زنگی آبادی,/profile/4943-%d8%b1%d8%b6%d8%a7-%d8%b2%d9%86%...
...,...,...,...
540,23651,تورج عاطف,/profile/23651-%d8%aa%d9%88%d8%b1%d8%ac-%d8%b9...
541,1091,گوستاو فلوبر,/profile/1091-gustave-flaubert
542,6650,ژرژ ساند,/profile/6650-george-sand
543,28891,ابوالحسن ریاضی,/profile/28891-%d8%a7%d8%a8%d9%88%d8%a7%d9%84%...


In [340]:
books_writers_data_list=list(filter(bool, books_writers_data_list))
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,25277,14319
1,25671,14929
2,67445,3274
3,67445,26997
4,8108,4943
...,...,...
568,81245,23651
569,3293,1091
570,3293,6650
571,73404,28891


In [341]:
table_of_price_history=pd.DataFrame(price_history_data_list).drop_duplicates(subset=['book_id','price','discount','date'])
table_of_price_history.to_csv('./price-history.csv',index=False,encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
0,25277,145000,15,2023-09-25 00:45:50.256746
1,25671,78000,0,2023-09-25 00:45:51.132833
2,67445,72000,0,2023-09-25 00:45:51.558868
3,8108,50000,15,2023-09-25 00:45:51.735717
4,19941,80000,15,2023-09-25 00:45:51.925564
...,...,...,...,...
542,48973,25000,0,2023-09-25 00:47:56.159654
543,81245,15000,20,2023-09-25 00:47:56.288652
544,3293,16000,0,2023-09-25 00:47:56.383811
545,73404,72000,15,2023-09-25 00:47:56.476945


In [342]:
book_veneration_data_list=list(filter(bool, book_veneration_data_list))
table_of_book_veneration=pd.DataFrame(book_veneration_data_list).drop_duplicates(subset=['id','English_Quote','Persian_Quote','Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv',index=False,encoding='utf-8')
table_of_book_veneration

Unnamed: 0,id,English_Quote,Persian_Quote,Prise_Writer
0,23,One of Faulkner’s comic masterpieces.,از شاهکارهای کمیک فاکنر,barnes and noble
1,64,Convincing and compelling.,باورپذیر و مهیج.,School Library Journal
2,64,"A highly imaginative, absolutely terrific firs...",یک رمان نخست فوق العاده خیال پردازانه و شگرف.,Barnes & Noble
3,64,"An exciting, clever read.",داستانی هیجان انگیز و هوشمندانه.,Booktopia
4,98,An engrossing forecast.,یک پیش بینی هیجان انگیز.,Publishers Weekly
...,...,...,...,...
63,459,"With its themes of epic passion, repugnant gre...",خاطرات گوچی با مضامین اشتیاق حماسی، طمع نفرت ا...,Booklist
64,479,This demanding journey transformed Morgan's wo...,این سفر طاقت فرسا ، کار مورگان را به عنوان یک ...,Booklist
65,479,An earnest person strides out into the world a...,فردی مشتاق از دنیا گام بیرون می نهد و برمی گرد...,Kirkus Reviews
66,479,with its high-powered package of New Age philo...,این کتاب با مطالب قدرتمندی از فلسفه ی عصر جدید...,Publishers Weekly
