In [274]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [275]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [276]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [277]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [278]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [279]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [280]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [281]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [282]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [283]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link=publisher_tag.get('href')
        publisher_id=publisher_link.split('/')[2].split('-')[0]
        publisher_name=publisher_tag.text.strip()
    except Exception:
        publisher_link=-1
        publisher_id=-1
        publisher_name=-1
    return {'id':publisher_id,'name':publisher_name,'link':publisher_link}

In [284]:
def get_author(soup):
    authors_list=[]          
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if(len(authors_a_tag))==0:
                return authors_list
        for author_a_tag in authors_a_tag:
                author_link=author_a_tag.get('href')
                author_id=author_link.split('/')[2].split('-')[0]
                author_name=author_a_tag.text.strip()
                authors_list.append({'id':author_id,'name':author_name,'link':author_link})
    except Exception:
         authors_list=[]         
    return authors_list

In [285]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [286]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'ŸÅÿßÿ±ÿ≥€å'
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0

        if '⁄©ÿØ ⁄©ÿ™ÿßÿ®' in text:
            code_flag = 1
        elif 'ÿ¥ÿßÿ®⁄©' in text:
            isbn_flag = 1
        elif 'ŸÇÿ∑ÿπ' in text:
            size_flag = 1
        elif 'ÿ™ÿπÿØÿßÿØ ÿµŸÅÿ≠Ÿá' in text:
            pages_flag = 1
        elif 'ÿ≥ÿßŸÑ ÿßŸÜÿ™ÿ¥ÿßÿ± ÿ¥ŸÖÿ≥€å' in text:
            per_cal_flag = 1
        elif 'ÿ≥ÿßŸÑ ÿßŸÜÿ™ÿ¥ÿßÿ± ŸÖ€åŸÑÿßÿØ€å' in text:
            ad_cal_flag = 1
        elif 'ŸÜŸàÿπ ÿ¨ŸÑÿØ' in text:
            material_flag = 1
        elif 'ÿ≤ÿ®ÿßŸÜ ⁄©ÿ™ÿßÿ®' in text:
            language_flag = 1
        elif 'ÿ≥ÿ±€å ⁄ÜÿßŸæ' in text:
            series_flag = 1
        elif 'ÿ≤ŸàÿØÿ™ÿ±€åŸÜ ÿ≤ŸÖÿßŸÜ ÿßÿ±ÿ≥ÿßŸÑ' in text:
            send_time_flag = 1

    return [code, isbn, size, pages, per_cal, ad_cal, material,language, series, send_time]

In [287]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [288]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [289]:
def get_book_detail(book_soup, site_index):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material,book_language, book_print_series, book_earliest_send_time] = get_book_attribute(book_soup)

    price_history_data_list.append({'book_id':int(book_code),'price':book_price,'discount':int(book_discount_percent),'date':str(datetime.today())})
    writer_page_data_list.extend(book_author)     #9th index is the writer column which is a list of writers
    publishers_data_list.append(book_publisher)  #8th column is dict of publisher
    writers_list_of_dict=book_author.copy()        #writes list which is a list of dictionary
    if len(writers_list_of_dict)==0:
        books_writers_data_list.append({'book_id':int(book_code),'writer_id':-1})
    else:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id':int(book_code),'writer_id':w_id['id']})
    book_publisher=book_publisher['id']               #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                  book_language,int(book_print_series), book_earliest_send_time, book_author_presence]
    return book_data

In [290]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs = {'class':'col-md-6 col-xs-12'})
    ven_lst = list()
    try:
        english_bars = div.find_all('div', attrs = {'class':'english-bar ltr'})
        persian_bars = div.find_all('div', attrs = {'class':'persian-bar'})
        prise_writers = div.find_all('div', attrs = {'class':'prise-writer ltr'})
        
        n = len(english_bars)
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'site_index':site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return(ven_lst)
    except:
        return ven_lst

In [291]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [292]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [293]:
def get_book_site_awards(book_soup, site_index):
    awards=[]
    n=len(book_soup.select('.product-features h4'))
    for i in range(0,n):
        awards.append({'site_index':site_index,'award':book_soup.select('.product-features h4')[i].text})
    return awards

In [294]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [295]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [296]:
def fast_scrape(link):
    try:
        site_soup = get_soup(link)
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup,site_index))
            site_award_data_list.extend(get_book_site_awards(site_soup,site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [297]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [298]:
links = get_links()[:200] #+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
site_award_data_list=[]

writer_page_data_list=[]
publishers_data_list=[]
books_writers_data_list=[]
price_history_data_list=[]
book_veneration_data_list=[]

site_index = 1
max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1156613770.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\3952702039.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip()
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_11760\1188541366.py", line 3, in get_dis

<h1>Check Completnes</h1>

In [299]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [300]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_index', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series','language' ,'earliest_send_time', 'presence'])
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,writers
0,1,112266,978-6226712989,⁄©ÿ™ÿßÿ® ŸÖÿ¨ÿßÿ≤,Allowed,3.13,1875,119,1402,-1,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,1,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '31542', 'name': 'ÿ¨ŸàÿßÿØ ÿ™ÿ±ÿ¥€åÿ≤€å', 'link'..."
1,2,97859,978-9645543630,⁄©ÿ™ÿßÿ® ŸÖ⁄©ÿßŸÑŸÖÿßÿ™ ÿ≠ÿ¨ÿßÿ¨ ÿπÿ™ÿ®ÿßÿ™ ÿπÿßŸÑ€åÿßÿ™ ŸÅÿßÿ±ÿ≥€å ÿ®Ÿá ÿπÿ±ÿ®€å,the dialog persian to Arabic for hojjaj,3.05,2651,166,1382,-1,ÿ¨€åÿ®€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å Ÿà ÿπÿ±ÿ®€å,1,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,"[{'id': '52921', 'name': '€åÿ≠€å€å ŸÖÿπÿ±ŸàŸÅ', 'link':..."
2,3,25277,978-9643691271,⁄©ÿ™ÿßÿ® ÿ¨ŸÜ⁄Ø ŸÅÿ¨€åÿπ ÿ¨ŸáÿßŸÜ€å ÿßŸàŸÑ,The Frightful First World War,3.75,11,128,1402,1998,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,17,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '14319', 'name': 'ÿ™ÿ±€å ÿØ€åÿ±€å', 'link': '..."
3,4,-1,-1,ÿ≥ÿ±€åÿßŸÑ ÿ≥ÿßÿπÿ™ ÿÆŸàÿ¥,Saat-e khosh,3.07,1470,-1,-1,-1,-1,-1,ŸÅÿßÿ±ÿ≥€å,-1,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,[]
4,5,115076,978-6001176869,⁄©ÿ™ÿßÿ® ÿØÿ±ÿ≥ Ÿáÿß€å€å ÿßÿ≤ ÿ≤ŸÜÿØ⁄Ø€å ŸáÿßŸà⁄©€åŸÜ⁄Ø,How to Think Like Stephen Hawking,3.86,42,176,1402,2016,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,1,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '11871', 'name': 'ÿØÿßŸÜ€åŸÑ ÿßÿ≥ŸÖ€åÿ™', 'link'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,193,110410,978-6004411028,⁄©ÿ™ÿßÿ® ÿßÿØÿ® ÿπÿßÿ¥ŸÇ€å,Adab-e Asheghi,3.36,1873,168,1398,-1,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,3,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '65522', 'name': 'ŸÖÿ≠ŸÖÿØÿ±ÿ∂ÿß ÿπÿßÿ®ÿØ€åŸÜ€å', 'l..."
220,194,57694,978-6001944031,⁄©ÿ™ÿßÿ® ÿ®ÿ±ÿßÿØÿ±ÿßŸÜ ŸàÿßŸà€åŸÑÿß 1,"Snarf Attack, Underfoodle, and the Secret of ...",3.05,1186,148,1398,2004,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,3,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '31297', 'name': 'ŸÖÿ±€å ÿ¢ŸÖÿßÿ™Ÿà', 'link': ..."
221,195,34605,978-6001212055,⁄©ÿ™ÿßÿ® ÿ≤ŸÜÿØ⁄Ø€å ŸÖ€å⁄©ŸÑ ÿ¢ŸÜ⁄ò,Life of Michelangelo,3.15,81,175,1400,1907,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,8,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,"[{'id': '85', 'name': 'ÿ±ŸàŸÖŸÜ ÿ±ŸàŸÑÿßŸÜ', 'link': '/..."
222,196,34500,978-6004366670,⁄©ÿ™ÿßÿ® ÿ¥Ÿáÿ± ÿßŸÜÿØŸàŸá,A City of Sadness,3.22,81,135,1399,2002,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,ŸÅÿßÿ±ÿ≥€å,2,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,"[{'id': '19936', 'name': 'ÿ®ÿ±ŸÜ€åÿ≥ ÿ±ŸÜŸà', 'link': ..."


In [301]:
tableOfData[tableOfData.code==-1]

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,writers
3,4,-1,-1,ÿ≥ÿ±€åÿßŸÑ ÿ≥ÿßÿπÿ™ ÿÆŸàÿ¥,Saat-e khosh,3.07,1470,-1,-1,-1,-1,-1,ŸÅÿßÿ±ÿ≥€å,-1,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,[]
22,23,-1,-1,⁄©ÿ™ÿßÿ® ÿµŸàÿ™€å ÿ®ŸÑŸá €åÿß ÿÆ€åÿ±,Yes or No,3.42,1479,-1,-1,-1,-1,-1,ŸÅÿßÿ±ÿ≥€å,-1,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,"[{'id': '1045', 'name': 'ÿßÿ≥ŸæŸÜÿ≥ÿ± ÿ¨ÿßŸÜÿ≥ŸàŸÜ', 'link..."
106,95,-1,-1,⁄©ÿ™ÿßÿ® ÿµŸàÿ™€å 50 ÿ®ÿß€åÿØ Ÿà ŸÜÿ®ÿß€åÿØ ÿØÿ± ÿ≤ŸÜÿØ⁄Ø€å ÿ≤ŸÜÿßÿ¥Ÿà€å€å,50 Bayad va Nabayad,3.54,2368,-1,-1,-1,-1,-1,ŸÅÿßÿ±ÿ≥€å,-1,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,[]
161,140,-1,-1,ÿ¢ŸÑÿ®ŸàŸÖ ŸÖŸàÿ≥€åŸÇ€å ÿ≤ÿ®€å⁄ØŸÜ€åŸÅ Ÿæÿ±ÿß€åÿ≤ŸÜÿ±,Zbigniew Preisner,3.2,2841,-1,-1,-1,-1,-1,ŸÅÿßÿ±ÿ≥€å,-1,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,[]


In [302]:
sss

NameError: name 'sss' is not defined

In [None]:
table_of_cover_type=pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type=table_of_cover_type[table_of_cover_type['cover_material']!=-1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv',encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,ÿ¥ŸàŸÖ€åÿ≤
1,ÿ≥ŸÑŸÅŸàŸÜ€å
2,ÿ¨ŸÑÿØ ÿ≥ÿÆÿ™
3,ÿ¨ŸÑÿØ ŸÜÿ±ŸÖ
4,ÿ≤ÿ±⁄©Ÿàÿ®


In [None]:
table_of_format=pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format=table_of_format[table_of_format['size']!=-1].reset_index(drop=True)
table_of_format.to_csv('./format.csv',encoding='utf-8')
table_of_format

Unnamed: 0,size
0,ÿ±ŸÇÿπ€å
1,Ÿàÿ≤€åÿ±€å
2,ÿ¨€åÿ®€å
3,ÿ±ÿ≠ŸÑ€å
4,ÿÆÿ¥ÿ™€å
5,ŸæÿßŸÑÿ™Ÿà€å€å


In [None]:
def convert_size_to_int(size):
    try:
     return table_of_format.index[table_of_format['size']==str(size)].to_list()[0]
    except:
        return -1

tableOfData['size']=tableOfData['size'].apply(convert_size_to_int)

In [None]:
def convert_cover_type_to_int(material):
    try:
     return table_of_cover_type.index[table_of_cover_type['cover_material']==str(material)].to_list()[0]
    except:
        return -1

tableOfData['cover_material']=tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [None]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence
0,1,105323,978-6005861280,⁄©ÿ™ÿßÿ® ŸÖŸÜ €å⁄© ÿßÿ≠ŸÖŸÇ Ÿáÿ≥ÿ™ŸÖ ÿ¥ŸÖÿß ⁄Üÿ∑Ÿàÿ±‚Ä¶ÿü!,"I'm an idiot, how about you...?!",3.04,1259,144,1402,-1,0,0,ŸÅÿßÿ±ÿ≥€å,3,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
1,2,19941,978-6006935157,⁄©ÿ™ÿßÿ® ŸÜŸàÿ±ÿ®ÿ±ÿ™ ÿÆÿ±⁄Øÿ±ÿØŸÜ,Norberto Nucagorda,3.86,11,56,1398,1987,0,0,ŸÅÿßÿ±ÿ≥€å,2,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
2,3,36714,978-9643111304,⁄©ÿ™ÿßÿ® ÿß€åÿ±ÿßŸÜ ÿ®ÿßÿ≥ÿ™ÿßŸÜ,Ancient Persia,3.88,48,392,1402,1998,1,1,ŸÅÿßÿ±ÿ≥€å,16,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
3,4,14109,978-6004054591,⁄©ÿ™ÿßÿ® ŸÜÿßŸÖŸá Ÿáÿß€å€å ÿ®Ÿá ÿßŸàŸÑ⁄Øÿß,Letters to Olga,3.37,31,402,1402,1983,0,2,ŸÅÿßÿ±ÿ≥€å,2,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
4,5,8108,9786002961136,⁄©ÿ™ÿßÿ® €å⁄© ÿ±Ÿàÿ≤ ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å ÿ¥ŸÜÿß ŸÇŸàÿ±ÿ®ÿßÿ∫Ÿá,A perfect day to swim a frog,3.06,30,92,1393,2014,0,0,ŸÅÿßÿ±ÿ≥€å,1,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,193,60815,978-9642739295,⁄©ÿ™ÿßÿ® ÿ±Ÿáÿß€å€å ÿßÿ≤ ÿ¥ÿßŸàÿ¥ŸÜ⁄©,The Shawshank Redemption,3.19,1427,135,1392,1996,0,0,ŸÅÿßÿ±ÿ≥€å,1,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè
220,194,43592,978-9643499587,⁄©ÿ™ÿßÿ® ÿßÿ™ŸÑ ŸÖÿ™ŸÑ ÿß€åŸÜ Ÿæÿ≥ÿ±⁄©,Atal matal in pesarak,3.75,30,32,1399,-1,4,4,ŸÅÿßÿ±ÿ≥€å,7,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè
221,195,57694,978-6001944031,⁄©ÿ™ÿßÿ® ÿ®ÿ±ÿßÿØÿ±ÿßŸÜ ŸàÿßŸà€åŸÑÿß 1,"Snarf Attack, Underfoodle, and the Secret of ...",3.05,1186,148,1398,2004,0,0,ŸÅÿßÿ±ÿ≥€å,3,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ
222,196,34500,978-6004366670,⁄©ÿ™ÿßÿ® ÿ¥Ÿáÿ± ÿßŸÜÿØŸàŸá,A City of Sadness,3.22,81,135,1399,2002,0,0,ŸÅÿßÿ±ÿ≥€å,2,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ


In [None]:
tableOfData.to_csv("bookData.csv", index=False, encoding='utf-8')

In [None]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(subset=['site_index','summary'])
tableOfSummaryData=tableOfSummaryData[tableOfSummaryData.notnull().all(axis=1)]
tableOfSummaryData

Unnamed: 0,site_index,summary
0,1,ÿß€åŸÜ ⁄©ÿ™ÿßÿ® ÿ®Ÿá ÿ®€åÿßŸÜ ŸÖÿ¥⁄©ŸÑÿßÿ™ ⁄ØŸàŸÜÿß⁄ØŸàŸÜ ÿ¨ÿßŸÖÿπŸá ŸÖÿß ŸÖ€å Ÿæÿ±...
1,2,ÿØÿ± ÿØÿ¥ÿ™ Ÿáÿß€å ŸæŸáŸÜÿßŸàÿ± ÿ¢ŸÅÿ±€åŸÇÿß ⁄©ÿ±⁄ØÿØŸÜ€å ÿ®Ÿá ŸÜÿßŸÖ ŸÜŸàÿ±ÿ®ÿ±ÿ™ ...
2,3,ÿßÿ≤ ŸÖ€åÿßŸÜ ÿ™ŸÖÿßŸÖ ÿ™ŸÖÿØŸÜ Ÿáÿß€å ÿ®ÿ≤ÿ±⁄Ø ÿ¨ŸáÿßŸÜ ÿ®ÿßÿ≥ÿ™ÿßŸÜÿå ÿ™ŸÖÿØŸÜ ÿß...
3,4,ÿß€åŸÜ ⁄©Ÿá ŸÖŸÜ ÿßŸàŸÇÿßÿ™ŸÖ ÿ±ÿß ÿØÿ± ÿß€åŸÜ ÿ¨ÿß Ÿà ÿØŸÇ€åŸÇÿß ÿØÿ± ÿß€åŸÜ ÿ¨...
4,5,ŸÖÿ¨ŸÖŸàÿπŸá ÿØÿßÿ≥ÿ™ÿßŸÜ ¬´€å⁄© ÿ±Ÿàÿ≤ ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å ÿ¥ŸÜÿß€å ŸÇŸàÿ±ÿ®ÿßÿ∫Ÿá¬ª...
...,...,...
195,193,⁄©ÿ™ÿßÿ® ÿ≠ÿßÿ∂ÿ±ÿå ŸÖÿ™ŸÜ ŸÅ€åŸÑŸÖ‚ÄåŸÜÿßŸÖŸá ¬´ÿ±Ÿáÿß€å€å ÿßÿ≤ ÿ¥ÿßŸàÿ¥ŸÜ⁄Ø¬ª ÿßÿ≥ÿ™...
196,194,ÿ™ÿ±ÿßŸÜŸá Ÿáÿß€å ÿß€åŸÜ ŸÖÿ¨ŸÖŸàÿπŸá ÿ®ÿß ÿ¨ŸÑÿ® ÿ™Ÿàÿ¨Ÿá ÿÆÿ±ÿØÿ≥ÿßŸÑÿßŸÜ ÿ®Ÿá ŸÖ...
197,195,ÿß€åŸÜ ⁄©ÿ™ÿßÿ® ÿØÿßÿ≥ÿ™ÿßŸÜ ÿØŸà ÿ®ÿ±ÿßÿØÿ± ⁄©ŸÑŸá ŸæŸà⁄© Ÿà ÿ®ÿßŸÖÿ≤Ÿá ÿ±ÿß ÿØÿ±...
198,196,ÿ®ÿ±ŸÜ€åÿ≥ ÿ±ŸÜŸà ÿßÿ≥ÿ™ÿßÿØ ŸÖŸàÿ≥ÿ≥Ÿá ŸáŸÜÿ± ⁄©ÿßŸÑ€åŸÅÿ±ŸÜ€åÿß ÿßÿ≤ ÿ±ÿ¥ÿ™Ÿá Ÿáÿß...


In [None]:
tableOfSummaryData.to_csv( "BookSummaryData.csv", index=False, encoding='utf-8')

In [None]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag'])\
    .drop_duplicates(subset=['site_index','tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,ÿßÿØÿ®€åÿßÿ™ ŸÖÿπÿßÿµÿ±
1,1,ÿßÿØÿ®€åÿßÿ™ ŸàÿßŸÇÿπ ⁄Øÿ±ÿß€åÿßŸÜŸá
2,1,ÿ±ŸàÿßŸÜÿ¥ŸÜÿßÿ≥€å
3,1,ÿßÿØÿ®€åÿßÿ™ ÿß€åÿ±ÿßŸÜ
4,2,ÿßÿØÿ®€åÿßÿ™ ÿØÿßÿ≥ÿ™ÿßŸÜ€å
...,...,...
1007,197,ÿßÿØÿ®€åÿßÿ™ ⁄©ŸÑÿßÿ≥€å⁄©
1008,197,ÿßÿØÿ®€åÿßÿ™ ŸàÿßŸÇÿπ ⁄Øÿ±ÿß€åÿßŸÜŸá
1009,197,ÿØŸáŸá 1900 ŸÖ€åŸÑÿßÿØ€å
1010,197,⁄©ÿ™ÿßÿ® ŸÖÿµŸàÿ±


In [None]:
table_of_tag=pd.DataFrame(tableOfSiteTagsData['tag']).drop_duplicates(subset=['tag']).reset_index(drop=True)
table_of_tag.to_csv('./tag.csv',encoding='utf-8')
table_of_tag

Unnamed: 0,tag
0,ÿßÿØÿ®€åÿßÿ™ ŸÖÿπÿßÿµÿ±
1,ÿßÿØÿ®€åÿßÿ™ ŸàÿßŸÇÿπ ⁄Øÿ±ÿß€åÿßŸÜŸá
2,ÿ±ŸàÿßŸÜÿ¥ŸÜÿßÿ≥€å
3,ÿßÿØÿ®€åÿßÿ™ ÿß€åÿ±ÿßŸÜ
4,ÿßÿØÿ®€åÿßÿ™ ÿØÿßÿ≥ÿ™ÿßŸÜ€å
...,...
200,ŸÜÿßÿØÿßÿ≥ÿ™ÿßŸÜ
201,ÿ¢ŸÖŸàÿ≤ÿ¥ Ÿàÿ±ÿ≤ÿ¥€å
202,ÿØÿ±ÿßŸÖ
203,ÿØÿßÿ≥ÿ™ÿßŸÜ ŸÖÿßÿ¨ÿ±ÿß€å€å


In [None]:
def convert_tag_to_int(tag):
    try:
     return table_of_tag.index[table_of_tag['tag']==str(tag)].to_list()[0]
    except:
        return -1

tableOfSiteTagsData['tag']=tableOfSiteTagsData['tag'].apply(convert_tag_to_int)

In [None]:
tableOfSiteTagsData.to_csv('bookTagsData.csv',index=False,encoding='utf-8')

In [None]:
table_of_publisher=pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id','name','link'])
table_of_publisher.to_csv('./publisher.csv',index=False,encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name,link
0,1259,⁄Üÿßÿ®⁄© ÿßŸÜÿØ€åÿ¥,/publisher/1259-%da%86%d8%a7%d8%a8%da%a9-%d8%a...
1,11,ÿßŸÅŸÇ,/publisher/11-%d8%a7%d9%81%d9%82
2,48,ŸÇŸÇŸÜŸàÿ≥,/publisher/48-%d9%82%d9%82%d9%86%d9%88%d8%b3
3,31,ŸÜÿ¥ÿ± ÿ´ÿßŸÑÿ´,/publisher/31-%d9%86%d8%b4%d8%b1-%d8%ab%d8%a7%...
4,30,Ÿæ€åÿØÿß€åÿ¥,/publisher/30-%d9%be%db%8c%d8%af%d8%a7%db%8c%d...
...,...,...,...
217,1265,ÿ≠ÿ±ŸÅŸá ŸÜŸà€åÿ≥ŸÜÿØŸá,/publisher/1265-%d8%ad%d8%b1%d9%81%d9%87-%d9%8...
218,1405,ŸÅÿ±ÿßÿ±ŸàÿßŸÜ,/publisher/1405-%d9%81%d8%b1%d8%a7%d8%b1%d9%88...
219,1427,ŸÅÿßÿ±ÿßÿ®€å,/publisher/1427-%d9%81%d8%a7%d8%b1%d8%a7%d8%a8...
221,1186,ÿ≠Ÿàÿ∂ ŸÜŸÇÿ±Ÿá,/publisher/1186-%d8%ad%d9%88%d8%b6-%d9%86%d9%8...


In [None]:
table_of_writer_page=pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id','name','link'])
table_of_writer_page.to_csv('./writer_page.csv',index=False,encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,17747,ÿ≠ÿ≥ŸÜ ⁄Üÿßÿ®⁄©,/profile/17747-hasan-chabok
1,11484,ŸÖ€åÿ¥ÿßÿ¶€åŸÑ ÿßŸÜÿØŸá,/profile/11484-michael-ende
2,18389,€åŸàÿ≤ŸÅ Ÿà€åÿ≤ŸáŸàŸÅÿ±,/profile/18389-josef-wieseh%c3%b6fer
3,7713,Ÿàÿßÿ™ÿ≥ŸÑÿßŸÅ ŸáÿßŸàŸÑ,/profile/7713-v%c3%a1clav-havel
4,4943,ÿ±ÿ∂ÿß ÿ≤ŸÜ⁄Ø€å ÿ¢ÿ®ÿßÿØ€å,/profile/4943-%d8%b1%d8%b6%d8%a7-%d8%b2%d9%86%...
...,...,...,...
218,33432,ŸÅÿ±ÿßŸÜ⁄© ÿØÿßÿ±ÿßÿ®ÿßŸÜÿ™,/profile/33432-frank-darabont
219,24833,ÿ±Ÿáÿß ÿ≤ÿßÿØŸÖŸáÿ±,/profile/24833-%d8%b1%d9%87%d8%a7-%d8%b2%d8%a7...
220,31297,ŸÖÿ±€å ÿ¢ŸÖÿßÿ™Ÿà,/profile/31297-mary-amato
221,19936,ÿ®ÿ±ŸÜ€åÿ≥ ÿ±ŸÜŸà,/profile/19936-b%c3%a9r%c3%a9nice-reynaud


In [None]:
books_writers_data_list=list(filter(bool, books_writers_data_list))
table_of_writer=pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id','writer_id'])
#drop rows which both book_id and writer_id is -1
# table_of_writer=table_of_writer[(table_of_writer['book_id']!=-1) | (table_of_writer['writer_id']!=-1)]
table_of_writer.to_csv('./writer.csv',index=False,encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,105323,17747
1,19941,11484
2,36714,18389
3,14109,7713
4,8108,4943
...,...,...
227,60815,33432
228,43592,24833
229,57694,31297
230,34500,19936


In [None]:
table_of_price_history=pd.DataFrame(price_history_data_list).drop_duplicates(subset=['book_id','price','discount','date'])
table_of_price_history.to_csv('./price-history.csv',index=False,encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
0,105323,69000,15,2023-09-25 08:34:03.302672
1,19941,80000,15,2023-09-25 08:34:04.649521
2,36714,220000,15,2023-09-25 08:34:05.822251
3,14109,350000,15,2023-09-25 08:34:06.443938
4,8108,50000,15,2023-09-25 08:34:06.551931
...,...,...,...,...
219,60815,8000,0,2023-09-25 08:34:50.561454
220,43592,80000,0,2023-09-25 08:34:50.647022
221,57694,140000,25,2023-09-25 08:34:50.732730
222,34500,74000,15,2023-09-25 08:34:50.824738


In [None]:
book_veneration_data_list=list(filter(bool, book_veneration_data_list))
table_of_book_veneration=pd.DataFrame(book_veneration_data_list).drop_duplicates(subset=['site_index','English_Quote','Persian_Quote','Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv',index=False,encoding='utf-8')
table_of_book_veneration

Unnamed: 0,site_index,English_Quote,Persian_Quote,Prise_Writer
0,22,One of Faulkner‚Äôs comic masterpieces.,ÿßÿ≤ ÿ¥ÿßŸá⁄©ÿßÿ±Ÿáÿß€å ⁄©ŸÖ€å⁄© ŸÅÿß⁄©ŸÜÿ±,barnes and noble
1,58,Convincing and compelling.,ÿ®ÿßŸàÿ±Ÿæÿ∞€åÿ± Ÿà ŸÖŸá€åÿ¨.,School Library Journal
2,58,"A highly imaginative, absolutely terrific firs...",€å⁄© ÿ±ŸÖÿßŸÜ ŸÜÿÆÿ≥ÿ™ ŸÅŸàŸÇ ÿßŸÑÿπÿßÿØŸá ÿÆ€åÿßŸÑ Ÿæÿ±ÿØÿßÿ≤ÿßŸÜŸá Ÿà ÿ¥⁄Øÿ±ŸÅ.,Barnes & Noble
3,58,"An exciting, clever read.",ÿØÿßÿ≥ÿ™ÿßŸÜ€å Ÿá€åÿ¨ÿßŸÜ ÿßŸÜ⁄Ø€åÿ≤ Ÿà ŸáŸàÿ¥ŸÖŸÜÿØÿßŸÜŸá.,Booktopia
4,100,An engrossing forecast.,€å⁄© Ÿæ€åÿ¥ ÿ®€åŸÜ€å Ÿá€åÿ¨ÿßŸÜ ÿßŸÜ⁄Ø€åÿ≤.,Publishers Weekly
5,100,"Original, accessible, and provocative.",ÿ®ÿØ€åÿπÿå ŸÇÿßÿ®ŸÑ ŸÅŸáŸÖ Ÿà ÿ®ÿ±ÿßŸÜ⁄Ø€åÿ≤ÿßŸÜŸÜÿØŸá.,Science
6,100,A compelling guide to the challenges and choic...,ÿ±ÿßŸáŸÜŸÖÿß€å€å ÿ¨ÿ∞ÿßÿ® ÿ®ÿ±ÿß€å ⁄ÜÿßŸÑÿ¥ Ÿáÿß Ÿà ÿßŸÜÿ™ÿÆÿßÿ® Ÿáÿß€å Ÿæ€åÿ¥ ÿ±Ÿà...,Elon Musk
7,101,This stunning work showcases Krauss's consiste...,ÿß€åŸÜ ÿßÿ´ÿ± ÿÆ€åÿ±Ÿá ⁄©ŸÜŸÜÿØŸáÿå ŸÜÿ¥ÿßŸÜ ÿØŸáŸÜÿØŸá €å ÿßÿ≥ÿ™ÿπÿØÿßÿØ ŸáŸÖ€åÿ¥⁄Ø...,Publishers Weekly
8,101,"Masterful, evocative and moving.",ÿßÿ≥ÿ™ÿßÿØÿßŸÜŸáÿå ÿßÿ≠ÿ≥ÿßÿ≥ ÿ®ÿ±ÿßŸÜ⁄Ø€åÿ≤ Ÿà ÿ™⁄©ÿßŸÜ ÿØŸáŸÜÿØŸá.,NPR
9,101,A meditation on memory and loss.,ÿ™ÿ£ŸÖŸÑ€å ÿ®ÿ± ÿÆÿßÿ∑ÿ±Ÿá Ÿà ŸÅŸÇÿØÿßŸÜ.,Los Angeles Times


In [None]:
table_of_award=pd.DataFrame(site_award_data_list).drop_duplicates(subset=['site_index','award'])
table_of_award.to_csv('./award.csv',index=False,encoding='utf-8')
table_of_award

Unnamed: 0,site_index,award
0,22,ÿ®ÿ±ŸÜÿØŸá €å ÿ¨ÿß€åÿ≤Ÿá €å ŸæŸàŸÑ€åÿ™ÿ≤ÿ± ÿ≥ÿßŸÑ €±€π€∂€≥
1,48,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ŸæŸàŸÑ€åÿ™ÿ≤ÿ±
2,48,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá €å ŸÜŸÖÿß€åÿ¥ŸÜÿßŸÖŸá €å ÿ≠ŸÑŸÇŸá €å ŸÖŸÜÿ™ŸÇÿØ€åŸÜ ŸÜ€åŸà...
3,48,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá Tony ÿ≥ÿßŸÑ 1987
4,58,ŸÜÿßŸÖÿ≤ÿØ ÿ¨ÿß€åÿ≤Ÿá ⁄©ÿ™ÿßÿ® ŸÜÿßÿ¥ÿ± ŸÖÿ≥ÿ™ŸÇŸÑ ÿ≥ÿßŸÑ 1999
5,58,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ÿÆŸàÿßŸÜŸÜÿØŸá ⁄Øÿ±ŸÜÿØ ⁄©ŸÜ€åŸàŸÜ ÿ≥ÿßŸÑ 1998
6,58,ŸÜÿßŸÖÿ≤ÿØ ŸÖÿØÿßŸÑ ÿÆŸàÿßŸÜŸÜÿØ⁄ØÿßŸÜ ÿ¨ŸàÿßŸÜ ⁄©ÿßŸÑ€åŸÅÿ±ŸÜ€åÿß ÿ≥ÿßŸÑ 1998
7,58,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ÿ≥⁄©Ÿà€åÿß ÿßŸà⁄©ŸÑÿßŸáŸÖÿß ÿ≥ÿßŸÑ 1998
8,100,ÿßÿ≤ Ÿæÿ±ŸÅÿ±Ÿàÿ¥ ÿ™ÿ±€åŸÜ ⁄©ÿ™ÿßÿ® Ÿáÿß€å ŸÜ€åŸà€åŸàÿ±⁄© ÿ™ÿß€åŸÖÿ≤
9,101,ÿ®ÿ±ŸÜÿØŸá €å ÿ¨ÿß€åÿ≤Ÿá €å Anisfield-Wolf ÿ≥ÿßŸÑ 2011
