In [1]:
#import all libraries
import pandas as pd
import requests
import bs4
import re
import logging
import concurrent.futures
import threading
from time import sleep
from datetime import datetime

In [2]:
def get_links():
    urls = list(pd.read_csv('books_url.csv')['link'])
    return urls

In [3]:
def get_response(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    return response

In [4]:
def get_soup(input_url):
    headers = {
        'User-Agent': 'My User Agent 1.0',
        "Accept-Language": "en-US,en;q=0.5"
    }
    response = requests.get(input_url, headers=headers)
    if response.status_code != 200:
        print("Error in getting link")
        print("response code is : ", response.status_code)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    return soup

In [5]:
def get_fa_title(soup):
    title = soup.select('.product-name strong')[0]
    return title.text


In [6]:
def get_en_title(soup):
    title = soup.select('.product-name-englishname')[0]
    return title.text

In [7]:
def get_price(soup):
    price = soup.select('.price-broken , .col-md-7 .price:nth-child(1)')[0].text
    return int(price.replace(',', ''))

In [8]:
def get_discount(soup):
    try:
        discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
        discount_price = ((get_price(soup) - discount_price) / get_price(soup)) * 100
    except Exception:
        discount_price = 0
        logging.exception("This book has no discount!")
    return discount_price

In [9]:
def get_score(soup):
    soup = soup.find('div', {'class': 'col-md-7'}).find('li', {'class': 'pull-left'}).find('div',
                                                                                           {'class': 'my-rating'})
    soup_str = str(soup)

    match = re.search(r'data-rating="(\d+\.\d+)"', soup_str)
    if match:
        data_rating = match.group(1)
        return data_rating


In [10]:
def get_publisher(soup):
    try:
        publisher_tag = soup.select('div.prodoct-attribute-items:nth-child(1) > a')[0]
        publisher_link = publisher_tag.get('href')
        publisher_id = publisher_link.split('/')[2].split('-')[0]
        publisher_name = publisher_tag.text.strip()
    except Exception:
        publisher_link = -1
        publisher_id = -1
        publisher_name = -1
    return {'id': publisher_id, 'name': publisher_name}

In [11]:
def get_author(soup):
    authors_list = []
    try:
        authors_a_tag = soup.select('.prodoct-attribute-items+ .prodoct-attribute-items > a')
        if (len(authors_a_tag)) == 0:
            return authors_list
        for author_a_tag in authors_a_tag:
            author_link = author_a_tag.get('href')
            author_id = author_link.split('/')[2].split('-')[0]
            author_name = author_a_tag.text.strip()
            authors_list.append({'id': author_id, 'name': author_name, 'link': author_link})
    except Exception:
        authors_list = []
    return authors_list

In [12]:
def is_author_available(soup):
    try:
        existence = soup.select('.pull-left+ li span')[0].text
    except:
        existence = None
        logging.exception("This book has no author!")
    return existence


In [13]:
def get_book_attribute(soup):
    rows = soup.find('table', {'class': 'product-table'}).findAll('td')
    code = -1
    isbn = -1
    size = -1
    pages = -1
    per_cal = -1
    ad_cal = -1
    material = -1
    series = -1
    send_time = -1
    language = 'ŸÅÿßÿ±ÿ≥€å'
    translators = []
    paper_type = -1
    code_flag = 0
    isbn_flag = 0
    size_flag = 0
    pages_flag = 0
    per_cal_flag = 0
    ad_cal_flag = 0
    material_flag = 0
    language_flag = 0
    series_flag = 0
    send_time_flag = 0
    translators_flag = 0
    paper_type_flag = 0

    for row in rows:
        text = row.text.strip()
        if code_flag == 1:
            code = int(text)
            code_flag = 0
        elif isbn_flag == 1:
            isbn = text
            isbn = re.sub('[^0-9-]', '', isbn)
            isbn_flag = 0
        elif size_flag == 1:
            size = text
            size_flag = 0
        elif pages_flag == 1:
            pages = int(text)
            pages_flag = 0
        elif per_cal_flag == 1:
            per_cal = int(text)
            per_cal_flag = 0
        elif ad_cal_flag == 1:
            ad_cal = int(text)
            ad_cal_flag = 0
        elif material_flag == 1:
            material = text
            material_flag = 0
        elif language_flag == 1:
            language = text
            language_flag = 0
        elif series_flag == 1:
            series = int(text)
            series_flag = 0
        elif send_time_flag == 1:
            send_time = text
            send_time_flag = 0
        elif paper_type_flag == 1:
            paper_type = text
            paper_type_flag = 0
        elif translators_flag == 1:
            translators_a_tag = row.select('a')
            translators_flag = 0
            for a_tag in translators_a_tag:
                translators.append({'id': a_tag.get('href').split('/')[2].split('-')[0], 'name': a_tag.text.strip(),
                                    'link': a_tag.get('href')})

        if '⁄©ÿØ ⁄©ÿ™ÿßÿ®' in text:
            code_flag = 1
        elif 'ÿ¥ÿßÿ®⁄©' in text:
            isbn_flag = 1
        elif 'ŸÇÿ∑ÿπ' in text:
            size_flag = 1
        elif 'ÿ™ÿπÿØÿßÿØ ÿµŸÅÿ≠Ÿá' in text:
            pages_flag = 1
        elif 'ÿ≥ÿßŸÑ ÿßŸÜÿ™ÿ¥ÿßÿ± ÿ¥ŸÖÿ≥€å' in text:
            per_cal_flag = 1
        elif 'ÿ≥ÿßŸÑ ÿßŸÜÿ™ÿ¥ÿßÿ± ŸÖ€åŸÑÿßÿØ€å' in text:
            ad_cal_flag = 1
        elif 'ŸÜŸàÿπ ÿ¨ŸÑÿØ' in text:
            material_flag = 1
        elif 'ÿ≤ÿ®ÿßŸÜ ⁄©ÿ™ÿßÿ®' in text:
            language_flag = 1
        elif 'ÿ≥ÿ±€å ⁄ÜÿßŸæ' in text:
            series_flag = 1
        elif 'ÿ≤ŸàÿØÿ™ÿ±€åŸÜ ÿ≤ŸÖÿßŸÜ ÿßÿ±ÿ≥ÿßŸÑ' in text:
            send_time_flag = 1
        elif 'ŸÖÿ™ÿ±ÿ¨ŸÖ' in text:
            translators_flag = 1
        elif 'ŸÜŸàÿπ ⁄©ÿßÿ∫ÿ∞' in text:
            paper_type_flag = 1

    return [code, isbn, size, pages, per_cal, ad_cal, material, series, language, send_time, translators, paper_type]

In [14]:
def get_summary(soup):
    summary = soup.select('.product-description')[0].text.strip()
    return summary

In [15]:
def get_tags(soup):
    tags = soup.select('.product-tags-item')
    tags_list = []
    for tag in tags:
        tags_list += [tag.text.strip()]
    return tags_list

In [16]:
def get_book_detail(book_soup, book_id):
    book_fa_title = get_fa_title(book_soup)
    book_en_title = get_en_title(book_soup)
    book_price = get_price(book_soup)
    book_discount_percent = get_discount(book_soup)
    book_score = get_score(book_soup)
    book_publisher = get_publisher(book_soup)
    book_author = get_author(book_soup)
    book_author_presence = is_author_available(book_soup)

    [book_code, book_Isbn, book_size, book_pages, book_publication_per_date, book_publication_ad_date,
     book_cover_material, book_print_series, book_language, book_earliest_send_time, book_translators,
     paper_type] = get_book_attribute(book_soup)
    price_history_data_list.append(
        {'book_id': int(book_code), 'price': book_price, 'discount': int(book_discount_percent),
         'date': str(datetime.today())})

    publishers_data_list.append(book_publisher)  #8th column is dict of publisher

    #writer
    writer_page_data_list.extend(book_author)  #9th index is the writer column which is a list of writers
    writers_list_of_dict = book_author.copy()  #writes list which is a list of dictionary
    if len(writers_list_of_dict) != 0:
        for w_id in writers_list_of_dict:
            books_writers_data_list.append({'book_id': int(book_code), 'writer_id': w_id['id']})

    #translator

    translator_page_data_list.extend(book_translators)
    translators_list_of_dict = book_translators.copy()  #writes list which is a list of dictionary
    if len(translators_list_of_dict) != 0:
        for t_id in translators_list_of_dict:
            books_translators_data_list.append({'book_id': int(book_code), 'translator_id': t_id['id']})
    book_publisher = book_publisher['id']  #convert 8th column from dict to the publisher's id
    book_data = [site_index, int(book_code), book_Isbn, book_fa_title, book_en_title,
                 book_score, book_publisher,
                 int(book_pages), int(book_publication_per_date), int(book_publication_ad_date), book_size,
                 book_cover_material,
                 int(book_print_series), book_language, book_earliest_send_time, book_author_presence, paper_type]
    return book_data

In [17]:
def get_book_site_veneration(soup, site_index):
    div = soup.find('div', attrs={'class': 'col-md-6 col-xs-12'})
    ven_lst = list()
    try:
        english_bars = div.find_all('div', attrs={'class': 'english-bar ltr'})
        persian_bars = div.find_all('div', attrs={'class': 'persian-bar'})
        prise_writers = div.find_all('div', attrs={'class': 'prise-writer ltr'})

        n = len(english_bars)
        for i in range(n):
            english_quote = english_bars[i].text.strip()
            persian_quote = persian_bars[i].text.strip()
            prise_writer = prise_writers[i].text.strip()

            ven_dic = {'site_index': site_index,
                       'English_Quote': english_quote,
                       'Persian_Quote': persian_quote,
                       'Prise_Writer': prise_writer}
            ven_lst.append(ven_dic)
        return (ven_lst)
    except:
        return ven_lst

In [18]:
def get_book_site_summary(book_soup, site_index):
    try:
        book_summary = get_summary(book_soup)
    except Exception:
        book_summary = None
        logging.exception("This book has no summary!")
    return [site_index, book_summary]

In [19]:
def get_book_site_tags(book_soup, site_index):
    book_tags = get_tags(book_soup)
    book_tags_list = []
    for tag in book_tags:
        book_tags_list += [[site_index, tag]]
    return book_tags_list

In [20]:
def get_book_site_awards(book_soup, site_index):
    awards = []
    n = len(book_soup.select('.product-features h4'))
    for i in range(0, n):
        awards.append({'site_index': site_index, 'award': book_soup.select('.product-features h4')[i].text})
    return awards

In [21]:
def get_req_list(list, req_count):
    if len(list) >= req_count:
        request_list = list[:req_count].copy()
    else:
        request_list = list.copy()
    return request_list

In [22]:
def scrape(site_soup):
    try:
        with lock:
            global site_index
            site_summary_data_list.append(get_book_site_summary(site_soup, site_index))
            site_tags_data_list.extend(get_book_site_tags(site_soup, site_index))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], site_index)
                books_data_list.append(data)
            site_index += 1
    except Exception:
        logging.exception("An error occurred")

In [51]:
def fast_scrape(link):
    try:
        book_id = re.findall(r"https://www\.iranketab\.ir/book/(\d+).*", link)[0]
        site_soup = get_soup(link)
        with lock:
            site_summary_data_list.append(get_book_site_summary(site_soup, book_id))
            site_tags_data_list.extend(get_book_site_tags(site_soup, book_id))
            book_veneration_data_list.extend(get_book_site_veneration(site_soup, book_id))
            site_award_data_list.extend(get_book_site_awards(site_soup, book_id))
            site_page_books = site_soup.select('.clearfix .clearfix .row')
            for book_index in range(0, len(site_page_books), 2):
                data = get_book_detail(site_page_books[book_index], book_id)
                # price_history_data_list.append({'book_id':data[1],'price':data[5],'discount':data[6],'date':str(datetime.today())})
                # writer_page_data_list.extend(data[9])     #9th index is the writer column which is a list of writers
                # publishers_data_list.append(data[8])  #8th column is dict of publisher
                # data[8]=data[8]['id']                 #convert 8th column from dict to the publisher's id
                # writers_list_of_dict=data[9]          #writes list which is a list of dictionary
                # data.pop(9)                           #remove 9th column from data(9th column was writers)
                # data.pop(5)                           #remove 5th column which is price column
                # data.pop(5)                           #remove 5th column which is discount after deleting price
                # for w_id in writers_list_of_dict:
                #      books_writers_data_list.append({'book_id':data[1],'writer_id':w_id['id']})
                books_data_list.append(data)
    except Exception:
        logging.exception("An error occurred")


<h1>Detailed Scraper</h1>

In [24]:
# links = get_links()[:200] + ['https://www.iranketab.ir/book/270-gone-with-the-wind']

# page_response = []
# books_data_list = []
# site_tags_data_list = []
# site_summary_data_list = []

# site_index = 1
# sleep_time = 0.5
# max_threads = 20
# book_count_request = 20  #number of requests per time

# lock = threading.Lock()
# book_urls = links.copy()

# while len(book_urls):
#     sleep(sleep_time)  #sleep so that the site does not ban us
#     request_list = get_req_list(book_urls, book_count_request)  #list of book's urls we want to send request 
#     with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
#         future_list = executor.map(get_response, request_list)
#         for future in future_list:
#             try:
#                 data = future
#                 page_response.append(data)
#             except Exception as exc:
#                 continue
#         for item in page_response:
#             if item.status_code == 200:
#                 page_url = item.url
#                 if page_url in request_list:
#                     page_soup = bs4.BeautifulSoup(item.content, 'html.parser')
#                     scrape(page_soup)
#                     book_urls.remove(page_url)


<h1>Fast Scraper</h1>

In [52]:
links = get_links()[:200]
#+ ['https://www.iranketab.ir/book/270-gone-with-the-wind']

books_data_list = []
site_tags_data_list = []
site_summary_data_list = []
site_award_data_list = []

writer_page_data_list = []
translator_page_data_list = []
publishers_data_list = []
price_history_data_list = []
book_veneration_data_list = []
#middle tables
books_writers_data_list = []
books_translators_data_list = []

max_threads = 20

lock = threading.Lock()

with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
    executor.map(fast_scrape, links)


ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_11352\2187018050.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no discount!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_11352\2187018050.py", line 3, in get_discount
    discount_price = int(soup.select('.col-md-12+ .clearfix .price-special')[0].text.replace(',', ''))
IndexError: list index out of range
ERROR:root:This book has no summary!
Traceback (most recent call last):
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_11352\378574974.py", line 3, in get_book_site_summary
    book_summary = get_summary(book_soup)
  File "C:\Users\raeim\AppData\Local\Temp\ipykernel_11352\1354406443.py", line 2, in get_summary
    summary = soup.select('.product-description')[0].text.strip

<h1>Check Completnes</h1>

In [26]:
# if len(book_urls) == 0:
#     print('All links scraped!')
# else:
#     print('Something wrong happened')

<h1>Make Dataframes</h1>

In [55]:
tableOfData = pd.DataFrame(books_data_list,
                           columns=['site_id', 'code', 'Isbn', 'fa_title', 'en_title', 'score',
                                    'publisher_id', 'pages', 'publication_per_date', 'publication_ad_date',
                                    'size', 'cover_material', 'print_series', 'language', 'earliest_send_time',
                                    'presence', 'paper_type'])
tableOfData = tableOfData[tableOfData['code'] != -1]

In [56]:
tableOfData

Unnamed: 0,site_id,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
1,8108,8108,9786002961136,⁄©ÿ™ÿßÿ® €å⁄© ÿ±Ÿàÿ≤ ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å ÿ¥ŸÜÿß ŸÇŸàÿ±ÿ®ÿßÿ∫Ÿá,A perfect day to swim a frog,3.06,30,92,1393,2014,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,1,ŸÅÿßÿ±ÿ≥€å,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
2,67734,67734,978-6007084205,⁄©ÿ™ÿßÿ® ÿ±Ÿàÿßÿ®ÿ∑ ÿ®€åÿ≤ÿßŸÜÿ≥ Ÿà ÿß€åÿ±ÿßŸÜ,Ravebet Bizanso Iran,3.62,1988,312,1393,-1,Ÿàÿ≤€åÿ±€å,ÿ¥ŸàŸÖ€åÿ≤,1,ŸÅÿßÿ±ÿ≥€å,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
3,67445,67445,978-9640363584,⁄©ÿ™ÿßÿ® ÿ±Ÿà€å⁄©ÿ±ÿØŸáÿß€å €åÿßÿØ⁄Ø€åÿ±€å ŸÜÿ∏ÿ±€åŸá Ÿà ⁄©ÿßÿ±ÿ®ÿ≥ÿ™,Approaches to Learning,3.07,1967,362,1399,2008,Ÿàÿ≤€åÿ±€å,ÿ¥ŸàŸÖ€åÿ≤,5,ŸÅÿßÿ±ÿ≥€å,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,-1
4,115076,115076,978-6001176869,⁄©ÿ™ÿßÿ® ÿØÿ±ÿ≥ Ÿáÿß€å€å ÿßÿ≤ ÿ≤ŸÜÿØ⁄Ø€å ŸáÿßŸà⁄©€åŸÜ⁄Ø,How to Think Like Stephen Hawking,3.86,42,176,1402,2016,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,1,ŸÅÿßÿ±ÿ≥€å,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
5,25671,25671,978-9643124694,⁄©ÿ™ÿßÿ® ŸÖÿØ€åÿ±€åÿ™ ŸÖŸÜÿßÿ®ÿπ ÿßŸÜÿ≥ÿßŸÜ€å,Human resources management,3.8,66,496,1398,-1,Ÿàÿ≤€åÿ±€å,ÿ¥ŸàŸÖ€åÿ≤,13,ŸÅÿßÿ±ÿ≥€å,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,60815,60815,978-9642739295,⁄©ÿ™ÿßÿ® ÿ±Ÿáÿß€å€å ÿßÿ≤ ÿ¥ÿßŸàÿ¥ŸÜ⁄©,The Shawshank Redemption,3.19,1427,135,1392,1996,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,1,ŸÅÿßÿ±ÿ≥€å,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,-1
220,34605,34605,978-6001212055,⁄©ÿ™ÿßÿ® ÿ≤ŸÜÿØ⁄Ø€å ŸÖ€å⁄©ŸÑ ÿ¢ŸÜ⁄ò,Life of Michelangelo,3.15,81,175,1400,1907,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,8,ŸÅÿßÿ±ÿ≥€å,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,-1
221,43592,43592,978-9643499587,⁄©ÿ™ÿßÿ® ÿßÿ™ŸÑ ŸÖÿ™ŸÑ ÿß€åŸÜ Ÿæÿ≥ÿ±⁄©,Atal matal in pesarak,3.75,30,32,1399,-1,ÿÆÿ¥ÿ™€å,ÿ≤ÿ±⁄©Ÿàÿ®,7,ŸÅÿßÿ±ÿ≥€å,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,-1
222,108426,108426,978-9641933687,⁄©ÿ™ÿßÿ® ⁄©ÿ™ŸÖÿßŸÜ,Ketmaan,3.63,1781,990,1400,-1,ÿ±ŸÇÿπ€å,ÿ¥ŸàŸÖ€åÿ≤,1,ŸÅÿßÿ±ÿ≥€å,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1


In [28]:
table_of_cover_type = pd.DataFrame(tableOfData['cover_material'].drop_duplicates())
table_of_cover_type = table_of_cover_type[table_of_cover_type['cover_material'] != -1].reset_index(drop=True)
table_of_cover_type.to_csv('./cover_type.csv', encoding='utf-8')
table_of_cover_type

Unnamed: 0,cover_material
0,ÿ¥ŸàŸÖ€åÿ≤
1,ÿ¨ŸÑÿØ ÿ≥ÿÆÿ™
2,ÿ≥ŸÑŸÅŸàŸÜ€å
3,ÿ¨ŸÑÿØ ŸÜÿ±ŸÖ
4,ÿ≤ÿ±⁄©Ÿàÿ®


In [29]:
table_of_format = pd.DataFrame(tableOfData['size'].drop_duplicates())
table_of_format = table_of_format[table_of_format['size'] != -1].reset_index(drop=True)
table_of_format.to_csv('./format.csv', encoding='utf-8')
table_of_format

Unnamed: 0,size
0,ÿ±ŸÇÿπ€å
1,ÿ±ÿ≠ŸÑ€å
2,Ÿàÿ≤€åÿ±€å
3,ÿ¨€åÿ®€å
4,ŸæÿßŸÑÿ™Ÿà€å€å
5,ÿÆÿ¥ÿ™€å


In [30]:
def convert_size_to_int(size):
    try:
        return table_of_format.index[table_of_format['size'] == str(size)].to_list()[0]
    except:
        return -1


tableOfData['size'] = tableOfData['size'].apply(convert_size_to_int)

In [31]:
def convert_cover_type_to_int(material):
    try:
        return table_of_cover_type.index[table_of_cover_type['cover_material'] == str(material)].to_list()[0]
    except:
        return -1


tableOfData['cover_material'] = tableOfData['cover_material'].apply(convert_cover_type_to_int)

In [32]:

tableOfData.to_csv("bookData.csv", index=False, encoding='utf-8')

In [33]:
tableOfData

Unnamed: 0,site_index,code,Isbn,fa_title,en_title,score,publisher_id,pages,publication_per_date,publication_ad_date,size,cover_material,print_series,language,earliest_send_time,presence,paper_type
1,2,7204,9782000547822,⁄©ÿ™ÿßÿ® ŸÜÿ±⁄Øÿ≥ + ÿπÿ¥ŸÇ,Narcisuss + Love,3.21,51,228,1395,2002,0,0,1,ŸÅÿßÿ±ÿ≥€å,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,-1
2,3,105144,978-6225239210,⁄©ÿ™ÿßÿ® ⁄Ü⁄ØŸàŸÜŸá ÿ≠ÿßŸÑ ÿ®Ÿáÿ™ÿ±€å Ÿà ŸæŸàŸÑ ÿ®€åÿ¥ÿ™ÿ±€å ÿØÿßÿ¥ÿ™Ÿá ÿ®ÿßÿ¥€åŸÖ,12 Power Principles for Success,3.09,2628,197,1401,2019,0,0,1,ŸÅÿßÿ±ÿ≥€å,6 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
3,4,16787,978-2000111603,⁄©ÿ™ÿßÿ® ÿØÿ≥ÿ™Ÿàÿ± ŸÖŸÇÿØŸÖÿßÿ™€å Ÿà€åŸàŸÑŸÜ ŸáŸÜÿ±ÿ≥ÿ™ÿßŸÜ ŸÖŸàÿ≥€åŸÇ€å,Violin,3.49,1575,51,1388,-1,1,0,7,ŸÅÿßÿ±ÿ≥€å,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
4,5,115076,978-6001176869,⁄©ÿ™ÿßÿ® ÿØÿ±ÿ≥ Ÿáÿß€å€å ÿßÿ≤ ÿ≤ŸÜÿØ⁄Ø€å ŸáÿßŸà⁄©€åŸÜ⁄Ø,How to Think Like Stephen Hawking,3.86,42,176,1402,2016,0,0,1,ŸÅÿßÿ±ÿ≥€å,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
5,6,8108,9786002961136,⁄©ÿ™ÿßÿ® €å⁄© ÿ±Ÿàÿ≤ ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å ÿ¥ŸÜÿß ŸÇŸàÿ±ÿ®ÿßÿ∫Ÿá,A perfect day to swim a frog,3.06,30,92,1393,2014,0,0,1,ŸÅÿßÿ±ÿ≥€å,8 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,193,34500,978-6004366670,⁄©ÿ™ÿßÿ® ÿ¥Ÿáÿ± ÿßŸÜÿØŸàŸá,A City of Sadness,3.22,81,135,1399,2002,0,0,2,ŸÅÿßÿ±ÿ≥€å,5 ŸÖŸáÿ±,ŸÖŸàÿ¨ŸàÿØ,-1
220,194,60686,978-6006445298,⁄©ÿ™ÿßÿ® ŸàŸÇŸÅŸá €å ÿ™ÿßÿ±€å⁄©,The dark interval,3.68,1265,120,1400,1920,0,0,1,ŸÅÿßÿ±ÿ≥€å,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,-1
221,195,34605,978-6001212055,⁄©ÿ™ÿßÿ® ÿ≤ŸÜÿØ⁄Ø€å ŸÖ€å⁄©ŸÑ ÿ¢ŸÜ⁄ò,Life of Michelangelo,3.15,81,175,1400,1907,0,0,8,ŸÅÿßÿ±ÿ≥€å,---,ÿ®Ÿá ÿ≤ŸàÿØ€å üôÑ,-1
222,196,60815,978-9642739295,⁄©ÿ™ÿßÿ® ÿ±Ÿáÿß€å€å ÿßÿ≤ ÿ¥ÿßŸàÿ¥ŸÜ⁄©,The Shawshank Redemption,3.19,1427,135,1392,1996,0,0,1,ŸÅÿßÿ±ÿ≥€å,---,ÿ™ŸÖÿßŸÖ ÿ¥ÿØ ÿå ÿßŸÖÿß ŸÖ€åÿßÿ±€åŸÖÿ¥ üòè,-1


In [54]:
tableOfSummaryData = pd.DataFrame(site_summary_data_list, columns=['site_index', 'summary']).drop_duplicates(
    subset=['site_index', 'summary'])
tableOfSummaryData = tableOfSummaryData[tableOfSummaryData.notnull().all(axis=1)]
tableOfSummaryData

Unnamed: 0,site_index,summary
0,72401,ÿ≥ÿßÿπÿ™ ÿÆŸàÿ¥ ŸÜÿßŸÖ ŸÖÿ¨ŸÖŸàÿπŸá ÿ™ŸÑŸà€åÿ≤€åŸàŸÜ€å ÿßÿ≥ÿ™ ⁄©Ÿá ÿØÿ± ÿ≥ÿßŸÑ €±€≥...
1,8108,ŸÖÿ¨ŸÖŸàÿπŸá ÿØÿßÿ≥ÿ™ÿßŸÜ ¬´€å⁄© ÿ±Ÿàÿ≤ ŸÖŸÜÿßÿ≥ÿ® ÿ®ÿ±ÿß€å ÿ¥ŸÜÿß€å ŸÇŸàÿ±ÿ®ÿßÿ∫Ÿá¬ª...
2,67734,⁄©ÿ™ÿßÿ® ÿ≠ÿßÿ∂ÿ±ÿå Ÿæ⁄òŸàŸáÿ¥€å ÿßÿ≥ÿ™ ⁄©Ÿá ÿØÿ± ÿ¢ŸÜ ŸÖÿ≥ÿ¶ŸÑŸá ÿßÿµŸÑ€å ÿß€åŸÜ ...
3,67445,ŸÜÿ∏ÿ±€åŸá‚ÄåŸáÿß€å €åÿßÿØ⁄Ø€åÿ±€å ÿßÿ≤ ÿØÿ±Ÿàÿ≥ ŸÖŸáŸÖ ÿØÿ± ÿØŸàÿ±Ÿá‚ÄåŸáÿß€å ⁄©ÿßÿ±ÿ¥...
4,115076,⁄©ÿ™ÿßÿ® ¬´ÿØÿ±ÿ≥‚ÄåŸáÿß€å€å ÿßÿ≤ ÿ≤ŸÜÿØ⁄Ø€å ÿßÿ≥ÿ™€åŸàŸÜ ŸáÿßŸà⁄©€åŸÜ⁄Ø¬ª ÿ®ŸÜÿß ÿØÿß...
...,...,...
194,252,ÿπÿ¥ŸÇ Ÿá€å⁄Ü ŸàŸÇÿ™ ÿ¢ÿ≥ÿßŸÜ ŸÜ€åÿ≥ÿ™-ÿ®Ÿá ÿÆÿµŸàÿµ ÿ≤ŸÖÿßŸÜ€å ⁄©Ÿá ŸÜÿßŸÖÿ≤ÿØÿ™ÿß...
195,60815,⁄©ÿ™ÿßÿ® ÿ≠ÿßÿ∂ÿ±ÿå ŸÖÿ™ŸÜ ŸÅ€åŸÑŸÖ‚ÄåŸÜÿßŸÖŸá ¬´ÿ±Ÿáÿß€å€å ÿßÿ≤ ÿ¥ÿßŸàÿ¥ŸÜ⁄Ø¬ª ÿßÿ≥ÿ™...
196,34605,ÿ≤ŸÜÿØ⁄Ø€å ŸÖ€å⁄©ŸÑ ÿ¢ŸÜ⁄ò €å⁄©€å ÿßÿ≤ ÿ®ÿßÿ±ÿ≤ÿ™ÿ±€åŸÜ ŸÜŸÖŸàŸÜŸá Ÿáÿß€å ÿ™ÿ£ÿ´€åÿ±...
197,43592,ÿ™ÿ±ÿßŸÜŸá Ÿáÿß€å ÿß€åŸÜ ŸÖÿ¨ŸÖŸàÿπŸá ÿ®ÿß ÿ¨ŸÑÿ® ÿ™Ÿàÿ¨Ÿá ÿÆÿ±ÿØÿ≥ÿßŸÑÿßŸÜ ÿ®Ÿá ŸÖ...


In [35]:
tableOfSummaryData.to_csv("BookSummaryData.csv", index=False, encoding='utf-8')

In [36]:
tableOfSiteTagsData = pd.DataFrame(site_tags_data_list, columns=['site_index', 'tag']) \
    .drop_duplicates(subset=['site_index', 'tag'])
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,ÿµŸàÿ™€å Ÿà ÿ™ÿµŸà€åÿ±€å
1,1,ÿ≥ÿ±€åÿßŸÑ
2,2,ÿßÿØÿ®€åÿßÿ™ ÿØÿßÿ≥ÿ™ÿßŸÜ€å
3,2,ÿßÿØÿ®€åÿßÿ™ ŸÖÿπÿßÿµÿ±
4,2,ÿØŸáŸá 2000 ŸÖ€åŸÑÿßÿØ€å
...,...,...
1005,197,ÿßÿØÿ®€åÿßÿ™ ÿØÿßÿ≥ÿ™ÿßŸÜ€å
1006,197,ÿßÿØÿ®€åÿßÿ™ ÿß€åÿ±ÿßŸÜ
1007,197,⁄©ÿ™ÿßÿ® ⁄©ŸàÿØ⁄©
1008,197,ÿ¥ÿπÿ± ⁄©ŸàÿØ⁄©


In [37]:
table_of_tag = pd.DataFrame(tableOfSiteTagsData['tag']).drop_duplicates(subset=['tag']).reset_index(drop=True)
table_of_tag.to_csv('./tag.csv', encoding='utf-8')
table_of_tag

Unnamed: 0,tag
0,ÿµŸàÿ™€å Ÿà ÿ™ÿµŸà€åÿ±€å
1,ÿ≥ÿ±€åÿßŸÑ
2,ÿßÿØÿ®€åÿßÿ™ ÿØÿßÿ≥ÿ™ÿßŸÜ€å
3,ÿßÿØÿ®€åÿßÿ™ ŸÖÿπÿßÿµÿ±
4,ÿØŸáŸá 2000 ŸÖ€åŸÑÿßÿØ€å
...,...
199,ÿ¨ÿß€åÿ≤Ÿá €å ÿØÿßÿ≥ÿ™ÿßŸÜ ÿ¨ŸÜÿß€å€å ÿ¢ŸÑŸÖÿßŸÜ
200,ÿØÿßÿ≥ÿ™ÿßŸÜ ŸÖÿßÿ¨ÿ±ÿß€å€å
201,ŸÜÿßÿØÿßÿ≥ÿ™ÿßŸÜ
202,ÿØŸáŸá 1900 ŸÖ€åŸÑÿßÿØ€å


In [38]:
def convert_tag_to_int(tag):
    try:
        return table_of_tag.index[table_of_tag['tag'] == str(tag)].to_list()[0]
    except:
        return -1


tableOfSiteTagsData['tag'] = tableOfSiteTagsData['tag'].apply(convert_tag_to_int)

In [39]:
tableOfSiteTagsData

Unnamed: 0,site_index,tag
0,1,0
1,1,1
2,2,2
3,2,3
4,2,4
...,...,...
1005,197,2
1006,197,7
1007,197,21
1008,197,105


In [40]:
tableOfSiteTagsData.to_csv('bookTagsData.csv', index=False, encoding='utf-8')

In [41]:
table_of_publisher = pd.DataFrame(publishers_data_list).drop_duplicates(subset=['id', 'name'])
table_of_publisher.to_csv('./publisher.csv', index=False, encoding='utf-8')
table_of_publisher

Unnamed: 0,id,name
0,1470,ÿ≥ÿ±Ÿàÿ¥
1,51,⁄©ÿ™ÿßÿ®ÿ≥ÿ±ÿß€å ÿ™ŸÜÿØ€åÿ≥
2,2628,ÿßŸÖ€åÿØ ÿ≥ÿÆŸÜ
3,1575,ÿµŸÅ€å ÿπŸÑ€åÿ¥ÿßŸá
4,42,ÿ≥ÿ®ÿ≤ÿßŸÜ
...,...,...
216,93,ŸÖŸàÿ≥ÿ≥Ÿá ŸÅÿ±ŸáŸÜ⁄Ø€å ŸáŸÜÿ±€å ÿ¨ŸáÿßŸÜ ⁄©ÿ™ÿßÿ®
218,1186,ÿ≠Ÿàÿ∂ ŸÜŸÇÿ±Ÿá
219,81,ÿπŸÑŸÖ€å Ÿà ŸÅÿ±ŸáŸÜ⁄Ø€å
220,1265,ÿ≠ÿ±ŸÅŸá ŸÜŸà€åÿ≥ŸÜÿØŸá


In [42]:
books_writers_data_list = list(filter(bool, books_writers_data_list))
table_of_writer = pd.DataFrame(books_writers_data_list).drop_duplicates(subset=['book_id', 'writer_id'])
#drop rows which both book_id and writer_id is -1
table_of_writer = table_of_writer[(table_of_writer['book_id'] != -1) & (table_of_writer['writer_id'] != -1)]
table_of_writer.to_csv('./writer.csv', index=False, encoding='utf-8')
table_of_writer

Unnamed: 0,book_id,writer_id
0,7204,4985
1,105144,26943
2,16787,9341
3,115076,11871
4,8108,4943
...,...,...
218,34500,19936
219,60686,681
220,34605,85
221,60815,33432


In [43]:
table_of_writer_page = pd.DataFrame(writer_page_data_list).drop_duplicates(subset=['id', 'name', 'link'])
table_of_writer_page.to_csv('./writer_page.csv', index=False, encoding='utf-8')
table_of_writer_page

Unnamed: 0,id,name,link
0,4985,ÿ®ŸÜŸÅÿ¥Ÿá ÿ≠ÿ¨ÿßÿ≤€å,/profile/4985-banafshe-hejazi
1,26943,ÿ®ÿßÿ® Ÿæÿ±ÿß⁄©ÿ™Ÿàÿ±,/profile/26943-bob-proctor
2,9341,ÿ±Ÿàÿ≠ ÿßŸÑŸÑŸá ÿÆÿßŸÑŸÇ€å,/profile/9341-ruhollah-khaleqi
3,11871,ÿØÿßŸÜ€åŸÑ ÿßÿ≥ŸÖ€åÿ™,/profile/11871-daniel-smith
4,4943,ÿ±ÿ∂ÿß ÿ≤ŸÜ⁄Ø€å ÿ¢ÿ®ÿßÿØ€å,/profile/4943-%d8%b1%d8%b6%d8%a7-%d8%b2%d9%86%...
...,...,...,...
218,19936,ÿ®ÿ±ŸÜ€åÿ≥ ÿ±ŸÜŸà,/profile/19936-b%c3%a9r%c3%a9nice-reynaud
219,681,ÿ±ÿß€åŸÜÿ± ŸÖÿßÿ±€åÿß ÿ±€åŸÑ⁄©Ÿá,/profile/681-rainer-maria-rilke
220,85,ÿ±ŸàŸÖŸÜ ÿ±ŸàŸÑÿßŸÜ,/profile/85-romain-rolland
221,33432,ŸÅÿ±ÿßŸÜ⁄© ÿØÿßÿ±ÿßÿ®ÿßŸÜÿ™,/profile/33432-frank-darabont


In [44]:
table_of_translator = pd.DataFrame(books_translators_data_list).drop_duplicates(subset=['book_id', 'translator_id'])
table_of_translator = table_of_translator[
    (table_of_translator['book_id'] != -1) & (table_of_translator['translator_id'] != -1)]
table_of_translator.to_csv('./translator.csv', index=False, encoding='utf-8')
table_of_translator


Unnamed: 0,book_id,translator_id
0,105144,30543
1,115076,33130
2,25277,14441
3,19941,2634
4,67445,24867
...,...,...
108,57694,5736
109,34500,19935
110,60686,8031
111,34605,4459


In [45]:
table_of_translator_page = pd.DataFrame(translator_page_data_list).drop_duplicates(subset=['id', 'name', 'link'])
table_of_translator_page.to_csv('translator_page.csv', index=False, encoding='utf-8')
table_of_translator_page

Unnamed: 0,id,name,link
0,30543,ŸÅÿßÿ∑ŸÖŸá ÿ±ÿ≠€åŸÖ€å,/profile/30543-%d9%81%d8%a7%d8%b7%d9%85%d9%87-...
1,33130,ŸÅÿßÿ±€åÿß ÿ¨ŸÜ€åÿØ€å,/profile/33130-%d9%81%d8%a7%d8%b1%db%8c%d8%a7-...
2,14441,ŸÖŸáÿ±ÿØÿßÿØ ÿ™Ÿà€åÿ≥ÿ±⁄©ÿßŸÜ€å,/profile/14441-mehrdad-tuyserkani
3,2634,⁄©ÿ™ÿß€åŸàŸÜ ÿ≥ŸÑÿ∑ÿßŸÜ€å,/profile/2634-katayoun-soltani
4,24867,ÿßŸÑŸáŸá ÿ≠ÿ¨ÿßÿ≤€å,/profile/24867-%d8%a7%d9%84%d9%87%d9%87-%d8%ad...
...,...,...,...
108,5736,ŸÖŸáŸÜÿßÿ≤ ÿß€åŸÑÿØÿ±ŸÖ€å,/profile/5736-%d9%85%d9%87%d9%86%d8%a7%d8%b2-%...
109,19935,ÿ¢ÿ≤ÿßÿØŸá ÿ¨ÿπŸÅÿ±€å,/profile/19935-%d8%a2%d8%b2%d8%a7%d8%af%d9%87-...
110,8031,Ÿæ€åŸÖÿßŸÜ ⁄ÜŸáÿ±ÿßÿ≤€å,/profile/8031-%d9%be%db%8c%d9%85%d8%a7%d9%86-%...
111,4459,ÿßÿ≥ŸÖÿßÿπ€åŸÑ ÿ≥ÿπÿßÿØÿ™,/profile/4459-%d8%a7%d8%b3%d9%85%d8%a7%d8%b9%d...


In [46]:
table_of_price_history = pd.DataFrame(price_history_data_list).drop_duplicates(
    subset=['book_id', 'price', 'discount', 'date'])
table_of_price_history = table_of_price_history[table_of_price_history.book_id != -1]
table_of_price_history.to_csv('./price-history.csv', index=False, encoding='utf-8')
table_of_price_history

Unnamed: 0,book_id,price,discount,date
1,7204,15000,0,2023-09-25 21:13:37.660574
2,105144,100000,30,2023-09-25 21:13:38.010721
3,16787,75000,25,2023-09-25 21:13:38.082829
4,115076,128000,20,2023-09-25 21:13:38.170579
5,8108,50000,30,2023-09-25 21:13:38.231573
...,...,...,...,...
219,34500,74000,25,2023-09-25 21:13:51.390872
220,60686,48000,0,2023-09-25 21:13:51.420872
221,34605,32000,0,2023-09-25 21:13:51.475578
222,60815,8000,0,2023-09-25 21:13:51.502665


In [47]:
book_veneration_data_list = list(filter(bool, book_veneration_data_list))
table_of_book_veneration = pd.DataFrame(book_veneration_data_list).drop_duplicates(
    subset=['site_index', 'English_Quote', 'Persian_Quote', 'Prise_Writer'])
table_of_book_veneration.to_csv('./book_veneration.csv', index=False, encoding='utf-8')
table_of_book_veneration

Unnamed: 0,site_index,English_Quote,Persian_Quote,Prise_Writer
0,25,One of Faulkner‚Äôs comic masterpieces.,ÿßÿ≤ ÿ¥ÿßŸá⁄©ÿßÿ±Ÿáÿß€å ⁄©ŸÖ€å⁄© ŸÅÿß⁄©ŸÜÿ±,barnes and noble
1,57,Convincing and compelling.,ÿ®ÿßŸàÿ±Ÿæÿ∞€åÿ± Ÿà ŸÖŸá€åÿ¨.,School Library Journal
2,57,"A highly imaginative, absolutely terrific firs...",€å⁄© ÿ±ŸÖÿßŸÜ ŸÜÿÆÿ≥ÿ™ ŸÅŸàŸÇ ÿßŸÑÿπÿßÿØŸá ÿÆ€åÿßŸÑ Ÿæÿ±ÿØÿßÿ≤ÿßŸÜŸá Ÿà ÿ¥⁄Øÿ±ŸÅ.,Barnes & Noble
3,57,"An exciting, clever read.",ÿØÿßÿ≥ÿ™ÿßŸÜ€å Ÿá€åÿ¨ÿßŸÜ ÿßŸÜ⁄Ø€åÿ≤ Ÿà ŸáŸàÿ¥ŸÖŸÜÿØÿßŸÜŸá.,Booktopia
4,98,An engrossing forecast.,€å⁄© Ÿæ€åÿ¥ ÿ®€åŸÜ€å Ÿá€åÿ¨ÿßŸÜ ÿßŸÜ⁄Ø€åÿ≤.,Publishers Weekly
5,98,"Original, accessible, and provocative.",ÿ®ÿØ€åÿπÿå ŸÇÿßÿ®ŸÑ ŸÅŸáŸÖ Ÿà ÿ®ÿ±ÿßŸÜ⁄Ø€åÿ≤ÿßŸÜŸÜÿØŸá.,Science
6,98,A compelling guide to the challenges and choic...,ÿ±ÿßŸáŸÜŸÖÿß€å€å ÿ¨ÿ∞ÿßÿ® ÿ®ÿ±ÿß€å ⁄ÜÿßŸÑÿ¥ Ÿáÿß Ÿà ÿßŸÜÿ™ÿÆÿßÿ® Ÿáÿß€å Ÿæ€åÿ¥ ÿ±Ÿà...,Elon Musk
7,108,This stunning work showcases Krauss's consiste...,ÿß€åŸÜ ÿßÿ´ÿ± ÿÆ€åÿ±Ÿá ⁄©ŸÜŸÜÿØŸáÿå ŸÜÿ¥ÿßŸÜ ÿØŸáŸÜÿØŸá €å ÿßÿ≥ÿ™ÿπÿØÿßÿØ ŸáŸÖ€åÿ¥⁄Ø...,Publishers Weekly
8,108,"Masterful, evocative and moving.",ÿßÿ≥ÿ™ÿßÿØÿßŸÜŸáÿå ÿßÿ≠ÿ≥ÿßÿ≥ ÿ®ÿ±ÿßŸÜ⁄Ø€åÿ≤ Ÿà ÿ™⁄©ÿßŸÜ ÿØŸáŸÜÿØŸá.,NPR
9,108,A meditation on memory and loss.,ÿ™ÿ£ŸÖŸÑ€å ÿ®ÿ± ÿÆÿßÿ∑ÿ±Ÿá Ÿà ŸÅŸÇÿØÿßŸÜ.,Los Angeles Times


In [48]:
table_of_award = pd.DataFrame(site_award_data_list).drop_duplicates(subset=['site_index', 'award'])
table_of_award.to_csv('./award.csv', index=False, encoding='utf-8')
table_of_award

Unnamed: 0,site_index,award
0,25,ÿ®ÿ±ŸÜÿØŸá €å ÿ¨ÿß€åÿ≤Ÿá €å ŸæŸàŸÑ€åÿ™ÿ≤ÿ± ÿ≥ÿßŸÑ €±€π€∂€≥
1,43,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ŸæŸàŸÑ€åÿ™ÿ≤ÿ±
2,43,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá €å ŸÜŸÖÿß€åÿ¥ŸÜÿßŸÖŸá €å ÿ≠ŸÑŸÇŸá €å ŸÖŸÜÿ™ŸÇÿØ€åŸÜ ŸÜ€åŸà...
3,43,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá Tony ÿ≥ÿßŸÑ 1987
4,57,ŸÜÿßŸÖÿ≤ÿØ ÿ¨ÿß€åÿ≤Ÿá ⁄©ÿ™ÿßÿ® ŸÜÿßÿ¥ÿ± ŸÖÿ≥ÿ™ŸÇŸÑ ÿ≥ÿßŸÑ 1999
5,57,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ÿÆŸàÿßŸÜŸÜÿØŸá ⁄Øÿ±ŸÜÿØ ⁄©ŸÜ€åŸàŸÜ ÿ≥ÿßŸÑ 1998
6,57,ŸÜÿßŸÖÿ≤ÿØ ŸÖÿØÿßŸÑ ÿÆŸàÿßŸÜŸÜÿØ⁄ØÿßŸÜ ÿ¨ŸàÿßŸÜ ⁄©ÿßŸÑ€åŸÅÿ±ŸÜ€åÿß ÿ≥ÿßŸÑ 1998
7,57,ÿ®ÿ±ŸÜÿØŸá ÿ¨ÿß€åÿ≤Ÿá ÿ≥⁄©Ÿà€åÿß ÿßŸà⁄©ŸÑÿßŸáŸÖÿß ÿ≥ÿßŸÑ 1998
8,98,ÿßÿ≤ Ÿæÿ±ŸÅÿ±Ÿàÿ¥ ÿ™ÿ±€åŸÜ ⁄©ÿ™ÿßÿ® Ÿáÿß€å ŸÜ€åŸà€åŸàÿ±⁄© ÿ™ÿß€åŸÖÿ≤
9,100,ÿ±ŸàÿØ⁄©€å ŸÖÿ¥ŸáŸàÿ± ÿ®Ÿá ÿßÿ≥ÿ™ÿßÿØ ÿ¥ÿßÿπÿ±ÿßŸÜÿå ŸÜÿÆÿ≥ÿ™€åŸÜ ÿ¥ÿßÿπÿ± ŸÖÿ¥Ÿá...


for extra points

In [49]:
def get_person_info(soup):
    info = ''
    try:
        info = soup.select('h5')[0].text.strip()
    except:
        info = None
    return info

In [50]:
translator_df = pd.read_csv('./translator_page.csv')
translator_url = translator_df['link'].to_list()
translators_info = []
for i in range(len(translator_url)):
    soup = get_soup('https://www.iranketab.ir' + translator_url[i])
    t_info = get_person_info(soup)
    translators_info.append(t_info)
translator_df['information'] = translators_info
translator_df.drop(columns=['link'], inplace=True)
translator_df.to_csv('./translator_page.csv', index=False, encoding='utf-8')

In [112]:
translator_df

Unnamed: 0,id,name,information
0,33130,ŸÅÿßÿ±€åÿß ÿ¨ŸÜ€åÿØ€å,ŸÅÿßÿ±€åÿß ÿ¨ŸÜ€åÿØ€å ŸÖÿ™ÿ±ÿ¨ŸÖ ÿß€åÿ±ÿßŸÜ€å ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1349 ŸÖ€å ÿ®ÿßÿ¥ÿØ.
1,30543,ŸÅÿßÿ∑ŸÖŸá ÿ±ÿ≠€åŸÖ€å,ŸÅÿßÿ∑ŸÖŸá ÿ±ÿ≠€åŸÖ€å ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1366ÿå ŸÜŸà€åÿ≥ŸÜÿØŸá ÿß€åÿ±ÿßŸÜ€å ŸÖ€å ...
2,14441,ŸÖŸáÿ±ÿØÿßÿØ ÿ™Ÿà€åÿ≥ÿ±⁄©ÿßŸÜ€å,ŸÖŸáÿ±ÿØÿßÿØ ÿ™Ÿà€åÿ≥ÿ±⁄©ÿßŸÜ€å(ŸÖÿ™ŸàŸÑÿØ €≤€¥ ŸÖÿ±ÿØÿßÿØ €±€≥€¥€¥ ÿØÿ± ÿ¥Ÿáÿ± ÿ™Ÿá...
3,4702,ŸÖÿ±ÿ™ÿ∂€å ÿ´ÿßŸÇÿ® ŸÅÿ±,ŸÖÿ±ÿ™ÿ∂€å ÿ´ÿßŸÇÿ®‚ÄåŸÅÿ± (€π ŸÖÿ±ÿØÿßÿØ €±€≥€≤€± ÿ™Ÿáÿ±ÿßŸÜ - €±€± ÿØ€å €±€≥€π€±...
4,2634,⁄©ÿ™ÿß€åŸàŸÜ ÿ≥ŸÑÿ∑ÿßŸÜ€å,⁄©ÿ™ÿß€åŸàŸÜ ÿ≥ŸÑÿ∑ÿßŸÜ€å ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1335ÿå ŸÜŸà€åÿ≥ŸÜÿØŸá Ÿà ŸÖÿ™ÿ±ÿ¨ŸÖ ...
...,...,...,...
100,8031,Ÿæ€åŸÖÿßŸÜ ⁄ÜŸáÿ±ÿßÿ≤€å,
101,5736,ŸÖŸáŸÜÿßÿ≤ ÿß€åŸÑÿØÿ±ŸÖ€å,
102,27357,ÿ≠ŸÖ€åÿØÿ±ÿ∂ÿß ⁄Øÿ±ÿ¥ÿßÿ≥ÿ®€å,
103,4459,ÿßÿ≥ŸÖÿßÿπ€åŸÑ ÿ≥ÿπÿßÿØÿ™,


In [107]:
writer_df = pd.read_csv('./writer_page.csv')
writer_url = writer_df['link'].to_list().copy()
writers_info = []
for i in range(len(writer_url)):
    soup = get_soup('https://www.iranketab.ir' + writer_url[i])
    w_info = get_person_info(soup)
    writers_info.append(w_info)
writer_df['information'] = writers_info
writer_df.drop(columns=['link'], inplace=True)
writer_df.to_csv('./writer_page.csv', index=False, encoding='utf-8')

In [111]:
writer_df

Unnamed: 0,id,name,information
0,11871,ÿØÿßŸÜ€åŸÑ ÿßÿ≥ŸÖ€åÿ™,ÿØÿßŸÜ€åŸÑ ÿßÿ≥ŸÖ€åÿ™ ŸÜŸà€åÿ≥ŸÜÿØŸá Ÿà Ÿà€åÿ±ÿßÿ≥ÿ™ÿßÿ± ÿ∫€åÿ± ÿØÿßÿ≥ÿ™ÿßŸÜ€å ÿßÿ≥ÿ™...
1,31542,ÿ¨ŸàÿßÿØ ÿ™ÿ±ÿ¥€åÿ≤€å,ÿ¨ŸàÿßÿØ ÿ™ÿ±ÿ¥€åÿ≤€å ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1364ÿå ŸÜŸà€åÿ≥ŸÜÿØŸá ÿß€åÿ±ÿßŸÜ€å ŸÖ€å ...
2,26943,ÿ®ÿßÿ® Ÿæÿ±ÿß⁄©ÿ™Ÿàÿ±,ÿ®ÿßÿ® Ÿæÿ±ÿß⁄©ÿ™Ÿàÿ± (Bob Proctor) (ŸÖÿ™ŸàŸÑÿØ €µ ⁄òŸàÿ¶€åŸá €±€π€≥€¥)...
3,14319,ÿ™ÿ±€å ÿØ€åÿ±€å,"ÿ™ÿ±ŸÜÿ≥ Ÿà€åŸÑ€åÿßŸÖ ""ÿ™ÿ±€å"" ÿØ€åÿ±€å (William Terence ""Terry..."
4,17747,ÿ≠ÿ≥ŸÜ ⁄Üÿßÿ®⁄©,ÿ≠ÿ≥ŸÜ ⁄Üÿßÿ®⁄© ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1351ÿå ŸÜŸà€åÿ≥ŸÜÿØŸá ÿß€åÿ±ÿßŸÜ€å¬†ÿ®ÿß ÿ≠Ÿàÿ≤...
...,...,...,...
183,31297,ŸÖÿ±€å ÿ¢ŸÖÿßÿ™Ÿà,ŸÖÿ±€å ÿ¢ŸÖÿßÿ™Ÿà (ŸÖÿ™ŸàŸÑÿØ 3 ⁄òÿßŸÜŸà€åŸá 1961 ÿå ÿ®ŸÑŸà€åÿØÿ± ÿå ÿß€åŸÑ€å...
184,33432,ŸÅÿ±ÿßŸÜ⁄© ÿØÿßÿ±ÿßÿ®ÿßŸÜÿ™,ŸÅÿ±ÿßŸÜ⁄© ÿØÿßÿ±ÿßÿ®ŸàŸÜÿ™ (Frank Darabont) ⁄©ÿßÿ±⁄Øÿ±ÿØÿßŸÜÿå ŸÅ€åŸÑŸÖ...
185,24833,ÿ±Ÿáÿß ÿ≤ÿßÿØŸÖŸáÿ±,ÿ±Ÿáÿß ÿ≤ÿßÿØŸÖŸáÿ± ŸÖÿ™ŸàŸÑÿØ ÿ≥ÿßŸÑ 1350 ÿå ŸÜŸà€åÿ≥ŸÜÿØŸá €å ⁄©ÿ™ÿßÿ® Ÿáÿß€å...
186,85,ÿ±ŸàŸÖŸÜ ÿ±ŸàŸÑÿßŸÜ,ÿ±ŸàŸÖŸÜ ÿ±ŸàŸÑÿßŸÜÿå ÿ≤ÿßÿØŸá €å €≤€∂ ⁄òÿßŸÜŸà€åŸá €±€∏€∂€∂ÿå ÿØÿ±⁄Øÿ∞ÿ¥ÿ™Ÿá €å €≥...
