# Ficbook metadata scraper
This notebook scrapes metadata from the Russian fanfic site [Ficbook.net](https://ficbook.net/). To make it work, put the URL for a particular fandom page (everything up to and including the page parameter, i.e. `?p=` or `&p=`) in as the *ScraperStem* value below, and set the page range so that it covers page 1 through the final page of the paginated results for that fandom. Note that Python's `range(1, N)` stops at page N-1, so the end value must be one more than the final page number.

In [None]:
#Install selenium for Python-based browser control.
#Running pip through sys.executable installs the package into the same Python
#environment that this notebook's kernel is running in.
import sys
!{sys.executable} -m pip install selenium

In [None]:
#Install undetected-chromedriver to reduce the chance of being identified as a bot.
#Running pip through sys.executable installs the package into the same Python
#environment that this notebook's kernel is running in.
import sys
!{sys.executable} -m pip install undetected-chromedriver

In [None]:
#On macOS you may need to run a command in Terminal before chromedriver will work.
#Removing the quarantine attribute from the chromedriver binary is what worked for me:
#xattr -d com.apple.quarantine /usr/local/bin/chromedriver

In [1]:
#Import libraries
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from random import randint
import time
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re

In [None]:
#Icon class names that carry no useful information on their own
pointlessicons = ['badge-with-icon', 'badge-secondary']

#Empty Pandas dataframe that will receive one row of metadata per fic
russianfanfic = pd.DataFrame(
    columns=[
        "Title", "Storylink", "AuthName", "AuthID", "Shiptype",
        "Rating", "Status", "Likes", "Paid", "Fandom",
        "Charships", "Length", "Postdate", "Tags", "Blurb",
    ]
)

#Start a Chrome session through undetected-chromedriver
driver = uc.Chrome()


Set the ScraperStem below to the first part of the URL for the fandom you're interested in, along with the page range.

In [22]:
#Base URL for a fandom, up to `?p=` which defines which page
ScraperStem = "https://ficbook.net/fanfiction/books/harri_potter?p="
#First and last results pages to scrape, both inclusive.
#Set LAST_PAGE to the highest-number page for the fandom.
FIRST_PAGE = 1
LAST_PAGE = 52


def _node_text(node):
    """Return the text of a Beautiful Soup node, or '' if the node is missing."""
    return node.text if node is not None else ''


def _badge_text(container):
    """Return the text of the 'badge-text' span inside container, or '' if absent."""
    if container is None:
        return ''
    return _node_text(container.find('span', {'class': 'badge-text'}))


def _scrape_fic(fic):
    """Extract the metadata for one fic from its listing container.

    fic is the Beautiful Soup element for a single 'js-toggle-description' div.
    Returns a dict holding one value per metadata field. Every value is worked
    out from this fic alone: anything that is missing from the page is recorded
    as '' instead of raising or silently reusing the previous fic's value.
    """
    #Title container, which also holds the story link
    titlebox = fic.find('h3', {'class': 'fanfic-inline-title'})
    title = _node_text(titlebox)
    titlelink = titlebox.a if titlebox is not None else None
    storylink = titlelink.get('href', '') if titlelink is not None else ''

    #Ship type is the badge text inside the 'direction' container
    shiptype = _badge_text(fic.find('div', {'class': 'direction'}))

    #Status is the 'badge-status...' class name found on one of the icon spans
    textstatus = ''
    for icon in fic.find_all("span", {'class': 'badge-with-icon'}):
        for iconvalue in icon.get("class", []):
            if iconvalue not in pointlessicons and 'badge-status' in iconvalue:
                textstatus = iconvalue

    #Rating is the badge text of the <strong> icon
    rating = _badge_text(fic.find("strong", {'class': 'badge-with-icon'}))

    #Number of likes, empty if the fic has no like badge
    likenumber = _badge_text(fic.find("span", {"class": 'badge-like'}))

    #A translate badge means the fic is a translation
    if fic.find('span', {'class': 'badge-translate'}) is not None:
        translation = 'translated'
    else:
        translation = ''

    #Number of awards, empty if the fic has no reward badge.
    #Store the text rather than the Beautiful Soup tag itself.
    awardnumber = _badge_text(fic.find("span", {"class": 'badge-reward'}))

    #A hot-fanfic container means the fic is paid
    if fic.find("div", {"class": 'hot-fanfic'}) is not None:
        paid = 'paid'
    else:
        paid = ''

    #Author name is the text of the author span, author ID is its link
    author = fic.find('span', {'class': 'author'})
    authname = _node_text(author)
    authanchor = author.a if author is not None else None
    authlink = authanchor.get('href', '') if authanchor is not None else ''

    #The metadata table: fandom is always the second entry
    tables = fic.find_all('dd')
    fandom = tables[1].text if len(tables) > 1 else ''
    charships = ''
    length = ''
    postdate = ''
    if len(tables) == 4:
        #4 entries: no character ships, then length and post date
        length = tables[2].text
        postdate = tables[3].text
    elif len(tables) == 5:
        #5 entries: character ships, length and post date
        charships = tables[2].text
        length = tables[3].text
        postdate = tables[4].text

    #Tags are the link texts in the tags container, separated by pipes
    tagbox = fic.find("div", {"class": 'tags'})
    if tagbox is not None:
        alltags = '|'.join(tag.text for tag in tagbox.find_all('a'))
    else:
        alltags = ''

    #Blurb is the div with fanfic-description-text
    blurb = _node_text(fic.find('div', {'class': 'fanfic-description-text'}))

    return {"Title": title, "Storylink": storylink, "AuthName": authname, "AuthID": authlink, "Shiptype": shiptype, "Rating": rating, "Status": textstatus, "Likes": likenumber, "Paid": paid, "Fandom": fandom, "Charships": charships, "Award": awardnumber, "Translation": translation, "Length": length, "Postdate": postdate, "Tags": alltags, "Blurb": blurb}


#Python's range excludes its end value, hence the +1 to include LAST_PAGE
for i in range(FIRST_PAGE, LAST_PAGE + 1):
    #Define the full URL as the base URL + the page number
    ScraperURL = ScraperStem + str(i)
    #Print the full URL
    print(ScraperURL)
    #Load the full URL
    driver.get(ScraperURL)
    #Wait 6 seconds
    time.sleep(6)
    #Find all the spoiler / hidden tags
    #(find_elements with a By locator works in both Selenium 3 and 4)
    spoilers = driver.find_elements(By.CLASS_NAME, 'show-hidden-tags-btn')
    #For every visible spoiler / hidden tag, click the tag to show its value
    for spoiler in spoilers:
        if spoiler.is_displayed():
            spoiler.click()
            sleep(randint(1,3))
    #Parse the page source with Beautiful Soup, naming the parser explicitly
    #so the result doesn't depend on which parsers happen to be installed
    soup = BeautifulSoup(driver.page_source, "html.parser")
    #Find the container for the fics
    fics = soup.find_all("div", {'class': 'js-toggle-description'})
    #Scrape every fic on the page, then add the whole page to the dataframe at once
    pageitems = [_scrape_fic(fic) for fic in fics]
    if pageitems:
        russianfanfic = pd.concat([russianfanfic, pd.DataFrame(pageitems)], ignore_index=True)
    #Wait 3-10 seconds before loading the new page
    sleep(randint(3,10))

https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5B1%5D=2&directions%5B2%5D=3&directions%5B3%5D=4&directions%5B4%5D=7&directions%5B5%5D=6&directions%5B6%5D=5&likes_min=&likes_max=&rewards_min=&dateFilterCreate=1&date_create_min=2020-03-01&date_create_max=2020-03-31&date_update_min=&title=&sort=1&rnd=2642490&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8!&p=1#result
https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5B

https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5B1%5D=2&directions%5B2%5D=3&directions%5B3%5D=4&directions%5B4%5D=7&directions%5B5%5D=6&directions%5B6%5D=5&likes_min=&likes_max=&rewards_min=&dateFilterCreate=1&date_create_min=2020-03-01&date_create_max=2020-03-31&date_update_min=&title=&sort=1&rnd=2642490&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8!&p=14#result
https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5

https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5B1%5D=2&directions%5B2%5D=3&directions%5B3%5D=4&directions%5B4%5D=7&directions%5B5%5D=6&directions%5B6%5D=5&likes_min=&likes_max=&rewards_min=&dateFilterCreate=1&date_create_min=2020-03-01&date_create_max=2020-03-31&date_update_min=&title=&sort=1&rnd=2642490&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8!&p=27#result
https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5

https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5B1%5D=2&directions%5B2%5D=3&directions%5B3%5D=4&directions%5B4%5D=7&directions%5B5%5D=6&directions%5B6%5D=5&likes_min=&likes_max=&rewards_min=&dateFilterCreate=1&date_create_min=2020-03-01&date_create_max=2020-03-31&date_update_min=&title=&sort=1&rnd=2642490&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8!&p=40#result
https://ficbook.net/find?fandom_filter=fandom&fandom_group_id=1&fandom_ids%5B0%5D=3276&pages_range=pages_min=&pages_max=&statuses%5B0%5D=1&statuses%5B1%5D=2&statuses%5B2%5D=3&sizes%5B0%5D=2&sizes%5B1%5D=3&sizes%5B2%5D=4&ratings%5B0%5D=5&ratings%5B1%5D=6&ratings%5B2%5D=7&ratings%5B3%5D=8&ratings%5B4%5D=9&transl=1&directions%5B0%5D=1&directions%5

In [19]:
#Display the scraped results; for a long dataframe pandas shows only the
#first and last rows, with a row of "..." in between
russianfanfic

Unnamed: 0,Title,Storylink,AuthName,AuthID,Shiptype,Rating,Status,Likes,Paid,Fandom,Charships,Length,Postdate,Tags,Blurb
0,\nИгра вслепую\n \n,/readfic/9699565,\n\nPak Yeon Hee\n,/authors/2353948,Джен,G,badge-status-finished,5495,,"\nРоулинг Джоан «Гарри Поттер», ...","\nСеверус Снейп, ...",\n 85 с...,\n 06.08.2020\n ...,AU|Курение|Магический реализм|Намеки на отноше...,\n Северус Тобиас Снейп — с...
1,\nМантии на все случаи \n \n,/readfic/9626520,\n\nPak Yeon Hee\n,/authors/2353948,Джен,G,badge-status-finished,5242,,"\nРоулинг Джоан «Гарри Поттер», ...","\nЭйлин Принц, ...",\n 44 с...,\n 21.07.2020\n ...,AU|Намеки на отношения|ООС|Отклонения от канон...,\n Кто-нибудь помнит мадам ...
2,\nПуть Меча и Магии\n \n,/readfic/9654638,\n\nSauron777\n,/authors/3426893,Джен,NC-17,badge-status-finished,5139,,"\nBleach, ...","\nОМП (Изаму Курооками/ГГ)/адекват!Сой Фонг, ...",\n 483 ...,\n 28.05.2021\n ...,AU|Антигерои|Бессмертие|Боги / Божественные су...,\n Завершающее приключение ...
3,\nРону нехорошо\n \n,/readfic/9645116,\n\nDramionaPoterianna\n,/authors/2015051,Гет,PG-13,badge-status-finished,4929,,"\nРоулинг Джоан «Гарри Поттер», ...","\nДрако Малфой/Гермиона Грейнджер, ...",\n 38 с...,\n 21.08.2020\n ...,AU|Бывшие|Гарри Поттер и Драко Малфой — друзья...,\n Рон расстался с Гермионо...
4,\nАмортенция\n \n,/readfic/9637088,\n\nLingShu\n,/authors/3407530,Слэш,NC-17,badge-status-finished,3846,,"\nРоулинг Джоан «Гарри Поттер», ...",\nСеверус Снейп/Гарри Поттер,\n 55 с...,\n 11.10.2020\n ...,Забота / Поддержка|Искусственно вызванные чувс...,\n Через несколько лет посл...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4120,\nКрайне интересные обстоятельства\n ...,/readfic/9323305,\n\nYoung_varvar444\n,/authors/3438811,Гет,PG-13,badge-status-finished,4,,"\nРоулинг Джоан «Гарри Поттер», ...","\nЛили Поттер, ...",\n 6 ст...,\n 23.04.2020\n ...,AU|Учебные заведения,\n AU — Северус — молодой п...
4121,\nАльтернатива\n \n,/readfic/9321178,\n\nRange_Phantom\n,/authors/4278727,Джен,NC-17,badge-status-in-progress,4,,"\nРоулинг Джоан «Гарри Поттер», ...",\nОМП,"\n планируется Макси, напис...",\n 23.04.2020\n ...,AU|Магия|Нецензурная лексика|Серая мораль|Убий...,\n В ту ночь величайший тём...
4122,\nШутка времени\n \n,/readfic/9289745,\n\nAdela_Phoenix\n,/authors/1656450,Фемслэш,NC-17,badge-status-in-progress,4,,"\nРоулинг Джоан «Гарри Поттер», ...","\nАллин Снейп/Рьяна Поттер, ...","\n планируется Миди, написа...",\n 21.04.2020\n ...,AU|Магия|Персонажи-лесбиянки|Развитие отношени...,"\n История прошлого, в кото..."
4123,\nПамять...\n \n,/readfic/9312060,\n\nВзРыВнАя ПлЮшКа ТёМнОгО лОрДа\n,/authors/2670064,Джен,G,badge-status-finished,4,,\nГарри Поттер,"\nСмерть, ...",\n 2 ст...,\n 21.04.2020\n ...,AU|Ангст|Драббл|Магический реализм|Открытый фи...,\n Никто не станет скорбеть...


In [20]:
#Swap tabs, newlines and carriage returns for spaces, covering both the
#escaped text forms (a backslash followed by t, n or r) and the real
#control characters. The original dataframe is left untouched.
cleanrussianfanfic = russianfanfic.replace(
    to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"],
    value=[" ", " "],
    regex=True,
)
cleanrussianfanfic

Unnamed: 0,Title,Storylink,AuthName,AuthID,Shiptype,Rating,Status,Likes,Paid,Fandom,Charships,Length,Postdate,Tags,Blurb
0,Игра вслепую,/readfic/9699565,Pak Yeon Hee,/authors/2353948,Джен,G,badge-status-finished,5495,,"Роулинг Джоан «Гарри Поттер», ...","Северус Снейп, ...",85 ст...,06.08.2020,AU|Курение|Магический реализм|Намеки на отноше...,Северус Тобиас Снейп — сы...
1,Мантии на все случаи,/readfic/9626520,Pak Yeon Hee,/authors/2353948,Джен,G,badge-status-finished,5242,,"Роулинг Джоан «Гарри Поттер», ...","Эйлин Принц, ...",44 ст...,21.07.2020,AU|Намеки на отношения|ООС|Отклонения от канон...,Кто-нибудь помнит мадам М...
2,Путь Меча и Магии,/readfic/9654638,Sauron777,/authors/3426893,Джен,NC-17,badge-status-finished,5139,,"Bleach, ...","ОМП (Изаму Курооками/ГГ)/адекват!Сой Фонг, ...",483 с...,28.05.2021,AU|Антигерои|Бессмертие|Боги / Божественные су...,Завершающее приключение М...
3,Рону нехорошо,/readfic/9645116,DramionaPoterianna,/authors/2015051,Гет,PG-13,badge-status-finished,4929,,"Роулинг Джоан «Гарри Поттер», ...","Драко Малфой/Гермиона Грейнджер, ...",38 ст...,21.08.2020,AU|Бывшие|Гарри Поттер и Драко Малфой — друзья...,Рон расстался с Гермионой...
4,Амортенция,/readfic/9637088,LingShu,/authors/3407530,Слэш,NC-17,badge-status-finished,3846,,"Роулинг Джоан «Гарри Поттер», ...",Северус Снейп/Гарри Поттер,55 ст...,11.10.2020,Забота / Поддержка|Искусственно вызванные чувс...,Через несколько лет после...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4120,Крайне интересные обстоятельства,/readfic/9323305,Young_varvar444,/authors/3438811,Гет,PG-13,badge-status-finished,4,,"Роулинг Джоан «Гарри Поттер», ...","Лили Поттер, ...",6 стр...,23.04.2020,AU|Учебные заведения,AU — Северус — молодой пр...
4121,Альтернатива,/readfic/9321178,Range_Phantom,/authors/4278727,Джен,NC-17,badge-status-in-progress,4,,"Роулинг Джоан «Гарри Поттер», ...",ОМП,"планируется Макси, написа...",23.04.2020,AU|Магия|Нецензурная лексика|Серая мораль|Убий...,В ту ночь величайший тёмн...
4122,Шутка времени,/readfic/9289745,Adela_Phoenix,/authors/1656450,Фемслэш,NC-17,badge-status-in-progress,4,,"Роулинг Джоан «Гарри Поттер», ...","Аллин Снейп/Рьяна Поттер, ...","планируется Миди, написан...",21.04.2020,AU|Магия|Персонажи-лесбиянки|Развитие отношени...,"История прошлого, в котор..."
4123,Память...,/readfic/9312060,ВзРыВнАя ПлЮшКа ТёМнОгО лОрДа,/authors/2670064,Джен,G,badge-status-finished,4,,Гарри Поттер,"Смерть, ...",2 стр...,21.04.2020,AU|Ангст|Драббл|Магический реализм|Открытый фи...,Никто не станет скорбеть ...


In [21]:
#Save the cleaned dataframe as a tab-separated file, without the index column.
#pandas dataframes have no to_tsv method; to_csv with sep="\t" writes TSV.
cleanrussianfanfic.to_csv('/Users/qad/Documents/russianfanfic-2020-04.tsv', index=False, sep="\t")