# EFP Fanfic Metadata Scraper
This notebook scrapes metadata from the Italian fanfic site [EFP Fanfic](https://efpfanfic.net/). To use it, set the *ScraperStem* value below to the URL of a particular fandom page (everything up to and including `pagina=`), and set the range to `(1, last-page + 1)`, where last-page is the final page of the paginated results for that fandom. The `+ 1` is needed because Python's `range` excludes its upper bound.

In [None]:
#Import libraries
import requests
import xml.etree.ElementTree
import csv
import urllib.parse
from random import randint
import time
from time import sleep
import re
import regex
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
#Names of the metadata fields captured for each fic, in output column order
metadata_fields = [
    "Title", "Storylink", "Color", "LastChap", "AuthName", "AuthID",
    "Published", "Updated", "Genre", "Chapters", "Status", "Shiptype",
    "Note", "Warning", "Characters", "Ships", "Contests", "Reviews", "Blurb",
]
#Start from an empty Pandas dataframe with one column per metadata field
italianfanfic = pd.DataFrame(columns=metadata_fields)

Here's the cell you should modify with the fandom base URL and the range.

In [None]:
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0 Chrome/74.0.3729.131'}
#URL base, which is the page for a particular fandom, up to the place that indicates the page
ScraperStem = "https://efpfanfic.net/categories.php?catid=47&parentcatid=47&offset=15&pagina="


def _first_group(pattern, text, group=1):
    """Return capture group `group` of the first match of `pattern` in `text`.

    Returns None when the pattern does not match, so a missing metadata field
    becomes an empty cell instead of raising AttributeError.
    """
    match = re.search(pattern, text)
    return match.group(group) if match is not None else None


#Rows are collected in a plain list and added to the dataframe in one step at the end.
#DataFrame.append was removed in pandas 2.0, and appending row by row copies the whole frame each time.
newrows = []
try:
    #For each page in a particular range
    #(We know the total range by looking at the first page for the fandom, and seeing what # the last page of results is)
    #NOTE: range() excludes its upper bound, so the second number must be the last page + 1
    for i in range(1172,3766):
        #The full URL combines the base with the page number
        ScraperURL = ScraperStem + str(i)
        #Print the page
        print(ScraperURL)
        #Retrieve the page; the timeout keeps one stalled connection from hanging the whole run
        try:
            page = requests.get(ScraperURL, headers=headers, timeout=60)
        except requests.RequestException as err:
            #Report the failed page (so it can be re-scraped later) and move on
            print("Skipping page after request error:", err)
            sleep(randint(4,10))
            continue
        #Parse the page contents with Beautiful Soup
        soup = BeautifulSoup(page.content, 'html.parser')
        #Identify the containers with the fics
        fics = soup.find_all("div", {"class": "storybloc"})
        #For each fic
        for fic in fics:
            #Find the div with the title, the title bar, and the metadata div
            title = fic.find('div', {'class': 'titlestoria'})
            titlebar = fic.find('div', {'class': 'titlebloc'})
            metadata = fic.find('div', {'class': 'notebloc'})
            #Skip blocks that don't have the pieces every fic entry needs
            if title is None or titlebar is None or metadata is None:
                continue
            #Get the link around the title
            titlelink = title.find('a')
            storylink = titlelink.get('href') if titlelink is not None else None
            #Get the title text (this, not the Tag object, is what gets stored)
            titlename = title.text
            #Get the color/rating (ID value of the first div inside the title bar)
            colordiv = titlebar.find('div')
            color = colordiv.get('id') if colordiv is not None else None
            #Look for the div that would indicate it's the last chapter
            lastchapdiv = titlebar.find('div', {'class': 'ultimochap'})
            #The fic counts as flagged only if that div exists and is not empty
            #(non-blank text or a child element such as an image)
            #NOTE(review): confirm against the live markup how the site fills this div
            if lastchapdiv is not None and (lastchapdiv.get_text(strip=True) or lastchapdiv.find(True) is not None):
                lastchap = 'lastchapter'
            else:
                lastchap = ''
            #Find the blurb
            blurbdiv = fic.find('div', {'class': 'introbloc'})
            blurb = blurbdiv.text if blurbdiv is not None else ''
            #Serialize the metadata div once; every regex below runs against this string
            metatext = str(metadata)
            #Author ID has uid = [some number] (kept as the list returned by findall)
            authid = re.findall(r'uid=([0-9]*)', metatext)
            #Author name is the text inside the first link of the metadata div
            authlink = metadata.find('a')
            authname = authlink.text if authlink is not None else None
            #Publication date comes after 'Pubblicata:'
            published = _first_group(r'Pubblicata: ((\d\/*)+) ', metatext)
            #Updated date comes after 'Aggiornata'
            updated = _first_group(r'Aggiornata: ((\d\/*)+) ', metatext)
            #Genre comes after 'Genere' (the captured value includes the trailing '|')
            genre = _first_group(r'Genere: (.*?\|)', metatext)
            #The text after 'Capitoli' holds both the chapter count and the status
            capitoli = _first_group(r'Capitoli: (.*?\<)', metatext)
            if capitoli is not None:
                #Chapter count is the run of digits at the start
                chapters = re.search(r'(\d*)', capitoli).group(0)
                #Status is what sits between '| ' and the closing '<'
                status = _first_group(r'(\| )(.*)(<)$', capitoli, 2)
            else:
                chapters = None
                status = None
            #Ship type comes after 'Tipo di coppia'
            shiptype = _first_group(r'Tipo di coppia: ((.*?)\|)', metatext, 2)
            #Note comes after text 'Note'
            note = _first_group(r'Note: ((.*?)\|)', metatext, 2)
            #Text warning comes after 'Avvertimenti'; blank when absent
            textwarning = _first_group(r'Avvertimenti: (.*)', metatext) or ''
            #Characters come after 'Personaggi'
            characters = _first_group(r'Personaggi: (.*)', metatext)
            #Ships come after 'Coppie'
            ships = _first_group(r'Coppie: (.*)', metatext)
            #Contest info comes after 'Contesto'
            contest = _first_group(r'Contesto: ((.*?)\|)', metatext, 2)
            #Reviews comes at the end before 'recension'
            reviews = _first_group(r'>(\d+)</a> recension', metatext)
            #Record the fic whether or not it has a review count,
            #so fics without a review link are not dropped from the results
            newrows.append({"Title": titlename, "Storylink": storylink, "Color": color, "LastChap": lastchap, "AuthName": authname, "AuthID": authid, "Published": published, "Updated": updated, "Genre": genre, "Chapters": chapters, "Status": status, "Shiptype": shiptype, "Note": note, "Warning": textwarning, "Characters": characters, "Ships": ships, "Contests": contest, "Reviews": reviews, "Blurb": blurb})
        #Sleep 4-10 seconds before going to the next page
        sleep(randint(4,10))
finally:
    #Runs even if the loop is interrupted or fails, so rows scraped so far are kept
    if newrows:
        scraped = pd.DataFrame(newrows, columns=italianfanfic.columns)
        if italianfanfic.empty:
            italianfanfic = scraped
        else:
            italianfanfic = pd.concat([italianfanfic, scraped], ignore_index=True)

In [None]:
#Display the scraped data: as the last expression in the notebook cell,
#the dataframe is shown as the cell's output
italianfanfic

In [None]:
#Replace tabs, newlines and carriage returns with spaces, in a new dataframe (the original is left unchanged)
#First pattern: escaped sequences written out as text (a backslash followed by t, n or r)
#Second pattern: the actual tab, newline and carriage-return characters
whitespace_patterns = [r"\\t|\\n|\\r", "\t|\n|\r"]
space_fillers = [" ", " "]
cleanitalianfanfic = italianfanfic.replace(
    to_replace=whitespace_patterns,
    value=space_fillers,
    regex=True,
    inplace=False,
)
#Display the cleaned data
cleanitalianfanfic


In [None]:
#Write the cleaned dataframe to a tab-separated file, leaving out the index column
tsv_path = '/Users/qad/Documents/italianfanfic2021-2.tsv'
cleanitalianfanfic.to_csv(tsv_path, sep="\t", index=False)