<a href="https://colab.research.google.com/github/quicksilverri/fanfic-popularuty-prediction/blob/main/fanfic_popularity_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fanfiction parser

This notebook scrapes metadata about fanfiction works from Archive of Our Own (AO3, archiveofourown.org) and saves it to a CSV file.

# Import stuff

In [None]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd 
import seaborn as sns
from time import time, sleep
import matplotlib.pyplot as plt

%matplotlib inline

# Parse the data

In [None]:
# Works listing for tag_id=Marvel (sort_column=authors_to_sort_on).
marvel_link =  'https://archiveofourown.org/works?commit=Sort+and+Filter&work_search%5Bsort_column%5D=authors_to_sort_on&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&tag_id=Marvel'
# Works listing for tag_id=Erik+Lehnsherr*s*Charles+Xavier (sort_column=revised_at),
# with freeform ids 11175 and 263297 excluded.
xmen_link = 'https://archiveofourown.org/works?work_search%5Bsort_column%5D=revised_at&work_search%5Bother_tag_names%5D=&exclude_work_search%5Bfreeform_ids%5D%5B%5D=11175&exclude_work_search%5Bfreeform_ids%5D%5B%5D=263297&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&commit=Sort+and+Filter&tag_id=Erik+Lehnsherr*s*Charles+Xavier'
# Listing the parser is created with further down.
link = marvel_link
# NOTE(review): `pages` is not referenced by any visible cell; parse() is called
# with a literal 300 instead — confirm which value is intended.
pages = 1000

### Create Fanfic class

The class is fairly long but straightforward, so its cell is hidden.

In [None]:
def clean(list):
  """Return the text content of every bs4 Tag in the given sequence.

  The result is a new list holding one `get_text()` string per tag, in the
  same order as the input.
  """

  texts = []
  for tag in list:
    texts.append(tag.get_text())
  return texts

In [None]:
class Fanfic:
  """Metadata of a single work, extracted from one entry of a works listing.

  The constructor receives the bs4 element of one listing entry and immediately
  fills `self.info`, a flat dict with one key per collected field. Numeric
  fields that are missing or cannot be parsed are stored as None.
  """

  def __init__(self, fanfic):
    self.fic = fanfic
    self.info = {}

    self.header = self.fic.select('.heading a')

    self.get_header()
    self.get_chapters()
    self.get_stats()
    self.get_date()
    self.get_tags()
    self.get_square()

  @staticmethod
  def _to_int(text):
    """Converts text such as '12,345' to int; returns None if it is not a number."""

    try:
      return int(text.replace(',', ''))
    except ValueError:
      # only "not a number" is expected here ('?', empty string, ...);
      # anything else should surface instead of being silently swallowed
      return None

  def get_header(self):
    """Sets title, author nickname and list of fandoms into self.info dict"""

    # NOTE(review): assumes the second '.heading a' link is always the author;
    # an entry without an author link would shift the fandoms - confirm
    # against the site markup.
    self.info['title'] = self.header[0].get_text()
    self.info['author'] = self.header[1].get_text()
    self.info['fandoms'] = clean(self.header[2:])

  def get_chapters(self):
    """Sets number of chapters written and number of chapters intended
    into self.info dict

    The chapters field has the form 'written/total'; a part that is not a
    number (e.g. '?') is stored as None.
    """

    chapters = self.fic.select('dd.chapters')[0].get_text()
    written, total = map(self._to_int, chapters.split('/'))

    self.info['written'] = written
    self.info['total'] = total

  def get_number(self, selector):
    """Processes numerical data (removes comma so it can be turned
    into integer)

    Returns None when nothing matches the selector or the matched text is
    not a number.
    """

    found = self.fic.select(selector)
    if not found:
      return None

    return self._to_int(found[0].get_text())

  def get_stats(self):
    """Sets data collected in Stats section (words, hits, comments,
    bookmarks, collections, language and kudos) into self.info dict"""

    self.info['words'] = self.get_number('dd.words')
    self.info['hits'] = self.get_number('dd.hits')
    self.info['comments'] = self.get_number('dd.comments')
    self.info['bookmarks'] = self.get_number('dd.bookmarks')
    self.info['collections'] = self.get_number('dd.collections')
    self.info['lang'] = self.fic.select('dd.language')[0].get_text()
    self.info['kudos'] = self.get_number('.kudos a')

  def get_date(self):  # TODO: add date of first publishing?
    """Sets the text of the entry's '.datetime' element into self.info dict"""

    self.info['date'] = self.fic.select('.datetime')[0].get_text()

  def get_tags(self):
    """Sets tag-like data into self.info dict"""

    self.info['characters'] = clean(self.fic.select('.characters a.tag'))
    # the key is spelled 'parings' (sic); kept as is because it becomes the
    # column name of the resulting DataFrame / CSV
    self.info['parings'] = clean(self.fic.select('.relationships a.tag'))
    self.info['freeforms'] = clean(self.fic.select('.freeforms a.tag'))

  def get_square(self):
    """Sets data from square to the left of fanfic title into self.info dict"""

    self.info['rating'] = self.fic.select('.rating .text')[0].get_text()
    self.info['category'] = clean(self.fic.select('.category .text'))
    self.info['completion'] = self.fic.select('.iswip .text')[0].get_text()
    self.info['warnings'] = clean(self.fic.select('.warnings a.tag'))

  def get_info(self):
    """Return all the data about Fanfic in a one-row DataFrame"""

    return pd.DataFrame([self.info])

### Create FanficParser

In [None]:
class FanficParser:
  """Walks a paginated works listing and collects every fanfic in a DataFrame.

  Attributes:
    inlink: listing URL the parser was created with (restored by reset()).
    domain: scheme and host of inlink; relative "next" links are appended to it.
    link: URL of the page that will be fetched next.
    page: counter of the page about to be parsed, starting at 1.
    fanfics: listing entries found on the most recently parsed page.
    finished: True once a page without a "next" link has been parsed.
    df: all fanfics collected so far, one row per work.
  """

  RETRY_DELAY = 20  # seconds to wait after a page that could not be parsed
  REQUEST_TIMEOUT = 30  # seconds before an unanswered request is aborted

  def __init__(self, initial_link):
    self.inlink = initial_link
    self.domain = self.get_domain()
    self.df = pd.DataFrame()
    self.link = self.inlink
    self.page = 1
    self.finished = False

    self.fanfics = []

  def get_domain(self):
    """Returns the scheme and host part of the initial link (no trailing slash)."""

    # start searching after 'https://' so the slashes of the scheme are skipped
    end_of_link = self.inlink.find('/', 8)
    if end_of_link == -1:
      # no path at all: the whole link is the domain
      return self.inlink

    return self.inlink[:end_of_link]

  def parse_page(self):
    """Fetches self.link, stores its entries in self.fanfics and moves on.

    Three outcomes are possible:
      * a "next" link is present: self.link is advanced to the following page;
      * the response is fine and has entries but no "next" link: this was the
        last page, self.finished is set;
      * anything else: the page is considered not parsed, self.fanfics is
        emptied and the parser waits RETRY_DELAY seconds; self.link is left
        untouched so the next call retries the same page.
    """

    page = req.get(self.link, timeout=self.REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')
    self.fanfics = soup.select('[role~=article]')

    next_anchor = soup.select('li.next a')
    new_link_relative = next_anchor[0].get('href') if next_anchor else None

    if new_link_relative:
      self.link = self.domain + new_link_relative
    elif page.ok and self.fanfics:
      # a valid page with nowhere to go next: stop here, otherwise the same
      # page would be fetched (and its fanfics collected) again and again
      self.finished = True
    else:
      self.fanfics = []
      print('page not parsed, let me wait a sec')
      sleep(self.RETRY_DELAY)
      return

    print(f'{self.page} parsed')
    self.page += 1

    if self.finished:
      print('last page reached')

  def parse(self, n_pages):
    """Parses up to n_pages pages and adds their fanfics to self.df.

    Stops early once the last page of the listing has been parsed. A page
    that could not be parsed still counts towards n_pages. Rows collected
    before an error or interruption are kept in self.df.
    """

    start_time = time()
    frames = []

    try:
      for _ in range(n_pages):
        if self.finished:
          break

        self.parse_page()

        for fanfic in self.fanfics:
          frames.append(Fanfic(fanfic).get_info())
    finally:
      # one concat at the end instead of growing the frame row by row
      # (DataFrame.append is gone in pandas 2 and was quadratic anyway)
      if not self.df.empty:
        frames.insert(0, self.df)
      if frames:
        self.df = pd.concat(frames, ignore_index=True)

    print(f'total {self.df.shape[0]} fanfics')

    end_time = time()
    print(f'time for execution {end_time - start_time}')

  def reset(self):
    """Drops everything collected so far and returns to the initial link."""

    self.df = pd.DataFrame()
    self.link = self.inlink
    self.page = 1
    self.fanfics = []
    self.finished = False
    print('Parser resetted')

## Parse info

In [None]:
# Parser starting at the listing selected in `link`.
parser = FanficParser(link)

In [None]:
# Walk 300 listing pages; each page is an HTTP request, so this cell is slow.
# NOTE(review): the `pages` constant defined earlier is not used here - confirm
# whether 300 or `pages` is intended.
parser.parse(300)

In [None]:
# Same DataFrame object the parser accumulates into (not a copy).
df = parser.df

In [None]:
# (rows, columns): one row per collected fanfic, one column per key of Fanfic.info.
df.shape

(11640, 20)

In [None]:
# Written to the current working directory; the index is saved as the first,
# unnamed column because to_csv is called with its defaults.
df.to_csv('new_fanfics.csv')