<a href="https://colab.research.google.com/github/quicksilverri/fanfic-popularuty-prediction/blob/main/fanfic_popularity_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing staff

In [2]:
import requests as req
from bs4 import BeautifulSoup
import pandas as pd 

# Gathering data from the site 

In [3]:
link = 'https://archiveofourown.org/tags/Marvel/works?page=1'
page = req.get(link)

Let's get the first page:

In [4]:
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
fanfics = soup.select('[role~=article]')

Header 

In [109]:
headers = [fanfic.select('.heading a') for fanfic in fanfics]

titles = [header[0].get_text() for header in headers]
authors = [header[1].get_text() for header in headers]
fandoms = [[fandom.get_text() for fandom in header[2:]] for header in headers]

In [74]:
dates = [fanfic.select('.datetime')[0].get_text() for fanfic in fanfics]

Tags

In [108]:
characters_raw = [fanfic.select('.characters a.tag') for fanfic in fanfics]
characters = [[character.get_text() for character in characters_] 
              for characters_ in characters_raw]

parings_raw = [fanfic.select('.relationships a.tag') for fanfic in fanfics]
parings = [[paring.get_text() for paring in parings_]
           for parings_ in parings_raw]

Stats

In [6]:
language = [fanfic.select('dd.language')[0].get_text() for fanfic in fanfics]
chapters = None
words = [fanfic.select('dd.words')[0].get_text() for fanfic in fanfics]
kudos = None

In [11]:
int(words[2].replace(',', ''))

3248

Square info

In [None]:
rating = None
completion = None
relationships = None
warnings = None

In [None]:
descriptions = None

Here we put all the info in one DataFrame:

In [113]:
fanfics_df = pd.DataFrame()
fanfics_df['titles'] = pd.Series(titles)
fanfics_df['author'] = pd.Series(authors)
fanfics_df['fandoms'] = pd.Series(fandoms)
fanfics_df['date'] = pd.Series(dates)
fanfics_df['characters'] = pd.Series(characters)
fanfics_df['parings'] = pd.Series(parings)
fanfics_df['lang'] = pd.Series(language)

In [114]:
fanfics_df

Unnamed: 0,titles,author,fandoms,date,characters,parings,lang
0,Unnecessary Introductions,ijustliveherekay,"[Moon Knight (Comics), Moon Knight (TV 2022), ...",02 Jul 2022,"[Steven Grant (Marvel), Marc Spector, Jake Loc...",[Steven Grant & Khonshu (Moon Knight) & Jake L...,English
1,Beautiful War,habitlwt28,"[One Direction (Band), Marvel Cinematic Univer...",02 Jul 2022,"[Louis Tomlinson, Harry Styles, Zayn Malik, Ni...",[Harry Styles/Louis Tomlinson],English
2,Shallow,CarryOn117,"[Marvel Cinematic Universe, The Avengers (Marv...",02 Jul 2022,"[Tony Stark, Reader]","[Tony Stark/Reader, Avengers Team & Reader]",English
3,I LOVE YOU IN EVERY UNIVERSE,IAmButAFool,[Marvel Cinematic Universe],02 Jul 2022,"[Wanda Maximoff, Reader, Doctor Strange, Ameri...",[Wanda Maximoff/Reader],English
4,Last Wish,Bill_Longbow,"[hollyandvice (hiasobi_writes), Loran_Arameri,...",02 Jul 2022,"[Tony Stark, Steve Rogers]",[Steve Rogers/Tony Stark],English
5,Mates: Loba y ángel,SupercorpSlexie24,"[Marvel Cinematic Universe, Hawkeye (TV 2021)]",02 Jul 2022,"[Yelena Belova, Kate Bishop, Natasha Romanov (...","[Yelena Belova/Kate Bishop, Clint Barton/Laura...",Español
6,Daisy's Day (The Best Day Ever),tessathetesla,[Agents of S.H.I.E.L.D. (TV)],02 Jul 2022,"[Skye | Daisy Johnson, Daniel Sousa, Phil Coul...","[Skye | Daisy Johnson/Daniel Sousa, Phil Couls...",English
7,"longing, rusted, seventeen, daybreak",honeycombclaire,"[Marvel Cinematic Universe, The Avengers (Marv...",02 Jul 2022,"[Peter Parker, Tony Stark, James ""Bucky"" Barne...","[James ""Bucky"" Barnes & Peter Parker, Peter Pa...",English
8,Please Remember,mykinkyyandere,[Marvel Cinematic Universe],02 Jul 2022,"[Stephen Strange, Loki (Marvel), Reader]","[Stephen Strange/Reader, Loki/Reader]",English
9,A game to play,Kill_ua33,"[Marvel, Spider-Man - All Media Types, Deadpoo...",02 Jul 2022,"[Wade Wilson, Peter Parker, Avengers - Character]",[Peter Parker/Wade Wilson],English


We will concat DataFrames of each page into one DataFrame. 

# Creating parse function

### Create Fanfic class

In [108]:
class Fanfic: 
  def __init__(self, fanfic):
    self.fic = fanfic
    self.info = {}

    self.header = self.fic.select('.heading a')
    
    self.get_header()
    self.get_chapters()
    self.get_stats()


  def get_header(self):
    """Sets title, author nickname and list of fandoms into self.stats dict"""
    
    title = self.header[0].get_text()
    author = self.header[1].get_text()
    fandoms = [fandom.get_text() for fandom in self.header[2:]]
     
    self.info['title'] = title
    self.info['author'] = author
    self.info['fandoms'] = fandoms

  
  def get_chapters(self): 
    """Sets number of chapters written and number of chapters intended
    into self.stats dict"""

    def smart_int(n):
      try: 
        return int(n)
      except:
        return None

    chapters = self.fic.select('dd.chapters')[0].get_text()
    written, total = map(smart_int, chapters.split('/'))

    self.info['written'] = written
    self.info['total'] = total

  def get_number(self, selector): 
     """Processes numerical data (removes comma so it can be turned
     into integer)"""

     if self.fic.select(selector): 

       data = self.fic.select(selector)[0].get_text()
       data = int(data.replace(',', ''))
       return data
        
     return None

  def get_stats(self): 
     """Returns data collected in Stats section (words, hits, comments,
     bookmarks, collections and language)"""
      
     words = self.get_number('dd.words')
     hits = self.get_number('dd.hits')
     comments = self.get_number('dd.comments')
     bookmarks = self.get_number('dd.bookmarks')
     collections = self.get_number('dd.collections')

     lang = self.fic.select('dd.language')[0].get_text()
     
     self.info['words'] = words
     self.info['hits'] = hits 
     self.info['comments'] = comments 
     self.info['bookmarks'] = bookmarks
     self.info['collections'] = collections 
     self.info['lang'] = lang


  def get_info(self):
     """Return all the data about Fanfic in a DataFrame"""

     df = pd.DataFrame([self.info])

     return df

### More interesting things

In [102]:
ff = Fanfic(fanfics[1])

In [103]:
ff.info

{'author': 'denimbeans',
 'bookmarks': 583,
 'collections': 4,
 'comments': 2176,
 'fandoms': ['Percy Jackson and the Olympians - Rick Riordan',
  'Percy Jackson and the Olympians & Related Fandoms - All Media Types',
  'Marvel Cinematic Universe',
  'The Avengers (Marvel) - All Media Types'],
 'hits': 117920,
 'lang': 'English',
 'title': 'The Lost Soldiers',
 'total': None,
 'words': 98953,
 'written': 30}

In [None]:
def parse_page(link): 
  """Parses the given page and returns list of fanfics"""
  page = req.get(link)
  soup = BeautifulSoup(page.content, 'html.parser')
  fanfics = soup.select('[role~=article]')

  return fanfics 


In [19]:
fanfics[1].select('dl')

[<dl class="stats">
 <dt class="language">Language:</dt>
 <dd class="language">English</dd>
 <dt class="words">Words:</dt>
 <dd class="words">98,953</dd>
 <dt class="chapters">Chapters:</dt>
 <dd class="chapters"><a href="/works/36118126/chapters/100267578">30</a>/?</dd>
 <dt class="collections">Collections:</dt>
 <dd class="collections"><a href="/works/36118126/collections">4</a></dd>
 <dt class="comments">Comments:</dt>
 <dd class="comments"><a href="/works/36118126?show_comments=true&amp;view_full_work=true#comments">2176</a></dd>
 <dt class="kudos">Kudos:</dt>
 <dd class="kudos"><a href="/works/36118126?view_full_work=true#kudos">4510</a></dd>
 <dt class="bookmarks">Bookmarks:</dt>
 <dd class="bookmarks"><a href="/works/36118126/bookmarks">583</a></dd>
 <dt class="hits">Hits:</dt>
 <dd class="hits">117920</dd>
 </dl>]

In [137]:
chapters = [fanfic.select('dd.chapters')[0].get_text() for fanfic in fanfics]

In [139]:
chapters

['2/7',
 '23/?',
 '1/1',
 '1/1',
 '1/1',
 '6/?',
 '1/1',
 '3/17',
 '1/1',
 '1/1',
 '21/?',
 '25/?',
 '24/?',
 '1/1',
 '1/1',
 '13/?',
 '18/19',
 '1/1',
 '1/?',
 '13/14']

In [149]:
total

In [122]:
stats

[['English', '2,683', '2/7', '4', '68', '6', '321'],
 ['English', '49,618', '23/?', '4', '4', '1', '166'],
 ['English', '2,568', '1/1', '0'],
 ['English', '5,000', '1/1', '0'],
 ['English', '1,759', '1/1', '1', '0'],
 ['Español', '39,284', '6/?', '13', '55', '3', '1120'],
 ['English', '1,290', '1/1', '0'],
 ['English', '8,219', '3/17', '11', '91', '21', '889'],
 ['English', '2,733', '1/1', '0'],
 ['English', '921', '1/1', '2'],
 ['English', '112,927', '21/?', '131', '116', '18', '4547'],
 ['English', '68,359', '25/?', '1', '240'],
 ['English', '43,572', '24/?', '3', '135', '16', '4409'],
 ['English', '1,545', '1/1', '0'],
 ['English', '7,445', '1/1', '0'],
 ['English', '37,543', '13/?', '127', '605', '115', '20293'],
 ['English', '3,105', '18/19', '8', '9', '262'],
 ['English', '819', '1/1', '0'],
 ['English', '1,730', '1/?', '0'],
 ['English', '8,278', '13/14', '48', '41', '12', '1212']]