# Ship and Let Ship: A Brief Analysis of Shipping Trends on Tumblr

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from itertools import islice
import csv

## static test

In [2]:
link = "https://fanlore.org/wiki/Tumblr_Fandometrics%27_Year_in_Review_2014"
response = requests.get(link)
html = response.text

soup = BeautifulSoup(html)

# find every element under 'Most Reblogged Ships'
ship_list = soup.find('span', id='Most_Reblogged_Ships').find_next('ol')

In [3]:
ship_string = ship_list.select('li')[0].text

In [4]:
# full element
# Destiel - Dean Winchester & Castiel, Supernatural
ship_string = ship_list.select('li')[0].text

# first split
# ['Destiel', 'Dean Winchester & Castiel, Supernatural']
ship_parts = ship_string.split(' - ')

# ship name
# Destiel
ship_name = ship_parts[0]

# characters
# character1: Dean Winchester
# character2: Castiel
character_string = ship_parts[1].split(', ')
characters_all = character_string[0].split(' & ')
character1 = characters_all[0]
character2 = characters_all[1]

# show
# Supernatural
show = character_string[1]

ship_dict = {
    'name': ship_name,
    'character1': character1,
    'character2': character2,
    'show': show
}

print(ship_dict)

{'name': 'Destiel', 'character1': 'Dean Winchester', 'character2': 'Castiel', 'show': 'Supernatural'}


### print all the ship info from 2014

In [5]:
ship_list = soup.find('span', id='Most_Reblogged_Ships').find_next('ol')

ships = []

# enumerate starting from 1
for index, ship in enumerate(ship_list.select('li'), 1):
    ship_string = ship.text

    ship_parts = re.split(r' - | -', ship_string)

    name = ship_parts[0]

    character_string = ship_parts[1].split(', ')
    characters_all = character_string[0].split(' & ')
    character1 = characters_all[0]
    character2 = characters_all[1]

    show = character_string[1]

    year = link[-4:]

    ship_info = {
        'year': year,
        'rank': index,
        'name': name,
        'character 1': character1,
        'character 2': character2,
        'show': show,
    }

    ships.append(ship_info)

ships

[{'year': '2014',
  'rank': 1,
  'name': 'Destiel',
  'character 1': 'Dean Winchester',
  'character 2': 'Castiel',
  'show': 'Supernatural'},
 {'year': '2014',
  'rank': 2,
  'name': 'Johnlock',
  'character 1': 'John Watson',
  'character 2': 'Sherlock Holmes',
  'show': 'Sherlock'},
 {'year': '2014',
  'rank': 3,
  'name': 'Larry Stylinson',
  'character 1': 'Harry Styles',
  'character 2': 'Louis Tomlinson',
  'show': 'One Direction'},
 {'year': '2014',
  'rank': 4,
  'name': 'Captain Swan',
  'character 1': 'Captain Hook',
  'character 2': 'Emma Swan',
  'show': 'Once Upon a Time'},
 {'year': '2014',
  'rank': 5,
  'name': 'Ereri',
  'character 1': 'Eren Jaeger',
  'character 2': 'Levi Ackerman',
  'show': 'Attack on Titan'},
 {'year': '2014',
  'rank': 6,
  'name': 'Sterek',
  'character 1': 'Stiles Stilinski',
  'character 2': 'Derek Hale',
  'show': 'Teen Wolf'},
 {'year': '2014',
  'rank': 7,
  'name': 'Troyler',
  'character 1': 'Troye Sivan',
  'character 2': 'Tyler Oakley',

In [6]:
df = pd.DataFrame(ships)
df

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2014,1,Destiel,Dean Winchester,Castiel,Supernatural
1,2014,2,Johnlock,John Watson,Sherlock Holmes,Sherlock
2,2014,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2014,4,Captain Swan,Captain Hook,Emma Swan,Once Upon a Time
4,2014,5,Ereri,Eren Jaeger,Levi Ackerman,Attack on Titan
5,2014,6,Sterek,Stiles Stilinski,Derek Hale,Teen Wolf
6,2014,7,Troyler,Troye Sivan,Tyler Oakley,YouTubers
7,2014,8,MakoHaru,Makoto Tachibana,Haruka Nanase,Free!
8,2014,9,Narry,Harry Styles,Niall Horan,One Direction
9,2014,10,JeanMarco,Jean Kirstein,Marco Bott,Attack on Titan


# now try do a for loop for each year 2014-2020 (reminder 2021 & 2022 not counted)

In [7]:
years = [2014, 2015]
all_ships = []

for year in years:
    link = f"https://fanlore.org/wiki/Tumblr_Fandometrics%27_Year_in_Review_{year}"
    response = requests.get(link)
    html = response.text

    soup = BeautifulSoup(html)

    ship_list = soup.find('span', id='Most_Reblogged_Ships').find_next('ol')

    ships = []

    # enumerate starting from 1
    for index, ship in enumerate(ship_list.select('li'), 1):
        ship_string = ship.text

        ship_parts = re.split(r' - | -', ship_string)

        name = ship_parts[0]

        character_string = ship_parts[1].split(', ')
        characters_all = character_string[0].split(' & ')
        character1 = characters_all[0]
        character2 = characters_all[1]

        show = character_string[1]

        year = link[-4:]

        ship_info = {
            'year': year,
            'rank': index,
            'name': name,
            'character 1': character1,
            'character 2': character2,
            'show': show,
        }

        ships.append(ship_info)

    all_ships.extend(ships)


In [8]:
df = pd.DataFrame(all_ships)
df

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2014,1,Destiel,Dean Winchester,Castiel,Supernatural
1,2014,2,Johnlock,John Watson,Sherlock Holmes,Sherlock
2,2014,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2014,4,Captain Swan,Captain Hook,Emma Swan,Once Upon a Time
4,2014,5,Ereri,Eren Jaeger,Levi Ackerman,Attack on Titan
5,2014,6,Sterek,Stiles Stilinski,Derek Hale,Teen Wolf
6,2014,7,Troyler,Troye Sivan,Tyler Oakley,YouTubers
7,2014,8,MakoHaru,Makoto Tachibana,Haruka Nanase,Free!
8,2014,9,Narry,Harry Styles,Niall Horan,One Direction
9,2014,10,JeanMarco,Jean Kirstein,Marco Bott,Attack on Titan


In [9]:
# now do 2021 and 2022

In [10]:
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020]

ship_ids = {
    2014: 'Most_Reblogged_Ships',
    2015: 'Most_Reblogged_Ships',
    2016: '2016.27s_Top_Ships',
    2017: '2017.27s_Top_Ships',
    2018: '2018.27s_Top_Ships',
    2019: 'Top_100_Ships',
    2020: '100_Ships'
}

all_ships = []

for year in years:
    link = f"https://fanlore.org/wiki/Tumblr_Fandometrics%27_Year_in_Review_{year}"
    response = requests.get(link)
    html = response.text

    soup = BeautifulSoup(html)

    ship_id = ship_ids.get(year)
    ship_list = soup.find('span', id=ship_id).find_next('ol')

    ships = []

    # enumerate starting from 1
    for index, ship in islice(enumerate(ship_list.select('li'), 1), 20):
        ship_string = ship.text

        ship_parts = re.split(r' - | -', ship_string)

        name = ship_parts[0]

        character_string = ship_parts[1].split(', ')
        characters_all = character_string[0].split(' & ')
        character1 = characters_all[0]
        character2 = characters_all[1]

        show_ranked = character_string[1]
        show = re.sub(r'[-+]\d+', '', show_ranked).strip()
                
        year = link[-4:]

        ship_info = {
            'year': year,
            'rank': index,
            'name': name,
            'character 1': character1,
            'character 2': character2,
            'show': show,
        }

        ships.append(ship_info)

    all_ships.extend(ships)


In [11]:
df = pd.DataFrame(all_ships)
df

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2014,1,Destiel,Dean Winchester,Castiel,Supernatural
1,2014,2,Johnlock,John Watson,Sherlock Holmes,Sherlock
2,2014,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2014,4,Captain Swan,Captain Hook,Emma Swan,Once Upon a Time
4,2014,5,Ereri,Eren Jaeger,Levi Ackerman,Attack on Titan
...,...,...,...,...,...,...
135,2020,16,Drarry,Draco Malfoy,Harry Potter,the Harry Potter universe
136,2020,17,Gallavich,Ian Gallagher,Mickey Milkovich,Shameless
137,2020,18,Stucky,Steve Rogers,Bucky Barnes,the Marvel universe
138,2020,19,Zutara,Zuko,Katara,Avatar: The Last Airbender


# 2013

In [12]:
link = "https://fandom.tumblr.com/post/162440709774/most-reblogged-ships-of-2013-makoharu-nanase"
response = requests.get(link)
html = response.text

soup = BeautifulSoup(html)


In [13]:
ships_2013 = soup.find('figcaption').find_next("ol")

ships_2013.select('li')[0]

name = ships_2013.select('li')[0].find("a").text
# MakoHaru

characters_info = ships_2013.select('li')[0].contents[2].strip().split(' & ')
characters = [character.strip().replace(',', '') for character in characters_info]
# ['Nanase Haruka', 'Tachibana Makoto']

show = ships_2013.select('li')[0].contents[3].text
# Free!

In [14]:
ships_2013_info = []

for index, ship in islice(enumerate(ships_2013.select('li'), 1), 20):
    name = ship.find("a").text

    characters_info = ship.contents[2].strip().split(' & ')
    characters = [character.strip().replace(',', '') for character in characters_info]
    character1 = characters[0]
    character2 = characters[1]

    show = ship.contents[3].text

    ships_info = {
            'year': 2013,
            'rank': index,
            'name': name,
            'character 1': character1,
            'character 2': character2,
            'show': show,
        }
    
    ships_2013_info.append(ships_info)
    
all_ships.extend(ships_2013_info)




In [15]:
ships_2013_info
df_2013 = pd.DataFrame(ships_2013_info)

merged_df = pd.concat([df, df_2013], axis=0)

merged_df_final = merged_df.sort_values(by=['year', 'rank'], ascending=True)
merged_df_final

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2013,1,MakoHaru,Nanase Haruka,Tachibana Makoto,Free!
1,2013,2,J2,Jared Padalecki,Jensen Ackles,actors
2,2013,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2013,4,Destiel,Dean Winchester,Castiel,Supernatural
4,2013,5,Johnlock,John Watson,Sherlock,Sherlock
...,...,...,...,...,...,...
135,2020,16,Drarry,Draco Malfoy,Harry Potter,the Harry Potter universe
136,2020,17,Gallavich,Ian Gallagher,Mickey Milkovich,Shameless
137,2020,18,Stucky,Steve Rogers,Bucky Barnes,the Marvel universe
138,2020,19,Zutara,Zuko,Katara,Avatar: The Last Airbender


# 2021

In [16]:
link = "https://fanlore.org/wiki/Tumblr%E2%80%99s_Year_in_Review_2021"
response = requests.get(link)
html = response.text

soup = BeautifulSoup(html)

ships_2021 = soup.find('span', id='Ships').find_next('ol')


In [17]:
ships_2021_info = []

for index, ship in enumerate(ships_2021.select('li'), 1):
    ship_string = ship.text

    ship_parts = re.split(r' - | -', ship_string)

    name = ship_parts[0]

    character_string = ship_parts[1].split(', ')
    characters_all = character_string[0].split(' & ')
    character1 = characters_all[0]
    character2 = characters_all[1]

    show = character_string[1]
    show = re.sub(r'\+\d+', '', show)

    year = link[-4:]

    ship_info = {
        'year': year,
        'rank': index,
        'name': name,
        'character 1': character1,
        'character 2': character2,
        'show': show,
    }

    ships_2021_info.append(ship_info)

all_ships.extend(ships_2021_info)

In [18]:
df_2021 = pd.DataFrame(ships_2021_info)

merged_df = pd.concat([merged_df, df_2021], axis=0)

merged_df_final = merged_df.sort_values(by=['year', 'rank'], ascending=True)
merged_df_final

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2013,1,MakoHaru,Nanase Haruka,Tachibana Makoto,Free!
1,2013,2,J2,Jared Padalecki,Jensen Ackles,actors
2,2013,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2013,4,Destiel,Dean Winchester,Castiel,Supernatural
4,2013,5,Johnlock,John Watson,Sherlock,Sherlock
...,...,...,...,...,...,...
15,2021,16,Zukka,Zuko,Sokka,Avatar: The Last Airbender
16,2021,17,Bumbleby,Yang Xiao Long,Blake Belladonna,RWBY
17,2021,18,Jonmartin,Jonathan Sims,Martin Blackwood,The Magnus Archives
18,2021,19,Gallavich,Ian Gallagher,Mickey Milkovich,Shameless


# 2022

In [19]:
link = "https://fandom.tumblr.com/post/702448526072987648/ships-if-its-not-canon-at-least-theres-always"
response = requests.get(link)
html = response.text

soup = BeautifulSoup(html)

ships_2022 = soup.find('figcaption').find_next("ol")

In [79]:
ships_2022 = soup.find('figcaption').find_next("ol")

ships_2022.select('li')[0]

name = ships_2022.select('li')[0].find("a").text
# MakoHaru

characters_info = ships_2022.select('li')[0].contents[1].strip()
characters_info = re.split(' &amp; | & ', characters_info)
characters = [character.strip().replace(',', '') for character in characters_info]
print(ships_2022.select('li')[2].contents[-2])

# show = ships_2022.select('li')[0].contents[2].text

# ships_2022


Dean Winchester & Castiel, 


In [83]:
ships_2022_info = []

for index, ship in islice(enumerate(ships_2022.select('li'), 1), 20):
    name = ship.find("a").text

    characters_info = ship.contents[-2].strip().split(' & ')
    characters = [character.strip().replace(',', '') for character in characters_info]
    character1 = characters[0]
    character2 = characters[1]

    show = ship.contents[-1].text

    ships_info = {
            'year': 2022,
            'rank': index,
            'name': name,
            'character 1': character1,
            'character 2': character2,
            'show': show,
        }
    
    ships_2022_info.append(ships_info)
    
all_ships.extend(ships_2022_info)

In [94]:
df_2022 = pd.DataFrame(ships_2022_info)

merged_df = pd.concat([merged_df, df_2022], axis=0)

merged_df_final = merged_df.sort_values(by=['year', 'rank'], ascending=True)
merged_df_final

Unnamed: 0,year,rank,name,character 1,character 2,show
0,2013,1,MakoHaru,Nanase Haruka,Tachibana Makoto,Free!
1,2013,2,J2,Jared Padalecki,Jensen Ackles,actors
2,2013,3,Larry Stylinson,Harry Styles,Louis Tomlinson,One Direction
3,2013,4,Destiel,Dean Winchester,Castiel,Supernatural
4,2013,5,Johnlock,John Watson,Sherlock,Sherlock
...,...,...,...,...,...,...
15,2021,16,Zukka,Zuko,Sokka,Avatar: The Last Airbender
16,2021,17,Bumbleby,Yang Xiao Long,Blake Belladonna,RWBY
17,2021,18,Jonmartin,Jonathan Sims,Martin Blackwood,The Magnus Archives
18,2021,19,Gallavich,Ian Gallagher,Mickey Milkovich,Shameless


In [87]:
merged_df_final.to_csv('ships.csv', index=False, encoding='utf-8')

Hannibal 
the Marvel universe 
Supergirl 
9-1-1  
