In [1]:
import sys
import os
os.chdir("../../")

print(sys.version)

3.8.2 (default, Mar 25 2020, 17:03:02) 
[GCC 7.3.0]


In [2]:
import slim

In [3]:
"""
A script that downloads and parses Salmon Scotland mortality reports.
"""

from bs4 import BeautifulSoup
from requests import get
import os
from pathlib import Path
import tabula
import numpy as np

WEBSITE = "https://www.salmonscotland.co.uk"
REPORT_URL = f"{WEBSITE}/reports/monthly-mortality-rate-%s-%d"


def download(month: str, year: int):
    report_out_folder = Path("output/reports/")
    if year == 2018:
        filename = report_out_folder / f"SS-{year}.pdf"
    else:    
        filename = report_out_folder / f"SS-{month}-{year}.pdf"
    os.makedirs(str(report_out_folder), exist_ok=True)
    
    if not filename.exists():
        if year == 2018:
            # All monthly data were merged in a single pdf apparently
            url = "https://www.salmonscotland.co.uk/reports/scottish-salmon-survival-rates-2018"
        else:
            url = REPORT_URL % (month, year)
        parse_page = get(url).content
        parser = BeautifulSoup(parse_page, "html.parser")
        div = parser.find("div", class_="download-link")
        a = div.find("a")
        download_link = WEBSITE + a["href"]

        report_out_folder = Path("output/reports/")
        if year == 2018:
            report_out_folder / f"SS-{year}.pdf"
        else:
            filename = report_out_folder / f"SS-{month}-{year}.pdf"
        os.makedirs(str(report_out_folder), exist_ok=True)
        downloaded_pdf = get(download_link)

        with filename.open("wb") as f:
            f.write(downloaded_pdf.content)

    return filename

In [20]:
import pandas as pd
import traceback

column_names = ["company", "site", "mortality", "note", "cumulative_mortality"]

def get_pdf_page(pdf, page, month, year):
    # parsing of multiple pages breaks
    candidate = tabula.read_pdf(pdf, pages=page, pandas_options={"header": "none"})
    
    if len(candidate) == 0:
        return pd.DataFrame({column: {} for column in column_names})
    df = candidate[0]
    if page == 2:
        df = df.iloc[3:].reset_index(drop=True)
    
    if len(df.columns) == 4:
        cumulative_mort = df[3].copy()
        df[3] = np.nan
        df[4] = cumulative_mort
    elif len(df.columns) == 3:
        df[3] = np.nan
        df[4] = np.nan
        
    df = df.set_axis(column_names, axis=1)
    

    df["month"] = month
    df["year"] = year
    df["year"] = df["year"].astype(np.int64)
    return df

def get_pdf_pages(pdf, month, year):
    if (month == "December" and year == 2020):
        range_ = range(1, 6)
        
    elif year == 2018:
        # merged pdfs, regular except for september and october
        month_no = datetime.datetime.strptime(month, "%B").month
        if month_no not in [9, 10, 12]:
            range_ = range(2+8*(month_no-1), 2+8*(month_no-1)+6)
        elif month == 9:
            range_ = range(66, 72)
        elif month == 10:
            range_ = range(73, 79)
        else:
            range_ = range(88, 95)
            
    elif year == 2021:
        if month in ["January", "March"]:
            range_ = range(1, 10)
        elif month in ["February"]:
            range_ = range(1, 9)
        elif month in ["August"]:
            range_ = range(3, 11)
        else:
            range_ = range(3, 10)
    else:
        range_ = range(2, 8)
    
    dfs = []
    for page in range_:
        try:
            dfs.append(get_pdf_page(pdf, page, month, year))
        except:
            traceback.print_exc()
        
    return pd.concat(dfs, ignore_index=True, sort=False)

In [23]:
import datetime


def extract_year_report(year):
    def parse_and_extract(month):
        print(f"Parsing {year}-{month}")
        month_label = datetime.date(year, month, 1).strftime('%B')
        try:
            location = download(month_label, year)
            return get_pdf_pages(location, month_label, year)
        except:
            print(f"Unexpected error when fetching {year}-{month}")
            traceback.print_exc()
            
    dfs = [parse_and_extract(month) for month in range(1, 13)]
        
    df = pd.concat(dfs, ignore_index=True)
    df[~df["company"].isnull()].reset_index(drop=True)
    df["year"] = df["year"].apply(np.uint64)
    
    return df

def collate_years(range_):
    return pd.concat((extract_year_report(y) for y in range_), ignore_index=True, sort=False)

In [24]:
# This is going to take some time

REPORT_LOCATION = "output/reports/SS-full-2018-2022.csv"
if not Path(REPORT_LOCATION).exists():
    df = collate_years(range(2018, 2022))
    df = df.replace({"Ardgadden": "Ardgaddan", "Ardcastle Bay": "Ardcastle"})
    df.to_csv(REPORT_LOCATION)
else:
    df = pd.read_csv(REPORT_LOCATION)

df

Parsing 2018-1
Parsing 2018-2
Parsing 2018-3
Parsing 2018-4
Parsing 2018-5
Parsing 2018-6
Parsing 2018-7
Parsing 2018-8
Parsing 2018-9
Here
Parsing 2018-10
Here
Parsing 2018-11
Parsing 2018-12
Here
Parsing 2019-1
Parsing 2019-2
Parsing 2019-3
Parsing 2019-4
Parsing 2019-5
Parsing 2019-6
Parsing 2019-7
Parsing 2019-8
Parsing 2019-9
Parsing 2019-10
Parsing 2019-11
Parsing 2019-12
Parsing 2020-1
Parsing 2020-2
Parsing 2020-3
Parsing 2020-4
Parsing 2020-5
Parsing 2020-6
Parsing 2020-7
Parsing 2020-8
Parsing 2020-9
Parsing 2020-10
Parsing 2020-11
Parsing 2020-12
Parsing 2021-1


Got stderr: Feb 13, 2022 11:06:38 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 13, 2022 11:06:39 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>

Got stderr: Feb 13, 2022 11:06:45 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 13, 2022 11:06:45 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Parsing 2021-2


Got stderr: Feb 13, 2022 11:06:45 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 13, 2022 11:06:46 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Parsing 2021-3


Got stderr: Feb 13, 2022 11:06:51 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Feb 13, 2022 11:06:52 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>



Parsing 2021-4
Parsing 2021-5
Parsing 2021-6
Parsing 2021-7
Parsing 2021-8
Parsing 2021-9
Parsing 2021-10
Parsing 2021-11
Parsing 2021-12
Unexpected error when fetching 2021-12


Traceback (most recent call last):
  File "/tmp/ipykernel_121718/3953251607.py", line 9, in parse_and_extract
    location = download(month_label, year)
  File "/tmp/ipykernel_121718/163114849.py", line 33, in download
    a = div.find("a")
AttributeError: 'NoneType' object has no attribute 'find'


Unnamed: 0,company,site,mortality,note,cumulative_mortality,month,year
0,Cooke Aquaculture (Scotland),Balta Isle,6.0,IPN,In production,January,2018
1,Cooke Aquaculture (Scotland),Bastaness,Fallow,,Fallow,January,2018
2,Cooke Aquaculture (Scotland),Bay of Cleat (North),0.7,,In production,January,2018
3,Cooke Aquaculture (Scotland),Bay of Cleat (South),Farm fallowed in Jan.,,16.0,January,2018
4,Cooke Aquaculture (Scotland),Bay of Ham,Fallow,,Fallow,January,2018
...,...,...,...,...,...,...,...
9111,Scottish Sea Farms Ltd,Wyre,0.9 (Farm stocked in Nov.),,In production,November,2021
9112,Wester Ross Fisheries Ltd,Ardessie A,0.3,,In production,November,2021
9113,Wester Ross Fisheries Ltd,Ardessie B,0.6,,In production,November,2021
9114,Wester Ross Fisheries Ltd,Ardmair,1.8,,In production,November,2021


## Marine Scotland parsing

In [25]:
import json
import re

MS_REPORT_JSON = "https://data.marine.gov.scot/api/3/action/package_show?id=55aa8a12-135e-463e-802b-fb661fa02b73&page=0"
REPORT_JSON_LOCATION = Path("output/reports")

def get_lice_counts_json(year):
    report_json_location = REPORT_JSON_LOCATION / f"MS_{year}.csv"
    if not report_json_location.exists():
        parsed = json.loads(get(MS_REPORT_JSON).content)
        titles_urls = [(res["name"], res["url"]) for res in parsed["result"][0]["resources"]]
        titles, urls = zip(*titles_urls)

        ranges = [range(*tuple(map(int, 
                      re.findall(r"(\d+)-(\d+)", s)[0])
                             )) for s in titles]
        url = next(url for idx, url in enumerate(urls) if year in ranges[idx])
        with report_json_location.open("wb") as f:
            response = get(url, headers={'Content-type': 'application/json'})
            f.write(response.content)

    return report_json_location

def get_lice_counts(year):
    return pd.read_csv(str(get_lice_counts_json(year)))

In [26]:
lice_counts_2019 = get_lice_counts(2019)

In [28]:
lice_counts_2019

Unnamed: 0,Site ID,Site Name,Year,Month,Lice Count,Comments
0,BALT1,Balta Island,2018,January,0.03,
1,BALT1,Balta Island,2018,February,0.07,
2,BALT1,Balta Island,2018,March,0.15,
3,BALT1,Balta Island,2018,April,0.1,
4,BALT1,Balta Island,2018,May,0.06,
...,...,...,...,...,...,...
8274,BRO1,"Corry, Loch Broom",2020,December,0,
8275,WHA2,North Voe,2020,December,F,
8276,MCLN1,MacLeans Nose,2020,December,0,
8277,KIL1,Petersport South (Kilerivagh),2020,December,F,


In [29]:
lice_counts_2019["Site Name"]

0                        Balta Island
1                        Balta Island
2                        Balta Island
3                        Balta Island
4                        Balta Island
                    ...              
8274                Corry, Loch Broom
8275                        North Voe
8276                    MacLeans Nose
8277    Petersport South (Kilerivagh)
8278                     Puldrite Bay
Name: Site Name, Length: 8279, dtype: object

## Extract Fyne 

In [32]:
from slim.simulation.config import Config

config = Config("config_data/config.json", "config_data/Fyne_complete")
farm_names = [farm.name for farm in config.farms]
farm_names

['Tarbert South',
 'Rubha Stillaig',
 'Glenan Bay',
 'Meall Mhor',
 'Gob a Bharra',
 'Strondoir Bay',
 'Ardgaddan',
 'Ardcastle',
 'Quarry Point']

In [33]:
df_fyne = df[df["site"].isin(farm_names)]
df_fyne

Unnamed: 0,company,site,mortality,note,cumulative_mortality,month,year
113,Scottish Salmon Company Ltd,Ardcastle,0.4,,In production,January,2018
114,Scottish Salmon Company Ltd,Ardgaddan,0.6,,In production,January,2018
122,Scottish Salmon Company Ltd,Glenan Bay,0.4,,In production,January,2018
123,Scottish Salmon Company Ltd,Gob a Bharra,0.4,,In production,January,2018
134,Scottish Salmon Company Ltd,Meall Mhor,0.2,,In production,January,2018
...,...,...,...,...,...,...,...
9049,Scottish Salmon Company Ltd,Meall Mhor,0.4,,In production,November,2021
9053,Scottish Salmon Company Ltd,Quarry Point,0.6,,In production,November,2021
9055,Scottish Salmon Company Ltd,Rubha Stillaig,Fallow,,Fallow,November,2021
9061,Scottish Salmon Company Ltd,Strondoir Bay,0.0,,In production,November,2021


In [34]:
fyne_sites_data = lice_counts_2019[lice_counts_2019["Site Name"].isin(farm_names)]
fyne_sites_data

Unnamed: 0,Site ID,Site Name,Year,Month,Lice Count,Comments
1499,FFMC43,Ardcastle,2018,January,0,
1500,FFMC43,Ardcastle,2018,February,0,
1501,FFMC43,Ardcastle,2018,March,0,
1502,FFMC43,Ardcastle,2018,April,0,
1503,FFMC43,Ardcastle,2018,May,0,
...,...,...,...,...,...,...
8208,FFMC14,Meall Mhor,2020,December,0.28,
8213,FFMC29,Quarry Point,2020,December,0.43,
8215,FFMC66,Rubha Stillaig,2020,December,0.17,
8220,FFMC70,Strondoir Bay,2020,December,0.34,


In [35]:
lice_counts_2019["Site Name"].unique()

array(['Balta Island', 'West of Skeo Taing site (Balta Harbour)',
       'Bastaness', 'Basta Voe North West (Kirkabister)',
       'Bay of Cleat North', 'Bay of Cleat', 'Bay of Ham', 'Bay of Vady',
       'Bow of Hascosay', 'Bay of Meil', 'Point of Burkwell (Site 5)',
       'Burrastow', 'Carness Bay', 'Chalmers Hope', 'Cloudin',
       'Cava South', 'Djuba Wick', 'West Fara', 'Skelwick Skerry',
       'Flaeshins', 'Hogan (Site 1)', 'Kirk Noust', 'Lyrawa Bay',
       'Mid Taing', 'Ness of Copister', 'North Sandwick', 'Ouse Ness',
       'Pegal Bay', 'Quanterness (West Shargun Shoal)',
       'Stead of Aithness', 'Turness', 'Uyea Isle', 'Swarta Skerry',
       'Vee Taing', 'Vest Ness', 'Wick of Belmont North',
       'Wick of Vatsetter', 'Winna Ness', 'Setterness East (Bomlo)',
       'Bight of Foraness', 'Cole Deep', 'Collafirth Delting Site 3',
       'Corlarach', 'Papa, East Head of Scalloway', 'East of Langa',
       'East of Papa Little', 'Easter Score Holm',
       'Geo of Vallada

In [36]:
fyne_sites_data["Site Name"].unique()

array(['Ardcastle', 'Ardgaddan', 'Glenan Bay', 'Gob a Bharra',
       'Meall Mhor', 'Quarry Point', 'Rubha Stillaig', 'Strondoir Bay',
       'Tarbert South'], dtype=object)

In [37]:
# We are ready to join the two dataframes

fyne_data_joined = fyne_sites_data.set_index(["Year", "Month", "Site Name"])\
                    .join(df_fyne.set_index(["year", "month", "site"]).rename_axis(["Year", "Month", "Site Name"]),
                          how="right")
fyne_data_joined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Site ID,Lice Count,Comments,company,mortality,note,cumulative_mortality
Year,Month,Site Name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018,January,Ardcastle,FFMC43,0,,Scottish Salmon Company Ltd,0.4,,In production
2018,January,Ardgaddan,FFMC47,0.01,,Scottish Salmon Company Ltd,0.6,,In production
2018,January,Glenan Bay,FFMC10,0,,Scottish Salmon Company Ltd,0.4,,In production
2018,January,Gob a Bharra,FFMC46,0,,Scottish Salmon Company Ltd,0.4,,In production
2018,January,Meall Mhor,FFMC14,0,,Scottish Salmon Company Ltd,0.2,,In production
...,...,...,...,...,...,...,...,...,...
2021,November,Meall Mhor,,,,Scottish Salmon Company Ltd,0.4,,In production
2021,November,Quarry Point,,,,Scottish Salmon Company Ltd,0.6,,In production
2021,November,Rubha Stillaig,,,,Scottish Salmon Company Ltd,Fallow,,Fallow
2021,November,Strondoir Bay,,,,Scottish Salmon Company Ltd,0.0,,In production


In [38]:
# final clean-ups

def clean_up(df):
    def replace(mortality):
        if (x := re.match("(\d+(\.\d+)?)", mortality)):
            return float(x.group())
        return np.nan

    df_copy = df.copy()


    df_copy["mortality"] = df_copy["mortality"].fillna("").apply(replace)
    df_copy["lice_count"] = df_copy["Lice Count"].fillna("").apply(replace)
    df_copy = df_copy.rename({
        "Comments": "mortality_comment",
        "Site ID": "site_id",
        "note": "lice_note"
    }, axis=1)[["site_id", "lice_count", "mortality", "cumulative_mortality", "lice_note", "mortality_comment"]]

    return df_copy

fyne_data_df = clean_up(fyne_data_joined)


In [39]:
from datetime import datetime

def to_day(x):
    return datetime.strptime(f"{x['Year']} {x['Month']} 15", "%Y %B %d")


fyne_data_df = fyne_data_df.reset_index()
fyne_data_df["date"] = fyne_data_df.apply(to_day, axis=1)
fyne_data_df = fyne_data_df.drop(columns=["Year", "Month"])\
                            .set_index(["date", "Site Name"])\
                            .rename_axis(index={"Site Name": "site_name"})
fyne_data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,site_id,lice_count,mortality,cumulative_mortality,lice_note,mortality_comment
date,site_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-15,Ardcastle,FFMC43,0.00,0.4,In production,,
2018-01-15,Ardgaddan,FFMC47,0.01,0.6,In production,,
2018-01-15,Glenan Bay,FFMC10,0.00,0.4,In production,,
2018-01-15,Gob a Bharra,FFMC46,0.00,0.4,In production,,
2018-01-15,Meall Mhor,FFMC14,0.00,0.2,In production,,
...,...,...,...,...,...,...,...
2021-11-15,Meall Mhor,,,0.4,In production,,
2021-11-15,Quarry Point,,,0.6,In production,,
2021-11-15,Rubha Stillaig,,,,Fallow,,
2021-11-15,Strondoir Bay,,,0.0,In production,,


In [40]:
fyne_data_df.to_csv("config_data/Fyne_complete/report.csv")