In [1]:
import sys
import os
os.chdir("../../")

print(sys.version)

3.10.2 (main, Jan 15 2022, 19:56:27) [GCC 11.1.0]


In [2]:
import slim

In [97]:
"""
A script that downloads and parses Salmon Scotland mortality reports.
"""

from bs4 import BeautifulSoup
from requests import get
import os
from pathlib import Path
import tabula

WEBSITE = "https://www.salmonscotland.co.uk"
REPORT_URL = f"{WEBSITE}/reports/monthly-mortality-rate-%s-%d"


def download(month: str, year: int):
    report_out_folder = Path("output/reports/")
    filename = report_out_folder / f"SS-{month}-{year}.pdf"
    os.makedirs(str(report_out_folder), exist_ok=True)
    
    if not filename.exists():
        url = REPORT_URL % (month, year)
        parse_page = get(url).content
        parser = BeautifulSoup(parse_page, "html.parser")
        div = parser.find("div", class_="download-link")
        a = div.find("a")
        download_link = WEBSITE + a["href"]

        report_out_folder = Path("output/reports/")
        filename = report_out_folder / f"SS-{month}-{year}.pdf"
        os.makedirs(str(report_out_folder), exist_ok=True)
        downloaded_pdf = get(download_link)

        with filename.open("wb") as f:
            f.write(downloaded_pdf.content)

    return filename

In [154]:
import pandas as pd

pdf = "output/reports/SS-November-2021.pdf"

column_names = ["company", "site", "mortality", "note", "cumulative_mortality"]

complete_df = None
def get_pdf_page(pdf, page, month, year):
    # parsing of multiple pages breaks
    candidate = tabula.read_pdf(pdf, pages=page, pandas_options={"header": "none"})
    
    if len(candidate) == 0:
        return pd.DataFrame({column: {} for column in column_names})
    df = candidate[0]
    if page == 2:
        df = df.iloc[3:].reset_index(drop=True)
    
    if len(df.columns) == 4:
        cumulative_mort = df[3].copy()
        df[3] = np.nan
        df[4] = cumulative_mort
    elif len(df.columns) == 3:
        df[3] = np.nan
        df[4] = np.nan
        
    df = df.set_axis(column_names, axis=1)
    

    df["month"] = month
    df["year"] = year
    return df

"""
async def get_pdf_pages(pdf, month, year):
    dfs = await asyncio.gather(*[get_pdf_page(pdf, page, month, year) for page in range(3, 7)])
    return pd.concat(dfs, ignore_index=True, sort=False)
"""

def get_pdf_pages(pdf, month, year):
    if month == 12 and year == 2020:
        range_ = range(0, 7)
    else:
        range_ = range(2, 8)
    dfs = [get_pdf_page(pdf, page, month, year) for page in range_]
    return pd.concat(dfs, ignore_index=True, sort=False)

In [157]:
# extract all 2020...
import asyncio
import datetime

"""
async def extract_year_report(year):
    async def parse_and_extract(month):
        print(f"Parsing {month}")
        month_label = datetime.date(year, month, 1).strftime('%B')
        location = await download(month_label, year)
        return await get_pdf_pages(location, month, year)
    
    results = asyncio.gather(*map(parse_and_extract, range(1, 13)))
    dfs = await results
    return dfs #pd.concat(dfs, ignore_index=True, keys=["company", "month", "year"])
"""
def extract_year_report(year):
    def parse_and_extract(month):
        print(f"Parsing {month}")
        month_label = datetime.date(year, month, 1).strftime('%B')
        location = download(month_label, year)
        return get_pdf_pages(location, month, year)
    
    dfs = [parse_and_extract(month) for month in range(1, 13)]
    return pd.concat(dfs, ignore_index=True)

In [158]:
df = extract_year_report(2020)

Parsing 1
Parsing 2
Parsing 3
Parsing 4
Parsing 5
Parsing 6
Parsing 7
Parsing 8
Parsing 9
Parsing 10
Parsing 11


'pages' argument isn't specified.Will extract only from page 1 by default.


Parsing 12


In [159]:
df

Unnamed: 0,company,site,mortality,note,cumulative_mortality,month,year
0,Cooke Aquaculture (Scotland),Balta Isle,0.5,,In production,1.0,2020.0
1,Cooke Aquaculture (Scotland),Bastaness,0.6,,In production,1.0,2020.0
2,Cooke Aquaculture (Scotland),Bay of Cleat (North),0.1,,In production,1.0,2020.0
3,Cooke Aquaculture (Scotland),Bay of Ham,Fallow,,Fallow,1.0,2020.0
4,Cooke Aquaculture (Scotland),Bay of Vady,Fallow,,Fallow,1.0,2020.0
...,...,...,...,...,...,...,...
2339,Scottish Sea Farms Ltd,Wyre,0.01,,,12.0,2020.0
2340,Wester Ross Fisheries Ltd,Ardessie A,0.00,,,12.0,2020.0
2341,Wester Ross Fisheries Ltd,Ardessie B,0.00,,,12.0,2020.0
2342,Wester Ross Fisheries Ltd,Ardmair,0.04,,,12.0,2020.0


In [161]:
filtered_df = df[~df["company"].isnull()].reset_index(drop=True)
filtered_df.to_csv("config_data/mortality_full.csv")