# Lundbeck - List

## Beachten
* Kein PDF vorhanden. Lundbeck bietet nur eine Website. Vorgehen:
  * Website öffnen: https://www.lundbeck.com/ch/de/ueber-uns/lundbeck-schweiz-ag/HCP/transparenz
  * Captcha ausfüllen
  * Kompletten Quellcode der Website (`Rechtsklick` -> `Seite speichern unter`) herauskopieren und in der Datei `website_dump.html` speichern
* Der Typ wurde geschätzt: Einträge ohne Titel (Dr. etc.) gelten als hco, Einträge mit Titel als hcp

In [1]:
import sys
sys.path.insert(0, '../../../lib/')

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import copy
import importlib
import re

import pdfexport
importlib.reload(pdfexport)

from pdfexport import *

## Import HTML with Beautiful Soup

In [2]:
# Read the saved page as raw bytes so BeautifulSoup can detect the encoding
# itself instead of relying on the platform's default text encoding, which
# can garble umlauts on systems whose default is not UTF-8.
with open("website_dump.html", "rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# Container <div> whose direct child <div>s are iterated as table rows.
tbl = soup.find("div", class_="table-trnspdscl")
if tbl is None:
    # Fail here with a clear message rather than with an AttributeError on
    # ``None`` further down.
    raise ValueError(
        "No <div class='table-trnspdscl'> found in website_dump.html - "
        "was the page saved after the captcha was solved?"
    )

In [3]:
def extract_text(text):
    """Return the stripped text of a tag with its first ``<span>`` removed.

    The work is done on a copy, so the tag passed in is not modified.

    Args:
        text: BeautifulSoup tag of a single "fee" entry.
            NOTE(review): the removed <span> presumably holds the entry's
            label, leaving only the amount - confirm against the page markup.

    Returns:
        The remaining text content with surrounding whitespace stripped.
    """
    c = copy.copy(text)
    # A tag without a <span> has nothing to remove; without this guard the
    # call failed with AttributeError because ``c.span`` is None.
    if c.span is not None:
        c.span.decompose()
    return c.text.strip()
    
    
# Fee labels as they are searched in the page text, mapped to the record
# field that receives the value. Order matters: the first label contained in
# the fee text wins, so 'Registration Fees' must be tested before the bare
# 'Fees'.
fee_fields = [
    ('Travel & Accommodations', 'travel_accommodation'),
    ('Registration Fees', 'registration_fees'),
    ('Sponsorship agreement with', 'sponsorship'),
    ('Donations and Grants', 'donations_grants'),
    ('Fees', 'fees'),
    ('Related expenses agreed', 'related_expenses'),
]

record_list = []
# Reset explicitly so that re-running this cell can never pick up a stale
# ``record`` left over from a previous execution.
record = None

for row in tbl.find_all("div", recursive=False):
    # A direct child without a class attribute is neither header, main nor
    # detail row; ``row['class']`` would raise KeyError for it.
    classes = row.get('class') or []

    # Skip header
    if 'headers' in classes:
        continue

    # Main row (not "detail"): starts a new record
    if 'row' in classes:
        name_div = row.find("div", class_="hcohcpname")

        # The type is guessed: a nested <div> inside the name cell marks the
        # entry as 'hcp', otherwise it is treated as 'hco'. The nested <div>
        # is removed from a copy so that only the plain name remains.
        # NOTE(review): the nested <div> presumably carries the title
        # (Dr. etc.) - confirm against the page markup.
        type_ = 'hco'
        c = copy.copy(name_div)
        if c.div is not None:
            c.div.decompose()
            type_ = 'hcp'
        name = c.text.strip()
        address = row.find("div", class_="address").text.strip()
        year = row.find("div", class_="year").text.strip()
        record = {'name': name, 'address': address, 'year': year, 'type': type_}

    # Detail row: adds the fee values to the record of the preceding main row
    if 'row-details' in classes:
        if record is None:
            raise ValueError("Found a detail row before any main row")

        # Extract all "fee" entries
        for fee in row.find_all("div", class_='fee'):
            fee_text = fee.text
            for label, field in fee_fields:
                if label in fee_text:
                    record[field] = extract_text(fee)
                    break

        # Add to list
        record_list.append(record)

In [4]:
# Build the DataFrame: one row per collected record
df = pd.DataFrame(data=record_list)

## Format Table

In [5]:
def _split_location(address):
    """Split an address at its last comma into ``(street, location)``.

    Args:
        address: Address string as scraped from the page.

    Returns:
        Tuple ``(street, location)``. ``street`` is everything before the
        last comma, ``location`` is everything after it with surrounding
        whitespace removed. An address without any comma is returned
        unchanged together with an empty location.
    """
    street, comma, place = address.rpartition(',')
    if not comma:
        # No comma at all: nothing to split off (the former regex-based
        # version failed with TypeError here).
        return address, ''
    return street, place.strip()


# Year 2017 only. Take an independent copy so that the column inserts and
# assignments below work on their own data and not on a slice of ``df``.
df_export = df[df.year == '2017'].copy()

# Add missing fields
df_export.insert(0, 'total', '')
df_export.insert(0, 'plz', '')
df_export.insert(0, 'country', 'ch')
df_export.insert(0, 'uci', '')

# Convert to numbers & sum (helpers come from pdfexport)
df_export = cleanup_number(df_export)
df_export = amounts_to_number(df_export)
df_export = sum_amounts(df_export)

# Extract place from address (split at the last comma)
address_parts = [_split_location(a) for a in df_export['address']]
df_export['location'] = [place for _, place in address_parts]
df_export['address'] = [street for street, _ in address_parts]

# Reorder columns
df_export = df_export[fix_columns[:-1]]

# Basic string conversion
df_export = basic_string_conversion(df_export)

export_list(df_export, 'lundbeck')

saved


In [None]:
#write_to_csv(df_export, 'tmp.csv')
#write_to_excel(df_export, 'tmp.xlsx')