# Takeda - List
## Beachten
* Für die OCR-Erkennung wurde ABBYY Fine Reader for Mac verwendet (Lizenz vorhanden)
* Wichtig: Sprachen einstellen auf: Deutsch, Englisch, Italienisch, Französisch
* Als CSV exportieren
* HCP & HCO manuell setzen

In [1]:
import sys
sys.path.insert(0, '../../../lib/')

import tabula
import numpy as np
import pandas as pd
import importlib
import re

import pdfexport
importlib.reload(pdfexport)

from pdfexport import *
import consts

## Import CSV

In [8]:
# Column names applied to the CSV, which is read without a header row.
# 'empty1' and 'empty2' name columns that are dropped again during formatting.
header = [
    'name',
    'location',
    'country',
    'address',
    'uci',
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses',
    'empty1',
    'total',
    'empty2',
]

# Load the semicolon-separated export into a DataFrame
df = pd.read_csv(
    "2018_efpia_transparency_disclosure_tov_2018_switzerland_v1.csv",
    sep=";",
    header=None,
    names=header,
)

## Format Table

In [13]:
# Work on a copy so the freshly imported `df` stays untouched
df_export = df.copy()

# Rows without a 'name' value are replaced by the same row shifted one column to the left
missing_name = df_export['name'].isna()
df_export[missing_name] = df_export.shift(-1, axis='columns')

# Drop the two placeholder columns
df_export = df_export.drop(columns=['empty1', 'empty2'])

# Keep only rows that have a 'total' value, then renumber the index
df_export = df_export.dropna(subset=['total'], how='all').reset_index(drop=True)

# Discard rows whose 'name' contains one of these texts
# (a further filter on '(Art. 1.01)' is currently disabled)
for row_marker in ('Number of Recipients', 'Full Name', 'Aggregate amount'):
    df_export = df_export[~df_export['name'].str.contains(row_marker, na=False)]

# Replace carriage returns with a blank (helper from pdfexport)
df_export = remove_carination(df_export, " ")

# Everything before the first row matching "Ass. Frto" is tagged 'hcp', the rest 'hco'
hco_rows = df_export['name'].str.contains("Ass. Frto", na=False)
index_hco = df_export[hco_rows].index[0]
df_export['type'] = np.where(df_export.index < index_hco, 'hcp', 'hco')

# Revert the name parts, split on a blank (helper from pdfexport)
df_export = revert_name(df_export, ' ')

# Clean the number columns: generic cleanup first, then strip stray
# characters and replace letters that stand in for digits
df_export = cleanup_number(df_export)
for stray_char in ('-', '—', '*'):
    df_export = remove_in_numbers(df_export, stray_char)
for wrong_char, right_char in (('O', '0'), ('l', '1')):
    df_export = replace_in_number(df_export, wrong_char, right_char)

# Basic string conversion (helper from pdfexport)
df_export = basic_string_conversion(df_export)

# Postal code: the leading run of non-whitespace characters of 'location'
df_export['plz'] = df_export['location'].str.extract(r'(^\S*)', expand=True)

## Clean Errors
Some numeric columns contain errors. Try to recalculate them from the other columns of the same row. If more than one cell in a row is invalid, the cell cannot be recalculated and keeps the value "ERROR".

In [14]:
def fix_errors(df_export, field):
    """Recompute the cells of ``field`` that are marked ``'ERROR'``.

    The repair relies on the relation ``total == sum of all other number
    fields``: a broken ``total`` is rebuilt as the sum of the other columns,
    any other broken column as ``total`` minus the remaining columns.
    A row is only repaired when every other number field of that row
    consists of digits and dots only and can actually be parsed as a number;
    otherwise its ``'ERROR'`` marker is left in place.

    Args:
        df_export: DataFrame containing every column listed in the
            module-level ``number_fields``. It is modified in place.
        field: Name of the column to repair; must be in ``number_fields``.

    Returns:
        The same ``df_export`` object.

    Raises:
        ValueError: If ``field`` is not contained in ``number_fields``.
    """
    other_fields = list(number_fields)
    other_fields.remove(field)
    # Columns subtracted from the total when ``field`` itself is a component
    components = [f for f in other_fields if f != 'total']
    plain_number = re.compile(r'^[0-9.]*$')

    # Loop over each row whose cell in ``field`` is marked as broken
    for index, row in df_export[df_export[field] == 'ERROR'].iterrows():

        # All other number fields must look numeric, otherwise skip the row
        if any(plain_number.match(str(row[f])) is None for f in other_fields):
            continue

        # The pattern also accepts '', '.' or '1.2.3'; coerce those to NaN
        # instead of raising and leave the row untouched.
        numbers = {
            f: pd.to_numeric(df_export.loc[index, f], errors='coerce')
            for f in other_fields
        }
        if any(pd.isna(value) for value in numbers.values()):
            continue

        if field == 'total':
            # total = sum of all components (accumulated left to right)
            result = 0
            for f in other_fields:
                result = result + numbers[f]
        else:
            # component = total - all remaining components
            result = numbers['total']
            for f in components:
                result = result - numbers[f]

        df_export.loc[index, field] = result

    return df_export

  
# Phase 1: flag every cell that does not consist of digits and dots only.
# Missing cells are not flagged here; they are filled in phase 2.
for column in number_fields:
    looks_numeric = df_export[column].str.contains(r'^[0-9.]*$', regex=True)
    df_export.loc[looks_numeric.eq(False), column] = 'ERROR'

# Phase 2: missing values in the number columns become 0
for column in number_fields:
    df_export[column] = df_export[column].fillna(value=0)

# Phase 3: try to recompute the flagged cells, one column at a time
for column in number_fields:
    df_export = fix_errors(df_export, column)

In [15]:
#Export
# NOTE(review): add_warning and export_list are not defined in this notebook;
# presumably they come from the `from pdfexport import *` star import — verify.
add_warning(manually=True)
# Pass the cleaned table to the export helper under the label 'takeda'
export_list(df_export, 'takeda')

donations_grants not a number
sponsorship not a number
registration_fees not a number
travel_accommodation not a number
fees not a number
related_expenses not a number
total not a number
saved


In [46]:
#write_to_csv(df_export, 'tmp.csv', True)
#write_to_excel(df_export, 'tmp.xlsx', open=True)