# Galderma - List

In [1]:
import sys
sys.path.insert(0, '../../../lib/')

import tabula
import numpy as np
import pandas as pd
import importlib
import re

import pdfexport
importlib.reload(pdfexport)

from pdfexport import *

## Import PDF

In [2]:
# Extract the tables from all pages of the report PDF.
# The result is indexed and filtered as a list of DataFrames in the next cell.
df_list = tabula.read_pdf(
    "CH_Transparency Report_2019.pdf",
    multiple_tables=True,
    lattice=True,
    pages='all',
)

In [3]:
# Keep only the extracted tables that have more than one column
# (single-column frames are presumably stray text, not table data - TODO confirm).
df_list_new = [table for table in df_list if len(table.columns) > 1]

# Stack the remaining tables into one frame with a fresh 0..n-1 index.
# pd.concat raises ValueError if no table passed the filter above.
df = pd.concat(df_list_new, ignore_index=True)

## Format Table

In [4]:
# Work on a copy so the concatenated raw tables in `df` stay untouched.
df_export = df.copy()

# Rename columns. The assignment is positional, so the frame must have exactly
# these 14 columns or pandas raises a length-mismatch ValueError.
df_export.columns = [
    'name', 'location', 'country', 'address', 'uci',
    'donations_grants', 'sponsorship', 'registration_fees',
    'travel_accommodation', 'fees', 'related_expenses',
    'empty1', 'total', 'empty2',
]

# Shift row: rows without a value in `name` are moved one column to the left.
df_export[df_export.name.isna()] = df_export[df_export.name.isna()].shift(-1, axis='columns')

# Add type by "INDIVIDUAL NAMED" marker rows: every row above the SECOND marker
# row is tagged 'hcp', every row from it onwards 'hco'.
# NOTE(review): assumes the first marker opens the HCP section and the second
# one opens the HCO section - confirm against the PDF. Raises IndexError if
# fewer than two marker rows are found.
individuals = df_export[df_export['name'].str.contains("INDIVIDUAL NAMED ", na=False)]
df_export['type'] = np.where(df_export.index < individuals.index[1], 'hcp', 'hco')

# Remove the two placeholder columns.
df_export.drop(columns=['empty1', 'empty2'], inplace=True)

# Remove rows which have no value in `total`, then renumber the index.
df_export = df_export.dropna(subset=['total'], how='all')
df_export = df_export.reset_index(drop=True)

# Replace "N/A" strings with real missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_export.loc[df_export['donations_grants'] == 'N/A', 'donations_grants'] = np.nan
df_export.loc[df_export['sponsorship'] == 'N/A', 'sponsorship'] = np.nan

# Clean the leading dash in the address: a leading " - " is removed, and a
# leading "-" followed by one whitespace character is replaced by a single space.
# NOTE(review): the second replacement leaves a leading space - confirm whether
# an empty replacement was intended.
df_export['address'] = df_export['address'].str.replace(r'^ - ', '', regex=True)
df_export['address'] = df_export['address'].str.replace(r'^-\s', ' ', regex=True)

# Convert amounts to numbers (pdfexport helpers). The ',' is stripped first,
# presumably as a thousands separator - TODO confirm against the helper.
df_export = replace_in_number(df_export, ',', '')
df_export = cleanup_number(df_export)
df_export = amounts_to_number(df_export)

# Remove carriage returns (pdfexport helper; presumably replaces line breaks
# inside cells with the given separator - TODO confirm).
df_export = remove_carination(df_export, " ")

# Revert name (pdfexport helper; presumably swaps the name parts split on ' '
# - TODO confirm).
df_export = revert_name(df_export, ' ')

# Capitalize name: title-case every word.
df_export['name'] = df_export['name'].str.title()

# Add PLZ (pdfexport helper; presumably derives the postal code - TODO confirm).
df_export = add_plz(df_export)

# Basic string conversion (pdfexport helper).
df_export = basic_string_conversion(df_export)

# Hand the cleaned table to the pdfexport exporter under the key 'galderma'.
export_list(df_export, 'galderma')

saved


In [None]:
#write_to_csv(df_export, 'tmp.csv')
#write_to_excel(df_export, 'tmp.xlsx')