# A. Menarini - List
## Beachten
* Für die OCR-Erkennung wurde ABBYY Fine Reader for Mac verwendet (Lizenz vorhanden)
* Wichtig: Sprachen einstellen auf: Deutsch, Englisch, Italienisch, Französisch
* Als CSV exportieren
* Die Bilder wurden einzeln über den Entwicklermodus in Chrome gespeichert und in Photoshop zu einem grossen Bild zusammengefügt, da die Tabellen in den Einzelbildern teils nicht richtig erkannt wurden
* Erste Zeile wird manuell entfernt
* HCP/HCO manually

In [1]:
import sys
sys.path.insert(0, '../../../lib/')

import tabula
import numpy as np
import pandas as pd
import importlib
import re

import pdfexport
importlib.reload(pdfexport)

from pdfexport import *
import consts

## Import CSV

In [2]:
# Load the OCR export into a DataFrame. Column names are passed explicitly
# through ``names=``.
header = [
    'name',
    'location',
    'country',
    'address',
    'uci',
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses',
    'empty',
    'total',
]

# Every amount column is run through ``str`` so the raw text of each cell is
# kept as-is instead of being parsed by pandas.
conv = dict.fromkeys(
    [
        'donations_grants',
        'sponsorship',
        'registration_fees',
        'travel_accommodation',
        'fees',
        'related_expenses',
        'total',
    ],
    str,
)

df = pd.read_csv("menarini_ocr.csv", sep=";", names=header, converters=conv)

## Format Table

In [3]:
# Work on a copy so the raw import in ``df`` stays untouched.
df_export = df.copy()

# Drop the column named 'empty'.
df_export.drop(columns=['empty'], inplace=True)

# Treat empty strings as missing values.
# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
df_export = df_export.replace("", np.nan)

# Remove rows that have no total or no name.
df_export = df_export.dropna(subset=['total'], how='all')
df_export = df_export.dropna(subset=['name'], how='all')

# Remove the row with index label 1.
# NOTE(review): this drops by label, not by position, and raises KeyError if
# label 1 was already removed by the dropna calls above - confirm intent.
df_export = df_export.drop(index=[1])

# Set the recipient type: every row before the first name containing "ADEL"
# is tagged 'hcp', that row and all later ones 'hco'.
# NOTE(review): ``.index[0]`` raises IndexError when no name contains "ADEL".
index_hco = df_export[df_export['name'].str.contains("ADEL", na=False)].index[0]
df_export['type'] = np.where(df_export.index < index_hco, 'hcp', 'hco')

# Remove rows whose name contains one of these strings (the missing space in
# 'ofValue' is part of the pattern being matched).
df_export = df_export[~df_export['name'].str.contains('Aggregate amount attributable', na=False)]
df_export = df_export[~df_export['name'].str.contains('Transfers ofValue re Research', na=False)]
df_export = df_export[~df_export['name'].str.contains('1 Number of Recipients', na=False)]

# Revert names - only for hcp rows, never for hco rows.
df_export.loc[df_export.type == 'hcp', 'name'] = revert_name(df_export[df_export.type == 'hcp']['name'], ' ')

# Clean the number columns (helpers come from pdfexport).
df_export = cleanup_number(df_export)
df_export = remove_in_numbers(df_export, '*')
df_export = remove_in_numbers(df_export, 'Ft')

# Every row is assigned the same country.
df_export['country'] = 'Switzerland'

# Basic string conversion (helpers come from pdfexport).
df_export = remove_carination(df_export)
df_export = basic_string_conversion(df_export)

# Add PLZ (postal code).
df_export = add_plz(df_export)

revert_name: Be sure: Only revert hcp, not hco!


## Clean Errors
Some numeric columns contain errors. Try to recalculate them from the other amounts in the same row. If more than one cell per row is invalid, the cell keeps the marker "ERROR".

In [4]:
# Snapshot of the formatted table; the error-cleanup cell below starts from a
# copy of this snapshot, so it can be re-run without repeating the formatting.
df_export_tmp = df_export.copy()

In [5]:
def fix_errors(df_export, field, fields=None):
    """Recompute cells of ``field`` that hold the marker 'ERROR'.

    A marked cell is recomputed only when every other amount column of the
    same row consists solely of digits and dots; otherwise the row is left
    unchanged and keeps its 'ERROR' marker.

    * ``field == 'total'``: the new value is the sum of all other columns.
    * any other ``field``: the new value is ``total`` minus all remaining
      component columns.

    Args:
        df_export: DataFrame to repair. It is modified in place.
        field: Name of the amount column to repair.
        fields: Optional list of all amount column names, including
            'total'. Defaults to the module-level ``number_fields``.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If ``field`` is not among the amount columns.
    """
    other_fields = list(number_fields if fields is None else fields)
    other_fields.remove(field)

    # When repairing a component, 'total' is the minuend and not a summand.
    if field == 'total':
        components = other_fields
    else:
        components = [f for f in other_fields if f != 'total']

    numeric_pattern = re.compile(r'^[0-9.]*$')

    # Loop over each row whose cell is marked as an error.
    for index, row in df_export[df_export[field] == 'ERROR'].iterrows():

        # Without clean values in all other columns nothing can be derived.
        if any(numeric_pattern.match(str(row[f])) is None for f in other_fields):
            continue

        if field == 'total':
            value = sum(pd.to_numeric(row[f]) for f in components)
        else:
            # Subtract one column at a time, left to right.
            value = pd.to_numeric(row['total'])
            for f in components:
                value = value - pd.to_numeric(row[f])

        df_export.loc[index, field] = value

    return df_export

# Start again from the snapshot so this cell can be re-run on its own.
df_export = df_export_tmp.copy()

# Missing amounts become the string '0'.
for f in number_fields:
    df_export[f] = df_export[f].fillna('0')

# In every non-numeric amount column, flag each string cell that is not made
# up solely of digits and dots with the marker 'ERROR'.
for field in number_fields:
    if np.issubdtype(df_export[field].dtype, np.number):
        continue
    digits_and_dots_only = df_export[field].str.contains(r'^[0-9.]*$', regex=True)
    # ``== False`` (rather than ``~``) keeps cells for which ``str.contains``
    # yields NaN unflagged.
    df_export.loc[digits_and_dots_only == False, field] = 'ERROR'
        
# When a value has no decimal point, insert one before its last character.
def add_decimal_if_not(field):
    """Return ``field`` with a '.' inserted before its last character.

    The values 'ERROR', '' and '0' are returned unchanged.
    """
    if field in ('ERROR', '', '0'):
        return field

    leading, last_char = field[:-1], field[-1]
    return '.'.join((leading, last_char))

# Insert a decimal point into every amount that has none. The right-hand side
# is computed for the whole column; only the masked rows are assigned.
# ``== False`` (rather than ``~``) leaves cells for which ``str.contains``
# yields NaN untouched.
for field in number_fields:
    df_export.loc[
        df_export[field].str.contains(r'\.', regex=True) == False, field
    ] = df_export[field].apply(add_decimal_if_not)

# Mark rows without a location with 'ERROR'.
df_export.loc[df_export.location.isna(), 'location'] = 'ERROR'

# Try to recompute the cells marked 'ERROR' in each amount column.
for f in number_fields:
    df_export = fix_errors(df_export, f)

# Remove zeros: the string '0' becomes a missing value again.
# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
df_export = df_export.replace('0', np.nan)

#write_to_excel(df_export, 'tmp.xlsx', open=True)

In [6]:
# Export the cleaned table.
# add_warning and export_list are not defined in this notebook; presumably they
# come from the star-import of pdfexport - confirm there what they write.
add_warning(manually=True)
export_list(df_export, 'menarini')
#write_to_excel(df_export, 'tmp.xlsx', open=True)

donations_grants not a number
sponsorship not a number
registration_fees not a number
travel_accommodation not a number
fees not a number
total not a number
saved


In [None]:
#write_to_csv(df_export, 'tmp.csv', True)
#write_to_excel(df_export, 'tmp.xlsx')