# Create Database Import

In [1]:
import pandas as pd
import numpy as np

## Set Year

In [2]:
# Reporting year written into the `tra_year` and `acc_year` columns of the exported tables.
const_year = 2017

## Import files

In [3]:
# --- Outputs of the transformation stage -------------------------------
# df0 holds the grouped HCP rows, df1 the grouped HCO rows.
df0 = pd.read_csv(
    '../../data/3. transformation/4_hcp_grouped.csv'
)
df1 = pd.read_csv(
    '../../data/3. transformation/4_hco_grouped.csv'
)
df_accumulation_raw = pd.read_csv(
    '../../data/3. transformation/0_accumulations_all.csv'
)

# --- Reference tables shipped next to this notebook ---------------------
df_pharma = pd.read_csv(
    'sources/liste_companies - connections.csv'
)
df_pharma_source_raw = pd.read_csv(
    'sources/pharma_source.csv'
)

# NOTE(review): plz_names is not referenced anywhere else in the visible
# notebook; presumably it documents the column layout of the Post address
# file read below -- confirm, or remove if unused.
plz_names = [
    'REC_ART',
    'ONRP',
    'BFSNR',
    'PLZ_TYP',
    'POSTLEITZAHL',
    'PLZ_ZZ',
    'GPLZ',
    'ORTBEZ18',
    'ORTBEZ27',
    'KANTON',
    'SPRACHCODE',
    'SPRACHCODE_ABW',
    'BRIEFZ_DURCH',
    'GILT_AB_DAT',
    'PLZ_BRIEFZUST',
    'PLZ_COFF',
]
df_plz_raw = pd.read_csv(
    'sources/Post_Adressdaten20190122_edited.csv'
)

## Create Group Key
The HCP and HCO files use overlapping `group` indexes. Generate a `group_key` that is unique across both datasets.

In [4]:
# Work on copies so the frames loaded from disk stay untouched.
df_hcp, df_hco = df0.copy(), df1.copy()

# Running counter: the next unused group key (keys start at 1).
group_key = 1

def set_groupkey(dataframe, startkey):
    """Assign a consecutive integer ``group_key`` to every distinct ``group``.

    Keys are handed out in order of first appearance of each ``group``
    value, starting at ``startkey``. The frame is modified in place (a
    ``group_key`` column is added or overwritten) and is also returned.

    The ``group`` column is deliberately left in the frame: the notebook
    drops it once, after the HCP and HCO frames have been concatenated.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``group`` column.
    startkey : int
        First key to hand out.

    Returns
    -------
    tuple
        ``(dataframe, next_key)`` where ``next_key`` is ``startkey`` plus
        the number of distinct ``group`` values, i.e. the value to pass as
        ``startkey`` for the next frame.
    """
    groups = dataframe['group']
    distinct_groups = groups.unique()

    # A missing group value still consumes a key slot (so numbering stays
    # the same as with a per-group loop) but its rows receive no key.
    key_by_group = {
        group: startkey + offset
        for offset, group in enumerate(distinct_groups)
        if not pd.isna(group)
    }

    # Single vectorised lookup instead of one full-column scan per group.
    # Also guarantees the column exists when the frame is empty.
    dataframe['group_key'] = groups.map(key_by_group)

    return (dataframe, startkey + len(distinct_groups))

# Key the HCP frame first; the HCO frame continues where HCP stopped,
# so no key is shared between the two.
df_hcp, group_key = set_groupkey(dataframe=df_hcp, startkey=group_key)
df_hco, group_key = set_groupkey(dataframe=df_hco, startkey=group_key)

# Stack both frames into one dataset (column order is left unsorted).
df_data = pd.concat(objs=[df_hcp, df_hco], sort=False)

## Clean and format

In [5]:
# Drop the per-file group index (replaced by group_key), store the key as
# an integer and renumber the rows from scratch.
df_data = (
    df_data
    .drop(columns=['group'])
    .astype({'group_key': 'int'})
    .reset_index(drop=True)
)

# Row ids are 1-based.
df_data.index = df_data.index + 1

## table `transaction`
If this cell fails while converting `tra_fk_pharma` to `int` (`df_transaction['tra_fk_pharma'].astype('int')`), at least one `source` value has no match in the pharma list. Most likely a company name is misspelled or the company is new.

In [6]:
# Build the `transaction` table: one row per disclosed payment, linked to a
# pharma company (tra_fk_pharma) and a recipient (tra_fk_recipient).
df_transaction = df_data.copy()

# Turn the 1-based row index into a regular column; it becomes tra_id.
df_transaction = df_transaction.reset_index()

# Join with pharma: look up the company behind each row's `source`.
df_transaction = df_transaction.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

# Fail early, and name the offending values, when a source has no entry in
# the pharma list. Without this check the problem only surfaces further
# down as an opaque NaN-to-int conversion error, after `source` is gone.
unmatched_sources = df_transaction.loc[df_transaction['pha_key'].isna(), 'source'].unique()
if len(unmatched_sources) > 0:
    raise ValueError(
        'No pharma entry found for source(s): '
        + ', '.join(map(str, unmatched_sources))
        + '. Check the spelling or add the company to the pharma list.'
    )

# Drop everything that belongs to the recipient or was only needed for the join.
df_transaction.drop(axis=1, columns=['plz', 'uci', 'country', 'type', 'source', 'pha_key', 'pha_name', 'main_address'], inplace=True)

# Rename. NOTE: this is positional, so it relies on the column order that
# is left after the drop above.
df_transaction.columns = ['tra_id',
                          'tra_name_original',
                          'tra_location_original',
                          'tra_address_original',
                          'tra_donations_grants',
                          'tra_sponsorship',
                          'tra_registration_fees',
                          'tra_travel_accommodation',
                          'tra_fees',
                          'tra_related_expenses',
                          'tra_total',
                          'tra_fk_recipient',
                          'tra_fk_pharma']

# Add year
df_transaction['tra_year'] = const_year

# Format: the left join yields the foreign key as float; store it as integer.
df_transaction['tra_fk_pharma'] = df_transaction['tra_fk_pharma'].astype('int')

# Reorder dataset: ids and keys first, amounts next, original text last.
df_transaction = df_transaction[[
                          'tra_id',
                          'tra_fk_pharma',
                          'tra_fk_recipient',
                          'tra_year',
                          'tra_donations_grants',
                          'tra_sponsorship',
                          'tra_registration_fees',
                          'tra_travel_accommodation',
                          'tra_fees',
                          'tra_related_expenses',
                          'tra_total',
                          'tra_name_original',
                          'tra_location_original',
                          'tra_address_original'
                        ]]


## table `recipient`
Import the PLZ (ZIP code) list and create `plz_shaddow`: a comma-separated list of all ZIP codes that belong to the recipient's location.  
Source: https://www.post.ch/de/geschaeftlich/themen-a-z/adressen-pflegen-und-geodaten-nutzen/adress-und-geodaten

In [7]:
# Build the `recipient` table: one row per group_key, taken from the row
# flagged as the main address, enriched with every ZIP code of its location.
df_plz = df_plz_raw.copy()

# Only main_address. The explicit .copy() makes df_recipient an independent
# frame, so the column assignments below do not write into a filtered slice
# of df_data (which triggers pandas' SettingWithCopyWarning).
df_recipient = df_data[df_data.main_address == 1].copy()

# Group PLZ by location: one comma-separated string of ZIP codes per
# lower-cased place name. The result is a one-column frame indexed by ORTBEZ18.
df_plz['plz_shaddow'] = df_plz['POSTLEITZAHL'].astype('str')
df_plz['ORTBEZ18'] = df_plz['ORTBEZ18'].str.lower()
df_plz = df_plz.groupby('ORTBEZ18')['plz_shaddow'].agg(lambda col: ','.join(col))
df_plz = df_plz.to_frame()

# The group key doubles as the recipient id.
df_recipient['id'] = df_recipient['group_key']

# Lowercase location so it can be matched against the lower-cased place names.
df_recipient['location_lower'] = df_recipient['location'].str.lower()

# Join recipient and zip. ORTBEZ18 is the index of df_plz here, so it is
# matched by name and does not show up as an extra column in the result.
df_recipient = df_recipient.merge(right = df_plz, how='left', left_on='location_lower', right_on='ORTBEZ18')

# Convert plz to integer where possible; on failure the column is left as it is.
df_recipient['plz'] = df_recipient['plz'].astype('int', errors='ignore')

# Drop the helper columns and everything that belongs to the transaction table.
df_recipient.drop(axis=1, columns=['group_key', 'location_lower', 'main_address', 'donations_grants', 'sponsorship', 'registration_fees', 'travel_accommodation', 'fees', 'related_expenses', 'total', 'source'], inplace=True)

# Rename. NOTE: this is positional, so it relies on the column order that
# is left after the drop above.
df_recipient.columns = [
        'rec_name',
        'rec_location',
        'rec_country',
        'rec_address',
        'rec_plz',
        'rec_uci',
        'rec_type',
        'rec_id',
        'rec_plz_shaddow'
    ]

# Reorder columns
df_recipient = df_recipient[[
        'rec_id',
        'rec_name',
        'rec_address',
        'rec_location',
        'rec_plz',
        'rec_plz_shaddow',
        'rec_country',
        'rec_uci',
        'rec_type'
    ]]


## table `accumulations`

In [8]:
# Build the `accumulations` table from the aggregated (non-itemised) amounts.

# Keep only the amount rows (no "count" rows). The explicit .copy() makes
# the result an independent frame, so the assignment below does not write
# into a filtered slice (which triggers pandas' SettingWithCopyWarning).
df_accumulation = df_accumulation_raw[
    df_accumulation_raw['type'].isin(['hcp_amount', 'hco_amount', 'rnd'])
].copy()

# Rename hcp_amount & hco_amount to hcp / hco. regex=False makes this a plain
# substring replacement regardless of the pandas version's default.
df_accumulation['type'] = df_accumulation['type'].str.replace('_amount', '', regex=False)

# Join with pharma: look up the company behind each row's `source`.
# NOTE(review): unlike the transaction table, an unmatched source is not
# detected here; its foreign key would stay empty -- confirm this is intended.
df_accumulation = df_accumulation.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

# Drop columns that were only needed for the join.
df_accumulation.drop(columns=['pha_key', 'pha_name', 'source'], inplace=True)

# Turn the row index into a 1-based id column (it becomes acc_id).
df_accumulation.index += 1
df_accumulation.reset_index(drop=False, inplace=True)

# Add year
df_accumulation['year'] = const_year

# Rename columns. NOTE: this is positional, so it relies on the column
# order that is left after the steps above.
df_accumulation.columns = [
    'acc_id',
    'acc_type',
    'acc_donations_grants',
    'acc_sponsorship',
    'acc_registration_fees',
    'acc_travel_accommodation',
    'acc_fees',
    'acc_related_expenses',
    'acc_total',
    'acc_fk_pharma',
    'acc_year'
]

# Reorder columns
df_accumulation = df_accumulation[[
    'acc_id',
    'acc_fk_pharma',
    'acc_year',
    'acc_donations_grants',
    'acc_sponsorship',
    'acc_registration_fees',
    'acc_travel_accommodation',
    'acc_fees',
    'acc_related_expenses',
    'acc_total',
    'acc_type'
]]

## table `pharma_source`

In [9]:
# Build the `pharma_source` table: one row per source entry, linked to its company.
df_pharma_source = df_pharma_source_raw.copy()

# Join with pharma: look up the company behind each `phf_key`.
df_pharma_source = df_pharma_source.merge(right = df_pharma, how='left', left_on='phf_key', right_on='pha_key')

# Drop the join keys and the company name (each label listed once).
df_pharma_source.drop(columns=['pha_name', 'pha_key', 'phf_key'], inplace=True)

# Turn the row index into a 1-based id column (it becomes phf_id).
df_pharma_source.index += 1
df_pharma_source.reset_index(drop=False, inplace=True)

# Rename. NOTE: this is positional, so it relies on the column order that
# is left after the steps above.
df_pharma_source.columns = [
    'phf_id',
    'phf_source',
    'phf_fk_pharma'
]

# Reorder
df_pharma_source = df_pharma_source[[
    'phf_id',
    'phf_fk_pharma',
    'phf_source'
]]


## table `pharma`

In [10]:
# The lookup key was only needed for the joins above and is not exported.
# After this cell df_pharma no longer has `pha_key`, so the join cells above
# cannot be re-run without reloading df_pharma first.
df_pharma = df_pharma.drop(columns=['pha_key'])

## Export files

In [11]:
# Write every table to the database folder, in numbered order, without the
# pandas row index (each table carries its own id column where needed).
_exports = [
    (df_pharma, '../../data/4. database/1_pharma.csv'),
    (df_pharma_source, '../../data/4. database/2_pharma_source.csv'),
    (df_recipient, '../../data/4. database/3_recipient.csv'),
    (df_transaction, '../../data/4. database/4_transaction.csv'),
    (df_accumulation, '../../data/4. database/5_accumulation.csv'),
]
for _export_frame, _export_path in _exports:
    _export_frame.to_csv(_export_path, index=False)

## Test
Join the exported tables back together and check whether the result still matches the original data.

In [None]:
# Re-join the exported tables: transactions -> recipients -> pharma companies.
df_test = df_transaction.merge(right = df_recipient, how='left', left_on='tra_fk_recipient', right_on='rec_id')
df_test = df_test.merge(right = df_pharma, how='left', left_on='tra_fk_pharma', right_on='pha_id')

# Put the project's lib folder first on the import path so that the
# `pdfexport` module can be imported from there.
import sys
sys.path.insert(0, '../../data/lib/')
import importlib

# Reload after importing so that edits to pdfexport are picked up without
# restarting the kernel.
import pdfexport
importlib.reload(pdfexport)

# NOTE(review): `write_to_excel` used below is presumably provided by this
# star import -- confirm; an explicit import would make the origin visible.
from pdfexport import *

df_test = df_test.sort_values('rec_id')

# Index the joined frame by (rec_id, tra_id).
df_test = df_test.set_index([df_test.rec_id, df_test.tra_id])
# Not the last expression of the cell, so this preview is not displayed.
df_test.head(10)


# Write the joined frame to a temporary Excel file for manual inspection.
# NOTE(review): `open=True` presumably opens the file after writing -- confirm in pdfexport.
write_to_excel(df_test, 'tmp.xlsx', open=True, index=True)
