# Create Database Import

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../data/lib/')
from consts import *

## Set Year

In [2]:
# Reporting year written into the `tra_year` and `acc_year` columns of the generated tables
const_year = 2017

## Import files

In [3]:
# Typed inputs from the transformation stage: grouped HCP, grouped HCO and all accumulations
df0, df1, df_accumulation_raw = (
    pd.read_csv(csv_path, dtype=dataframe_types)
    for csv_path in (
        '../../data/3. transformation/4_hcp_grouped.csv',
        '../../data/3. transformation/4_hco_grouped.csv',
        '../../data/3. transformation/0_accumulations_all.csv',
    )
)

# Reference tables stored next to the notebook
df_pharma_source_raw = pd.read_csv('sources/pharma_source.csv')
df_pharma = pd.read_csv('sources/liste_companies - connections.csv')
df_plz_raw = pd.read_csv('sources/Post_Adressdaten20190122_edited.csv')

# Presumably the column layout of the postal address file; not referenced by any cell shown here
plz_names = [
    'REC_ART', 'ONRP', 'BFSNR', 'PLZ_TYP', 'POSTLEITZAHL', 'PLZ_ZZ', 'GPLZ', 'ORTBEZ18',
    'ORTBEZ27', 'KANTON', 'SPRACHCODE', 'SPRACHCODE_ABW', 'BRIEFZ_DURCH', 'GILT_AB_DAT',
    'PLZ_BRIEFZUST', 'PLZ_COFF',
]

## Create Group Key
The HCP and HCO files both use the same `group` indexes. Generate a `group_key` that is unique across both files for each address group.

In [4]:
# Work on copies so the frames loaded from disk stay unmodified
df_hcp, df_hco = df0.copy(), df1.copy()

# First key to hand out; numbering continues across both frames
group_key = 1

def set_groupkey(dataframe, startkey):
    """Give every distinct ``group`` value a consecutive integer ``group_key``.

    Keys are handed out in order of first appearance, beginning at ``startkey``.
    Rows whose ``group`` is missing receive no key (NaN).

    The ``group`` column is deliberately left in the frame: the notebook drops it
    from the concatenated data afterwards. (The previous implementation called
    ``drop`` here without using the result, which had no effect.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``group`` column. It is modified in place: a ``group_key``
        column is added or overwritten.
    startkey : int
        First key to assign.

    Returns
    -------
    tuple
        ``(dataframe, next_key)`` where ``next_key`` is the first key not used,
        so a following call can continue the sequence.
    """
    # factorize numbers the distinct values 0..n-1 in order of appearance
    # and marks missing values with -1
    codes, uniques = pd.factorize(dataframe['group'])
    keys = codes + startkey

    if (codes < 0).any():
        # Keep rows without a group unkeyed instead of giving them startkey - 1
        keys = np.where(codes >= 0, keys, np.nan)

    dataframe['group_key'] = keys
    return (dataframe, startkey + len(uniques))

# Number the HCP groups first, then continue the same sequence for the HCO groups
(df_hcp, group_key) = set_groupkey(df_hcp, group_key)
(df_hco, group_key) = set_groupkey(df_hco, group_key)

# Stack both recipient types into a single frame
df_data = pd.concat(objs=[df_hcp, df_hco], sort=False)

## Clean and format

In [5]:
# Drop the per-file group id, store the key as integer and renumber the rows from 0
df_data = (
    df_data
    .drop(columns=['group'])
    .astype({'group_key': 'int'})
    .reset_index(drop=True)
)

# Shift the fresh index so that it starts at 1
df_data.index = df_data.index + 1

## table `transaction_category`
Create this table by hand. IDs have to be this way!

In [6]:
# Category names in id order: the position in this tuple (counted from 1) is the trc_id
_category_names = (
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses',
    'total',
)

category_data = [
    {'trc_id': position, 'trc_name': label}
    for position, label in enumerate(_category_names, start=1)
]

df_transaction_category = pd.DataFrame(category_data)

## table `transaction`
If you get conversion errors, your data is not clean, e.g. there are strings in the value fields.

In [7]:
def add_transaction(df_new, row, column):
    """Return ``df_new`` extended by one transaction record for ``row[column]``.

    Nothing is added when the amount is NaN or zero; ``df_new`` is then returned
    unchanged. Reads the notebook globals ``df_transaction_category`` (to translate
    ``column`` into its ``trc_id``) and ``const_year``.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Transaction table built so far.
    row : pandas.Series or mapping
        Source record providing ``pha_id``, ``group_key``, ``name``, ``location``,
        ``address`` and the amount under ``column``.
    column : str
        Name of the amount column; must match a ``trc_name`` of the category table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the record appended and a fresh 0..n-1 index.

    Raises
    ------
    TypeError
        From ``np.isnan`` when the amount is not numeric (e.g. a string).
    IndexError
        When ``column`` is not present in the category table.
    """
    value = row[column]

    # Missing and zero amounts do not become transactions
    if np.isnan(value) or value == 0:
        return df_new

    category = df_transaction_category.loc[df_transaction_category.trc_name == column, 'trc_id']
    record = {
        'tra_id': 0,  # placeholder, the caller renumbers after all rows are collected
        'tra_fk_pharma': row['pha_id'],
        'tra_fk_recipient': row['group_key'],
        'tra_year': const_year,
        'tra_fk_transaction_category': category.iloc[0],
        'tra_value': value,
        'tra_name_original': row['name'],
        'tra_location_original': row['location'],
        'tra_address_original': row['address']
    }

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pd.concat([df_new, pd.DataFrame([record])], ignore_index=True)

# Start from an empty frame that fixes the column order of the transaction table
col_names = [
    'tra_id',
    'tra_fk_pharma',
    'tra_fk_recipient',
    'tra_year',
    'tra_fk_transaction_category',
    'tra_value',
    'tra_name_original',
    'tra_location_original',
    'tra_address_original',
]
df_transaction = pd.DataFrame(columns=col_names)

# Working copy of the recipient data with the pharma columns joined on the source key
df_data_temp = df_data.copy()
df_data_temp = df_data_temp.merge(right=df_pharma, how='left', left_on='source', right_on='pha_key')

# One transaction per address and amount column; 'total' is intentionally not part of this list
_transaction_value_columns = (
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses',
)
for index, row in df_data_temp.iterrows():
    for value_column in _transaction_value_columns:
        df_transaction = add_transaction(df_transaction, row, value_column)

# Replace the placeholder ids with a running number starting at 1
df_transaction['tra_id'] = df_transaction.index + 1

In [8]:
"""
df_transaction = df_data.copy()

#abstract index
df_transaction = df_transaction.reset_index()

#Join with pharma
df_transaction = df_transaction.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

#Drop
df_transaction.drop(axis=1, columns=['plz', 'uci', 'country', 'type', 'source', 'pha_key', 'pha_name', 'main_address'], inplace=True)

#Rename
df_transaction.columns = ['tra_id', 
                          'tra_name_original', 
                          'tra_location_original', 
                          'tra_address_original',
                          'tra_donations_grants',
                          'tra_sponsorship',
                          'tra_registration_fees',
                          'tra_travel_accommodation',
                          'tra_fees',
                          'tra_related_expenses',
                          'tra_total',
                          'tra_fk_recipient',
                          'tra_fk_pharma']

#Add Year
df_transaction['tra_year'] = const_year

#Format
df_transaction['tra_fk_pharma'] = df_transaction['tra_fk_pharma'].astype('int')

#Reorder dataset
df_transaction = df_transaction[[
                          'tra_id', 
                          'tra_fk_pharma',
                          'tra_fk_recipient',
                          'tra_year',
                          'tra_donations_grants',
                          'tra_sponsorship',
                          'tra_registration_fees',
                          'tra_travel_accommodation',
                          'tra_fees',
                          'tra_related_expenses',
                          'tra_total',
                          'tra_name_original',
                          'tra_location_original', 
                          'tra_address_original'
                        ]]
"""

"\ndf_transaction = df_data.copy()\n\n#abstract index\ndf_transaction = df_transaction.reset_index()\n\n#Join with pharma\ndf_transaction = df_transaction.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')\n\n#Drop\ndf_transaction.drop(axis=1, columns=['plz', 'uci', 'country', 'type', 'source', 'pha_key', 'pha_name', 'main_address'], inplace=True)\n\n#Rename\ndf_transaction.columns = ['tra_id', \n                          'tra_name_original', \n                          'tra_location_original', \n                          'tra_address_original',\n                          'tra_donations_grants',\n                          'tra_sponsorship',\n                          'tra_registration_fees',\n                          'tra_travel_accommodation',\n                          'tra_fees',\n                          'tra_related_expenses',\n                          'tra_total',\n                          'tra_fk_recipient',\n                          'tra_fk_pharma']

## table `recipient`
Import the PLZ (postal code) data and create `plz_shaddow`, a list of all possible ZIP codes for each location.  
Source: https://www.post.ch/de/geschaeftlich/themen-a-z/adressen-pflegen-und-geodaten-nutzen/adress-und-geodaten

In [9]:
# Keep only the main address of every recipient.
# The explicit .copy() makes df_recipient an independent frame, so the column
# assignments below do not write to a filtered slice (SettingWithCopyWarning).
df_recipient = df_data[df_data.main_address == 1].copy()
df_plz = df_plz_raw.copy()

#Group PLZ by location: one comma-separated string of all ZIP codes per lowercased place name
df_plz['plz_shaddow'] = df_plz['POSTLEITZAHL'].astype('str')
df_plz['ORTBEZ18'] = df_plz['ORTBEZ18'].str.lower()
df_plz = df_plz.groupby('ORTBEZ18')['plz_shaddow'].agg(lambda col: ','.join(col))
df_plz = df_plz.to_frame()

#abstract index: the recipient id is the group key
df_recipient['id'] = df_recipient['group_key']

#Lowercase location so it can be matched against the lowercased place names
df_recipient['location_lower'] = df_recipient['location'].str.lower()

#Join recipient and zip (left join: recipients without a matching place keep an empty plz_shaddow)
df_recipient = df_recipient.merge(right = df_plz, how='left', left_on='location_lower', right_on='ORTBEZ18')

#convert plz; errors='ignore' leaves the column unchanged if a value cannot be cast
df_recipient['plz'] = df_recipient['plz'].astype('int', errors='ignore')

#Drop helper, amount and source columns that do not belong to the recipient table
df_recipient = df_recipient.drop(columns=['group_key', 'location_lower', 'main_address', 'donations_grants', 'sponsorship', 'registration_fees', 'travel_accommodation', 'fees', 'related_expenses', 'total', 'source'])

#Add rec_zero_money = 0
df_recipient['rec_zero_money'] = 0

#rename
# NOTE(review): this assignment is positional, so it relies on the column order of the
# input files followed by 'id', 'plz_shaddow', 'rec_zero_money' - confirm if the inputs change
df_recipient.columns = [
        'rec_name',
        'rec_location',
        'rec_country',
        'rec_address',
        'rec_plz',
        'rec_uci',
        'rec_type',
        'rec_id',
        'rec_plz_shaddow',
        'rec_zero_money'
    ]

#Reorder Columns
df_recipient = df_recipient[[
        'rec_id',
        'rec_name',
        'rec_address',
        'rec_location',
        'rec_plz',
        'rec_plz_shaddow',
        'rec_country',
        'rec_uci',
        'rec_zero_money',
        'rec_type'
    ]]


## table `accumulations`

In [10]:
def add_accumulation(df_new, row, column):
    """Return ``df_new`` extended by one accumulation record for ``row[column]``.

    Nothing is added when the amount is NaN or zero; ``df_new`` is then returned
    unchanged. Reads the notebook globals ``df_transaction_category`` (to translate
    ``column`` into its ``trc_id``) and ``const_year``.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Accumulation table built so far.
    row : pandas.Series or mapping
        Source record providing ``pha_id``, ``type`` and the amount under ``column``.
    column : str
        Name of the amount column; must match a ``trc_name`` of the category table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the record appended and a fresh 0..n-1 index.

    Raises
    ------
    TypeError
        From ``np.isnan`` when the amount is not numeric (e.g. a string).
    IndexError
        When ``column`` is not present in the category table.
    """
    value = row[column]

    # Missing and zero amounts are not stored
    if np.isnan(value) or value == 0:
        return df_new

    category = df_transaction_category.loc[df_transaction_category.trc_name == column, 'trc_id']
    record = {
        'acc_id': 0,  # placeholder, the caller renumbers after all rows are collected
        'acc_fk_pharma': row['pha_id'],
        'acc_year': const_year,
        'acc_fk_transaction_category': category.iloc[0],
        'acc_value': value,
        'acc_type': row['type'],
    }

    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pd.concat([df_new, pd.DataFrame([record])], ignore_index=True)

#Create empty dataframe that fixes the column order of the accumulation table
col_names =  [
              'acc_id',
              'acc_fk_pharma',
              'acc_year',
              'acc_fk_transaction_category',
              'acc_value',
              'acc_type'
            ]

df_accumulation_new  = pd.DataFrame(columns = col_names)

#Copy dataframe so the raw import stays untouched
df_accumulation = df_accumulation_raw.copy()

#Convert floats (disabled, kept for reference)
# df_accumulation = convert_float(df_accumulation, 'donations_grants')
# df_accumulation = convert_float(df_accumulation, 'sponsorship')
# df_accumulation = convert_float(df_accumulation, 'registration_fees')
# df_accumulation = convert_float(df_accumulation, 'travel_accommodation')
# df_accumulation = convert_float(df_accumulation, 'fees')
# df_accumulation = convert_float(df_accumulation, 'related_expenses')
# df_accumulation = convert_float(df_accumulation, 'total')

#Select amounts (no counts); .copy() so the assignment below does not write to a filtered slice
df_accumulation = df_accumulation[df_accumulation['type'].isin(['hcp_amount', 'hco_amount', 'rnd'])].copy()

#Rename hcp_amount & hco_amount to hcp & hco; regex=False pins the literal replacement
#independently of the pandas version's default
df_accumulation['type'] = df_accumulation['type'].str.replace('_amount', '', regex=False)

#Join with pharma
df_accumulation = df_accumulation.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

#Iter Accumulations: one record per row and amount column
accumulation_value_columns = [
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses'
]
for index, row in df_accumulation.iterrows():
    for value_column in accumulation_value_columns:
        df_accumulation_new = add_accumulation(df_accumulation_new, row, value_column)

#Add RnD: rnd rows additionally contribute their 'total' amount.
#add_accumulation builds exactly the record the former inline copy built here.
for index, row in df_accumulation[df_accumulation['type'] == 'rnd'].iterrows():
    df_accumulation_new = add_accumulation(df_accumulation_new, row, 'total')

#Set acc_id to index + 1
df_accumulation_new['acc_id'] = df_accumulation_new.index + 1

In [11]:
"""
df_accumulation = df_accumulation_raw.copy()

#Select no "count"
df_accumulation = df_accumulation[df_accumulation['type'].isin(['hcp_amount', 'hco_amount', 'rnd'])]

#Rename hcp_acount & hco_acmount
df_accumulation['type'] = df_accumulation['type'].str.replace('_amount', '')

#Join with pharma
df_accumulation = df_accumulation.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

#Drop Columns
df_accumulation.drop(columns=['pha_key', 'pha_name', 'source'], inplace=True)

#abstract index
df_accumulation.index += 1
df_accumulation.reset_index(drop=False, inplace=True)

#add year
df_accumulation['year'] = const_year

#Rename Columns
df_accumulation.columns = [
    'acc_id',
    'acc_type',
    'acc_donations_grants',
    'acc_sponsorship',
    'acc_registration_fees',
    'acc_travel_accommodation',
    'acc_fees',
    'acc_related_expenses',
    'acc_total',
    'acc_fk_pharma',
    'acc_year'
]

#Reorder Columns
df_accumulation = df_accumulation[[
    'acc_id',
    'acc_fk_pharma',
    'acc_year',
    'acc_donations_grants',
    'acc_sponsorship',
    'acc_registration_fees',
    'acc_travel_accommodation',
    'acc_fees',
    'acc_related_expenses',
    'acc_total',
    'acc_type'
    
]]
"""

'\ndf_accumulation = df_accumulation_raw.copy()\n\n#Select no "count"\ndf_accumulation = df_accumulation[df_accumulation[\'type\'].isin([\'hcp_amount\', \'hco_amount\', \'rnd\'])]\n\n#Rename hcp_acount & hco_acmount\ndf_accumulation[\'type\'] = df_accumulation[\'type\'].str.replace(\'_amount\', \'\')\n\n#Join with pharma\ndf_accumulation = df_accumulation.merge(right = df_pharma, how=\'left\', left_on=\'source\', right_on=\'pha_key\')\n\n#Drop Columns\ndf_accumulation.drop(columns=[\'pha_key\', \'pha_name\', \'source\'], inplace=True)\n\n#abstract index\ndf_accumulation.index += 1\ndf_accumulation.reset_index(drop=False, inplace=True)\n\n#add year\ndf_accumulation[\'year\'] = const_year\n\n#Rename Columns\ndf_accumulation.columns = [\n    \'acc_id\',\n    \'acc_type\',\n    \'acc_donations_grants\',\n    \'acc_sponsorship\',\n    \'acc_registration_fees\',\n    \'acc_travel_accommodation\',\n    \'acc_fees\',\n    \'acc_related_expenses\',\n    \'acc_total\',\n    \'acc_fk_pharma\',\n 

## table `pharma_source`

In [12]:
# Resolve the pharma key of every source entry to the pharma id.
# df_pharma must still contain 'pha_key' here, i.e. this has to run before the
# `pharma` cell removes that column.
df_pharma_source = df_pharma_source_raw.copy()

#Join with pharma
df_pharma_source = df_pharma_source.merge(right = df_pharma, how='left', left_on='phf_key', right_on='pha_key')

#drop columns (the list previously named 'pha_name' twice)
df_pharma_source = df_pharma_source.drop(columns=['pha_name', 'pha_key', 'phf_key'])

#abstract index: shift to 1-based and turn the index into the first column
df_pharma_source.index += 1
df_pharma_source = df_pharma_source.reset_index(drop=False)

#Rename (positional: former index, remaining source column, joined pharma id)
df_pharma_source.columns = [
    'phf_id',
    'phf_source',
    'phf_fk_pharma'
]

#Reorder
df_pharma_source = df_pharma_source[[
    'phf_id',
    'phf_fk_pharma',
    'phf_source'
]]


## table `pharma`

In [13]:
# 'pha_key' is only needed for the joins in the cells above; remove it for the export.
# Non-inplace with errors='ignore' so that re-running this cell does not raise once the
# column is gone. A genuinely missing 'pha_key' would already have failed in the merges above.
df_pharma = df_pharma.drop(columns=['pha_key'], errors='ignore')

## Export files

In [14]:
# Each database table paired with its target file; the numeric prefix reflects the listing order
_exports = [
    (df_transaction_category, '../../data/4. database/0_transaction_category.csv'),
    (df_pharma, '../../data/4. database/1_pharma.csv'),
    (df_pharma_source, '../../data/4. database/2_pharma_source.csv'),
    (df_recipient, '../../data/4. database/3_recipient.csv'),
    (df_transaction, '../../data/4. database/4_transaction.csv'),
    (df_accumulation_new, '../../data/4. database/5_accumulation.csv'),
]

# Write every table without the pandas index; the id columns are part of the data
for table, target_path in _exports:
    table.to_csv(target_path, index=False)