# Create Database Import

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../../data/lib/')
from consts import *

## Import files

In [2]:
# Grouped HCP / HCO rows and the raw accumulation figures.
# `dataframe_types` is presumably provided by the star import from consts.
df0 = pd.read_csv(
    '../../data/3. transformation/6_hcp_grouped.csv',
    dtype=dataframe_types,
)
df1 = pd.read_csv(
    '../../data/3. transformation/6_hco_grouped.csv',
    dtype=dataframe_types,
)
df_accumulation_raw = pd.read_csv(
    '../../data/3. transformation/1_accumulations_all.csv',
    dtype=dataframe_types,
)

# Lookup tables stored next to this notebook
df_pharma = pd.read_csv('sources/liste_companies.csv')
df_pharma_source_raw = pd.read_csv('sources/pharma_source.csv')
df_plz_raw = pd.read_csv('sources/Post_Adressdaten20190122_edited.csv')

# Presumably the column names of the postal address file.
# NOTE(review): this list is never passed to read_csv, so the header row of
# the CSV is what actually names the columns - confirm whether it is still needed.
plz_names = [
    'REC_ART', 'ONRP', 'BFSNR', 'PLZ_TYP', 'POSTLEITZAHL', 'PLZ_ZZ', 'GPLZ', 'ORTBEZ18',
    'ORTBEZ27', 'KANTON', 'SPRACHCODE', 'SPRACHCODE_ABW', 'BRIEFZ_DURCH', 'GILT_AB_DAT',
    'PLZ_BRIEFZUST', 'PLZ_COFF',
]

## Create Group Key
The HCO and HCP files both use the same group indexes, so their raw `group` values collide. Generate a `group_key` that is unique across both files for every address group.

In [3]:
# Work on copies so the frames loaded from disk stay untouched
df_hcp, df_hco = df0.copy(), df1.copy()

# First key to hand out; it is passed through set_groupkey for both frames
group_key = 1

def set_groupkey(dataframe, startkey):
    """Number the distinct values of the ``group`` column consecutively.

    Every distinct, non-missing value in ``dataframe['group']`` receives one
    key, handed out in order of first appearance and beginning at
    ``startkey``. The keys are written to a ``group_key`` column on
    ``dataframe`` itself, i.e. the frame is modified in place.

    The ``group`` column is deliberately left in place. The previous version
    called ``dataframe.drop(...)`` here but discarded the result, so the
    column was never removed; the notebook drops it after concatenating the
    frames.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``group`` column.
    startkey : int
        Key assigned to the first group.

    Returns
    -------
    tuple
        ``(dataframe, next_key)`` where ``next_key`` is the first key that
        was not used, ready to be passed as ``startkey`` for the next frame.

    Notes
    -----
    Rows whose ``group`` is missing get a missing ``group_key`` and do not
    consume a key. The column is created even for an empty frame.
    """
    # factorize() labels the values 0..k-1 in order of first appearance and
    # marks missing values with -1. This replaces building one boolean mask
    # per group, which was O(groups * rows).
    codes, uniques = pd.factorize(dataframe['group'])

    # np.where yields a float array (as the former .loc assignment did) and is
    # assigned positionally, so the frame's index does not matter.
    dataframe['group_key'] = np.where(codes >= 0, codes + startkey, np.nan)

    return (dataframe, startkey + len(uniques))

# Key the HCP rows first. set_groupkey returns the next unused key, which is
# then used as the start key for the HCO rows, so the order of these two
# calls determines which keys each frame receives.
df_hcp, group_key = set_groupkey(df_hcp, group_key)
df_hco, group_key = set_groupkey(df_hco, group_key)

#Concat
# Stack HCP and HCO rows into one frame; sort=False keeps the column order
# instead of sorting the columns alphabetically.
df_data = pd.concat([df_hcp, df_hco], sort=False)

## Clean and format

In [4]:
#Clean
# Rebind instead of dropping in place, and tolerate an already-removed column,
# so that re-running this cell does not raise a KeyError.
df_data = df_data.drop(columns=['group'], errors='ignore')

#Format
# The keys were created as floats; store them as integers.
df_data['group_key'] = df_data['group_key'].astype('int')

#Reindex
# Fresh 1-based row index (the concatenated frames each brought their own index).
df_data = df_data.reset_index(drop=True)
df_data.index += 1

## table `transaction_category`
Create this table by hand. IDs have to be this way!

In [5]:
# Fixed (id, name) pairs of the transaction categories. The ids are spelled
# out on purpose: they must not change when entries are added or reordered.
category_data = [
    {'trc_id': trc_id, 'trc_name': trc_name}
    for trc_id, trc_name in (
        (1, 'donations_grants'),
        (2, 'sponsorship'),
        (3, 'registration_fees'),
        (4, 'travel_accommodation'),
        (5, 'fees'),
        (6, 'related_expenses'),
        (7, 'total'),
    )
]

df_transaction_category = pd.DataFrame(category_data)

## table `transaction`

If you get conversion errors, your data is not clean, e.g. there are strings in the value fields.

In [6]:
# Running count of the rows created by add_transaction; reset whenever this cell runs.
c = 0

def add_transaction(df_new, row, column, cat_id):
    """Append one transaction row for ``row[column]`` if it holds an amount.

    Nothing is added when the value is NaN or zero. Otherwise a row is built
    from the recipient/pharma fields of ``row`` and appended, and the global
    counter ``c`` is incremented.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Transactions collected so far. It is not modified.
    row : pandas.Series or mapping
        Source record providing ``pha_id``, ``group_key``, ``year``,
        ``name``, ``location``, ``address`` and the value under ``column``.
    column : str
        Name of the value field to read from ``row``.
    cat_id : int
        Transaction category id stored with the new row.

    Returns
    -------
    pandas.DataFrame
        ``df_new`` itself when nothing was added, otherwise a new frame with
        the extra row and a fresh 0..n-1 index. ``tra_id`` is a placeholder
        (0) that the caller overwrites afterwards.

    Raises
    ------
    TypeError
        If the value is not numeric (np.isnan rejects strings), which is how
        unclean value fields surface.
    """
    global c

    value = row[column]
    if np.isnan(value) or value == 0:
        return df_new

    c += 1
    record = {
        'tra_id': 0,
        'tra_fk_pharma': row['pha_id'],
        'tra_fk_recipient': row['group_key'],
        'tra_year': row['year'],
        'tra_fk_transaction_category': cat_id,
        'tra_value': value,
        'tra_name_original': row['name'],
        'tra_location_original': row['location'],
        'tra_address_original': row['address']
    }

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concatenating a one-row frame is the supported equivalent.
    return pd.concat([df_new, pd.DataFrame([record])], ignore_index=True)

#Column layout of the `transaction` table
col_names =  [
              'tra_id',
              'tra_fk_pharma',
              'tra_fk_recipient',
              'tra_year',
              'tra_fk_transaction_category',
              'tra_value',
              'tra_name_original',
              'tra_location_original',
              'tra_address_original'
            ]

#Copy dataframe
df_data_temp = df_data.copy()
print(len(df_data_temp))

#Join with pharma
# Left join: rows whose `source` has no matching `pha_key` keep a missing pha_id.
df_data_temp = df_data_temp.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

#Prepare transaction category id
cat_donations_grants = df_transaction_category.loc[df_transaction_category.trc_name == 'donations_grants', 'trc_id'].iloc[0]
cat_sponsorship = df_transaction_category.loc[df_transaction_category.trc_name == 'sponsorship', 'trc_id'].iloc[0]
cat_registration_fees = df_transaction_category.loc[df_transaction_category.trc_name == 'registration_fees', 'trc_id'].iloc[0]
cat_travel_accommodation = df_transaction_category.loc[df_transaction_category.trc_name == 'travel_accommodation', 'trc_id'].iloc[0]
cat_fees = df_transaction_category.loc[df_transaction_category.trc_name == 'fees', 'trc_id'].iloc[0]
cat_related_expenses = df_transaction_category.loc[df_transaction_category.trc_name == 'related_expenses', 'trc_id'].iloc[0]

#Value column -> category id, in the order the transactions are emitted per row
value_columns = [
    ('donations_grants', cat_donations_grants),
    ('sponsorship', cat_sponsorship),
    ('registration_fees', cat_registration_fees),
    ('travel_accommodation', cat_travel_accommodation),
    ('fees', cat_fees),
    ('related_expenses', cat_related_expenses),
]

#Iter Addresses
# Collect plain records and build the frame once at the end. Growing a
# DataFrame row by row copies it on every step (quadratic run time) and
# relied on DataFrame.append, which pandas 2.0 removed.
records = []
for index, row in df_data_temp.iterrows():
    for column, cat_id in value_columns:
        value = row[column]
        # np.isnan raises a TypeError on strings, so unclean value fields fail loudly
        if np.isnan(value) or value == 0:
            continue
        records.append({
            # tra_id is the 1-based position of the transaction
            'tra_id': len(records) + 1,
            'tra_fk_pharma': row['pha_id'],
            'tra_fk_recipient': row['group_key'],
            'tra_year': row['year'],
            'tra_fk_transaction_category': cat_id,
            'tra_value': value,
            'tra_name_original': row['name'],
            'tra_location_original': row['location'],
            'tra_address_original': row['address']
        })

df_transaction = pd.DataFrame(records, columns=col_names)

#Number of transactions created
c = len(records)
print(c)

30630
44504


## table `recipient`
Import the PLZ (postal code) data and create `plz_shadow` with all possible postal codes for this location.  
Source: https://www.post.ch/de/geschaeftlich/themen-a-z/adressen-pflegen-und-geodaten-nutzen/adress-und-geodaten

In [11]:
# Builds the `recipient` table: the main-address rows of df_data, joined with
# a comma-separated list of all postal codes found for the row's location.
df_recipient = df_data.copy()
df_plz = df_plz_raw.copy()

#Only main_address
# NOTE(review): the filtered frame is modified in place below, which may
# trigger a SettingWithCopyWarning - consider adding .copy() here.
df_recipient = df_recipient[df_recipient.main_address == 1]

#Remove year
#df_recipient.drop(columns='_export_information', axis=1, inplace=True)
# errors='ignore': columns from this list that do not exist are skipped silently.
df_recipient.drop(columns=['year', 'address_expand', 'location_expand', 'name_expand'], axis=1, inplace=True, errors='ignore')

#Group PLZ by location
# One row per lowercased ORTBEZ18 value, holding all of its POSTLEITZAHL
# values joined with commas. After the groupby, ORTBEZ18 is the index.
df_plz['plz_shadow'] = df_plz['POSTLEITZAHL'].astype('str')
df_plz['ORTBEZ18'] = df_plz['ORTBEZ18'].str.lower()
df_plz = df_plz.groupby('ORTBEZ18')['plz_shadow'].agg(lambda col: ','.join(col))
df_plz = df_plz.to_frame()

#abstract index
# The recipient id is the group key.
df_recipient['id'] = df_recipient['group_key']

#Lowercase location
# Temporary join key, so that the match below ignores letter case.
df_recipient['location_lower'] = df_recipient['location'].str.lower()

#Join recipient and zip
# Left join against the ORTBEZ18 index level of df_plz; locations without a
# match keep a missing plz_shadow.
df_recipient = df_recipient.merge(right = df_plz, how='left', left_on='location_lower', right_on='ORTBEZ18')

#Convert plz
# NOTE(review): with errors='ignore' a failed conversion leaves the column
# unchanged without any message - confirm that this is intended.
df_recipient['plz'] = df_recipient['plz'].astype('int', errors='ignore')

#Drop
# Remove the helper columns and the amount columns.
df_recipient.drop(axis=1, columns=['group_key', 'location_lower', 'main_address', 'donations_grants', 'sponsorship', 'registration_fees', 'travel_accommodation', 'fees', 'related_expenses', 'total', 'source'], inplace=True)

#Add rec_zero_money = 0
df_recipient['rec_zero_money'] = 0


#rename
# NOTE(review): the rename is purely positional. It assumes exactly ten
# columns remain and that they are in this order; the last three are the
# `id`, `plz_shadow` and `rec_zero_money` columns added above, the order of
# the first seven comes from the input files - verify it when the inputs change.
df_recipient.columns = [
        'rec_name',
        'rec_location',
        'rec_country',
        'rec_address',
        'rec_plz',
        'rec_uci',
        'rec_type',
        'rec_id',
        'rec_plz_shadow',
        'rec_zero_money'
    ]

#Reorder Columns
# Column order used for the exported table.
df_recipient = df_recipient[[
        'rec_id',
        'rec_name',
        'rec_address',
        'rec_location',
        'rec_plz',
        'rec_plz_shadow',
        'rec_country',
        'rec_uci',
        'rec_zero_money',
        'rec_type'
    ]]


## table `accumulations`

In [12]:
def add_accumulation(df_new, row, column):
    """Append one accumulation row for ``row[column]`` if it holds an amount.

    Nothing is added when the value is NaN or zero. The category id is looked
    up by ``column`` in the notebook-level ``df_transaction_category`` frame.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Accumulations collected so far. It is not modified.
    row : pandas.Series or mapping
        Source record providing ``pha_id``, ``year``, ``type`` and the value
        under ``column``.
    column : str
        Name of the value field; it must also be a ``trc_name`` in
        ``df_transaction_category``.

    Returns
    -------
    pandas.DataFrame
        ``df_new`` itself when nothing was added, otherwise a new frame with
        the extra row and a fresh 0..n-1 index. ``acc_id`` is a placeholder
        (0) that the caller overwrites afterwards.

    Raises
    ------
    TypeError
        If the value is not numeric (np.isnan rejects strings).
    IndexError
        If ``column`` is not a known category name.
    """
    value = row[column]
    if np.isnan(value) or value == 0:
        return df_new

    category = df_transaction_category.loc[df_transaction_category.trc_name == column, 'trc_id']
    record = {
        'acc_id': 0,
        'acc_fk_pharma': row['pha_id'],
        'acc_year': row['year'],
        'acc_fk_transaction_category': category.iloc[0],
        'acc_value': value,
        'acc_type': row['type'],
    }

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concatenating a one-row frame is the supported equivalent.
    return pd.concat([df_new, pd.DataFrame([record])], ignore_index=True)

#Column layout of the `accumulation` table
col_names =  [
              'acc_id',
              'acc_fk_pharma',
              'acc_year',
              'acc_fk_transaction_category',
              'acc_value',
              'acc_type'
            ]

#Copy dataframe
df_accumulation = df_accumulation_raw.copy()

#NOTE: a per-column convert_float step used to sit here but was disabled, so
#the amount columns have to be numeric already (np.isnan raises a TypeError on strings).

#Select amounts (no counts)
# .copy() yields an independent frame, so the column assignment below cannot
# trigger a SettingWithCopyWarning.
df_accumulation = df_accumulation[df_accumulation['type'].isin(['hcp_amount', 'hco_amount', 'rnd'])].copy()

#Strip the suffix: hcp_amount -> hcp, hco_amount -> hco
df_accumulation['type'] = df_accumulation['type'].str.replace('_amount', '')

#Join with pharma
# Left join: rows whose `source` has no matching `pha_key` keep a missing pha_id.
df_accumulation = df_accumulation.merge(right = df_pharma, how='left', left_on='source', right_on='pha_key')

#Amount columns in the order the accumulations are emitted per row
amount_columns = [
    'donations_grants',
    'sponsorship',
    'registration_fees',
    'travel_accommodation',
    'fees',
    'related_expenses',
]

#Category name -> id, resolved once instead of once per value
category_ids = {
    name: df_transaction_category.loc[df_transaction_category.trc_name == name, 'trc_id'].iloc[0]
    for name in amount_columns + ['total']
}

# Collect plain records and build the frame once at the end. Growing a
# DataFrame row by row copies it on every step (quadratic run time) and
# relied on DataFrame.append, which pandas 2.0 removed.
accumulation_records = []

#Iter Accumulations
for index, row in df_accumulation.iterrows():
    for column in amount_columns:
        value = row[column]
        if np.isnan(value) or value == 0:
            continue
        accumulation_records.append({
            # acc_id is the 1-based position of the accumulation
            'acc_id': len(accumulation_records) + 1,
            'acc_fk_pharma': row['pha_id'],
            'acc_year': row['year'],
            'acc_fk_transaction_category': category_ids[column],
            'acc_value': value,
            'acc_type': row['type'],
        })

#Add RnD
# RnD rows additionally contribute their `total`, appended after all other rows.
for index, row in df_accumulation[df_accumulation['type'] == 'rnd'].iterrows():
    value = row['total']
    if np.isnan(value) or value == 0:
        continue
    accumulation_records.append({
        'acc_id': len(accumulation_records) + 1,
        'acc_fk_pharma': row['pha_id'],
        'acc_year': row['year'],
        'acc_fk_transaction_category': category_ids['total'],
        'acc_value': value,
        'acc_type': row['type'],
    })

df_accumulation_new = pd.DataFrame(accumulation_records, columns=col_names)

## table `pharma_source`

In [13]:
#Join with pharma
# Left join: every source row is kept, with a missing company where no key matches.
df_pharma_source = df_pharma_source_raw.copy().merge(
    right=df_pharma,
    how='left',
    left_on='phs_key',
    right_on='pha_key',
)

#drop columns
# NOTE(review): 'pha_name' is listed twice; the duplicate looks redundant.
df_pharma_source = df_pharma_source.drop(columns=['pha_name', 'pha_key', 'phs_key', 'pha_name'])

#abstract index
# Shift the index to start at 1, then turn it into the first column.
df_pharma_source.index = df_pharma_source.index + 1
df_pharma_source = df_pharma_source.reset_index(drop=False)

#Rename
# Positional rename: former index, remaining source column, company id.
df_pharma_source.columns = ['phs_id', 'phs_source', 'phs_fk_pharma']

#Reorder
df_pharma_source = df_pharma_source[['phs_id', 'phs_fk_pharma', 'phs_source']]


## table `pharma`

In [14]:
# Remove the join key before export. The cells above merge on `pha_key`, so
# they have to run before this one. errors='ignore' keeps the cell re-runnable:
# a second run no longer raises a KeyError for the already-removed column.
df_pharma = df_pharma.drop(columns=['pha_key'], errors='ignore')

## Create CSV Files

In [15]:
# (frame, target file) pairs, written in the numbered order of the file names
csv_exports = [
    (df_transaction_category, '../../data/4. database/0_transaction_category.csv'),
    (df_pharma, '../../data/4. database/1_pharma.csv'),
    (df_pharma_source, '../../data/4. database/2_pharma_source.csv'),
    (df_recipient, '../../data/4. database/3_recipient.csv'),
    (df_transaction, '../../data/4. database/4_transaction.csv'),
    (df_accumulation_new, '../../data/4. database/5_accumulation.csv'),
]

for frame, csv_path in csv_exports:
    # index=False: the row index is not written as a column
    frame.to_csv(csv_path, index=False)

## Create SQL Files
```sql
DELETE FROM accumulation WHERE acc_id > 0;
DELETE FROM pharma_source WHERE phs_id > 0;
DELETE FROM pharma WHERE pha_id > 0;
DELETE FROM recipient WHERE rec_id > 0;
DELETE FROM transaction WHERE tra_id > 0;
DELETE FROM transaction_category WHERE trc_id > 0;
```

In [16]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    An empty ``l`` yields nothing.
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

def export_sql(df, tablename, path, rows_per_insert=3000):
    """Write ``df`` to ``path`` as multi-row SQL INSERT statements.

    The file consists of ``START TRANSACTION;``, one ``INSERT INTO ... VALUES``
    statement per batch of rows, and ``COMMIT;``. Every value is written as a
    single-quoted string; missing values become the empty string ``''``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export; the column names become the backtick-quoted column list.
    tablename : str
        Target table. It is inserted verbatim (not escaped).
    path : str
        File to (over)write.
    rows_per_insert : int, optional
        Maximum number of rows per INSERT statement (default 3000).
    """
    #Columns
    columns = ', '.join("`" + name + "`" for name in df.columns)

    def quote(value):
        # Backslashes are escaped first: otherwise a value ending in a
        # backslash would swallow the closing quote, and the backslash added
        # in front of a quote would itself be doubled.
        text = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return "'" + text + "'"

    #Values
    df = df.fillna("")
    rows = ['\n(' + ', '.join(quote(value) for value in row) + ')' for row in df.values]

    #One INSERT per batch, so no single statement grows without bound
    inserts = [
        'INSERT INTO %s (%s) VALUES %s;' % (tablename, columns, ', '.join(batch))
        for batch in chunks(rows, rows_per_insert)
    ]
    all_inserts = '\n'.join(inserts)

    #Save File
    # The context manager closes the file; the former `text_file.close` lacked
    # the call parentheses and therefore never closed it.
    with open(path, "w") as text_file:
        text_file.write("START TRANSACTION;\n%s\nCOMMIT;\n" % all_inserts)

In [17]:
# (frame, table name, target file) triples, written in the numbered order of the file names
sql_exports = [
    (df_transaction_category, 'transaction_category', '../../data/4. database/0_transaction_category.sql'),
    (df_pharma, 'pharma', '../../data/4. database/1_pharma.sql'),
    (df_pharma_source, 'pharma_source', '../../data/4. database/2_pharma_source.sql'),
    (df_recipient, 'recipient', '../../data/4. database/3_recipient.sql'),
    (df_transaction, 'transaction', '../../data/4. database/4_transaction.sql'),
    (df_accumulation_new, 'accumulation', '../../data/4. database/5_accumulation.sql'),
]

for frame, table, sql_path in sql_exports:
    export_sql(frame, table, sql_path)

In [18]:
#Concat files
filenames = ['../../data/4. database/0_transaction_category.sql',
             '../../data/4. database/1_pharma.sql',
             '../../data/4. database/2_pharma_source.sql',
             '../../data/4. database/3_recipient.sql',
             '../../data/4. database/4_transaction.sql',
             '../../data/4. database/5_accumulation.sql'
             ]
with open('../../data/4. database/data_dump.sql', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)