# Summarize Pharma COUNT and create xlsx-Files

In [1]:
import pandas as pd
import mysql.connector
import sys
sys.path.insert(0, '../../data/lib/')
import consts
import mysql.connector

In [2]:
# Reporting year; the transaction query below filters on `tra_year` with this value.
year = 2019

## Load DB

In [3]:
# Connection to the local MySQL database that holds the transaction data.
db_connection = mysql.connector.connect(
    host="localhost",
    user="root",
    passwd="",
    database="pharmagelder",
)

# One row per (recipient, pharma company) pair with the summed transaction
# value for `year`. The year is handed to the driver as a bound parameter
# (`params=`) instead of being formatted into the SQL text, so quoting and
# type conversion are done by the connector rather than by string building.
df_recipient = pd.read_sql(
    """SELECT
  rec_id as id,
  rec_name as name,
  rec_address as address,
  rec_location as location,
  rec_type as type,
  sum(tra_value) as value,
  pha_name as pharma
 FROM transaction
  LEFT JOIN transaction_category ON tra_fk_transaction_category = transaction_category.trc_id
  LEFT JOIN recipient ON tra_fk_recipient = recipient.rec_id
  LEFT JOIN pharma ON tra_fk_pharma = pharma.pha_id
  WHERE tra_year = %s
  GROUP BY tra_fk_recipient, tra_fk_pharma
  ORDER BY rec_name DESC""",
    con=db_connection,
    params=(year,),
)

In [4]:
# Per pharma company and recipient type: how many recipient rows there are
# ('count') and the sum of their values ('total').
df_db = (
    df_recipient
    .groupby(['pharma', 'type'])
    .agg({'name': 'count', 'value': 'sum'})
    .reset_index()
    .rename(columns={'name': 'count', 'value': 'total'})
)
df_db.head()

Unnamed: 0,pharma,type,count,total
0,Nordic Pharma,hco,1,2340.0
1,Nordic Pharma,hcp,1,638.0
2,A. Menarini AG,hco,113,1388257.0
3,A. Menarini AG,hcp,222,229002.0
4,AbbVie,hco,148,5728593.0


## Load Count

In [5]:
# Load the cleaned accumulations CSV. The following cell reads the columns
# `type`, `source` and `amount` from it, plus the per-category value columns
# (`donations_grants`, `sponsorship`, `registration_fees`,
# `travel_accommodation`, `fees`, `related_expenses`).
df_acc = pd.read_csv('../../data/3. transformation/2_accumulations_cleaned.csv', encoding='UTF-8')

In [6]:
# Company list used to translate the `source` key into the company name.
df_pha = pd.read_csv('../6. database export/sources/liste_companies.csv')

# Accumulation rows without the 'rnd' type, joined with the pharma names;
# `amount` becomes `total` and `pha_name` becomes `pharma`.
df_acc_count = (
    df_acc[df_acc['type'] != 'rnd']
    .merge(df_pha, how='left', left_on='source', right_on='pha_key')
    .drop(columns=['pha_key', 'source'])
    .rename(columns={'amount': 'total', 'pha_name': 'pharma'})
)

# NaN list: companies that have a count row (hcp_count / hco_count) with no
# total although the value columns of that row add up to more than zero.
df_nan = df_acc_count[
    df_acc_count['type'].isin(['hcp_count', 'hco_count'])
    & df_acc_count['total'].isna()
].copy()
df_nan = df_nan[
    df_nan[[
        'donations_grants',
        'sponsorship',
        'registration_fees',
        'travel_accommodation',
        'fees',
        'related_expenses',
    ]].sum(axis=1) > 0
]
df_nan = (
    df_nan[['pharma']]
    .drop_duplicates(subset=['pharma'])
    .assign(is_nan=True)
)

# Reshape: split the combined type into a category ('amount' / 'count') and a
# recipient type ('hcp' / 'hco'), then pivot so every (pharma, type) pair has
# one column per category. `aggfunc` is left at the pandas default.
# The category must be derived before `type` is overwritten.
df_acc_count['category'] = (
    df_acc_count['type']
    .isin(['hcp_amount', 'hco_amount'])
    .map({True: 'amount', False: 'count'})
)
df_acc_count['type'] = (
    df_acc_count['type']
    .isin(['hcp_amount', 'hcp_count'])
    .map({True: 'hcp', False: 'hco'})
)
df_acc_count = df_acc_count.pivot_table(
    index=['pharma', 'type'],
    columns='category',
    values='total',
).reset_index()


In [7]:
# Show the companies flagged above: those with an hcp_count/hco_count row whose
# total is NaN while its value columns sum to more than zero.
df_nan

Unnamed: 0,pharma,is_nan
57,Daiichi Sankyo,True
66,Eli Lilly SA,True
113,A. Menarini AG,True
149,Novartis,True
162,Otsuka Seiyaku,True
179,Sandoz Pharmaceuticals AG,True
184,Sanofi,True
188,Servier,True
195,Shire Pharmaceuticals,True


In [8]:
# Stack the database summary and the accumulated figures, add up 'count' and
# 'total' per (pharma, type), then attach the NaN flag per company.
# NOTE(review): the pivoted accumulations carry their money values in an
# 'amount' column, which is not part of this aggregation - confirm that the
# accumulated amounts are intentionally left out of 'total'.
df_new = (
    pd.concat([df_db, df_acc_count], sort=False)
    .groupby(['pharma', 'type'])
    .agg({'count': 'sum', 'total': 'sum'})
    .reset_index()
    .merge(df_nan, how='left', on='pharma')
)

# Average per recipient, rounded to whole numbers; left empty for companies
# flagged in the NaN list.
df_new.loc[df_new['is_nan'].ne(True), 'avg'] = (df_new['total'] / df_new['count']).round()

df_new.head()

Unnamed: 0,pharma,type,count,total,is_nan,avg
0,Nordic Pharma,hco,1.0,2340.0,,2340.0
1,Nordic Pharma,hcp,1.0,638.0,,638.0
2,A. Menarini AG,hco,114.0,1388257.0,True,
3,A. Menarini AG,hcp,222.0,229002.0,True,
4,AbbVie,hco,148.0,5728593.0,,38707.0


In [9]:
# Write to Excel
# The context manager saves and closes the workbook when the block exits, even
# if `to_excel` raises. `ExcelWriter.save()` and the `options` keyword are no
# longer available in pandas 2.x; no encoding option is needed for xlsx output.
with pd.ExcelWriter('../../data/99. analyzes/excel/pharmas/anzahl_empfaenger.xlsx') as writer:
    # Sheet name is passed by keyword; the positional form is deprecated.
    df_new.to_excel(writer, sheet_name='data', index=True)