# Import Modules

In [1]:
import json  
import pandas as pd  
from pandas.io.json import json_normalize  
import numpy as np
import time

# Data Load

In [2]:
def loading_data(temp):
    data = pd.DataFrame()
    data['safetyreportid'] = temp['safetyreportid']  # read 'safetyreportid' column
    data['country'] = temp['occurcountry']  # read 'country' column
    
    data['patient.reaction'] = temp['patient.reaction'].apply(lambda x: x) # read 'patient.reaction' column
    stack = data.apply(lambda x: pd.Series(x['patient.reaction']), axis=1).stack().reset_index(level=1, drop=True) # split patient.reaction(list) to each different row
    stack.name = 'patient.reaction' # set the name
    data = data.drop('patient.reaction', axis=1) # drop patient.reaction column
    data = data.join(stack)
        
    data['reactionmeddrapt'] = data['patient.reaction'].apply(lambda x: x['reactionmeddrapt'])  # read 'reactionmeddrapt' from 'patient.reaction'
    data.drop('patient.reaction', axis = 1, inplace=True)  # remove unnecessary columns
    return data
    

In [3]:
start = time.time() 

df = pd.DataFrame()
for i in range(1,134):
    a = 'data (' + str(i) + ').json'
    with open(a) as f:
        d = json.load(f) 
    result = json_normalize(d['results']) 
    temp = loading_data(result)
    df = pd.concat([df, temp], sort=False)

    
end = time.time()
(end - start)/60

66.9860805273056

In [5]:
# df.to_csv('C:\AstraZeneca\df.csv')

In [60]:
# df = pd.read_csv('C:\AstraZeneca\df.csv')

# Data Cleansing

In [63]:
# Drop missing value

df.dropna(inplace=True)

### Aggregation with the number of cases by a country

In [64]:
cnt_cntr = pd.DataFrame({'total_case': df['country'].value_counts()})
df = pd.DataFrame(df.groupby(['country','reactionmeddrapt']).size(), columns=['count']).reset_index('reactionmeddrapt')
df = df.join(cnt_cntr)

### Add percent of cases by a country

In [67]:
df['percent'] = df['count'] / df['total_case']

In [68]:
df = df.reset_index().rename(columns={'index': 'country'})

### Add rank by a country

In [77]:
df['rank'] = df.groupby('country')['percent'].rank("dense", ascending=False)

In [80]:
# Final data

df.to_csv('C:/AstraZeneca/table.csv')