# Import Modules

In [2]:
import json  
import pandas as pd  
from pandas.io.json import json_normalize  
import numpy as np
import time

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Data Load

In [10]:
def loading_data(temp):
    data = pd.DataFrame()
    data['safetyreportid'] = temp['safetyreportid']  # read 'safetyreportid' column
        
    patientdrug = temp['patient.drug'].apply(lambda x: x[0])  # read 'patientdrug' column
    data['openfda'] = patientdrug.apply(lambda x: x['openfda'] if 'openfda' in x.keys() else np.nan) # read 'openfda' line from dictionary
    data['generic_name'] = data['openfda'].apply(lambda x: x if type(x) != dict else (x['generic_name'] if 'generic_name' in x.keys() else np.nan)) # read 'generic_name' line from 'openfda' 
     
    stack = data.apply(lambda x: pd.Series(x['generic_name']), axis=1).stack().reset_index(level=1, drop=True) # split generic_name(list) to each different row
    stack.name = 'generic_name' # set the name
    data = data.drop('generic_name', axis=1) # drop generic_name column
    data = data.join(stack)
        
    data.drop('openfda', axis = 1, inplace=True)  # remove unnecessary columns
    return data
    

In [18]:
start = time.time() 

df = pd.DataFrame()
for i in range(1,134):
    a = 'data (' + str(i) + ').json'
    with open(a) as f:
        d = json.load(f) 
    result = json_normalize(d['results']) 
    temp = loading_data(result)
    df = pd.concat([df, temp], sort=False)

    
end = time.time()
(end - start)/60

65.46581843694051

In [20]:
# df.to_csv('C:/AstraZeneca/market.csv')

In [141]:
# df = pd.read_csv('C:/AstraZeneca/market.csv', index_col=0)

# Data Cleansing

### Missing Values

In [142]:
# Drop Missing Values
df.dropna(inplace = True)

### String Manipulation

In [143]:
# Some data have 'comma' and 'AND' in  a generic_name column. Replace 'AND' to 'comma' to make it List.

df['generic_name'] = df['generic_name'].apply(lambda x: x.replace(", AND", ","))   
df['generic_name'] = df['generic_name'].apply(lambda x: x.replace(" AND", ","))   

In [144]:
# Change string to List and split each element into different rows

df = pd.DataFrame(df['generic_name'].str.split(',').tolist(), index = df['safetyreportid']).stack().reset_index().drop('level_1', axis=1)

# Rename columns

df.columns = ['safetyreportid', 'generic_name']

# Remove space to make it clear (Some data have space while others don't have it in spite of the same generic name)

df['generic_name'] = df['generic_name'].apply(lambda x: x.replace(" ", ""))   

# Pivot Table for Association Analysis

In [145]:
# Add vaue columns for aggregation

df['values'] = 1

In [146]:
# Count number of generic name by an ID

id_cnt = df['safetyreportid'].value_counts()
df.set_index('safetyreportid', inplace = True)

# Bring data having two or more generic names because data having only one generic name is meaningless in association
df = df[id_cnt > 1].reset_index()

  import sys


In [147]:
# Group by to make it smaller

df = df.groupby(['safetyreportid','generic_name'])['values'].sum().reset_index()

In [148]:
# Pivot Table

table = pd.pivot_table(df, values='values', index=['safetyreportid'],
                    columns=['generic_name'], aggfunc=np.sum).fillna(0)

In [150]:
# Encode_units function to have only 0 or 1

def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1    

In [151]:
table = table.applymap(encode_units)

In [152]:
# Final table
table.head(20)

generic_name,Unnamed: 1_level_0,(COAGULATIONFACTORIX(RECOMBINANT),(DAUNORUBICIN,(METHYLPHENIDATEHYDROCHLORIDE),(SALINE),.ALPHA.-TOCOPHEROL,.ALPHA.-TOCOPHEROLACETATE,.BETA.-CAROTENE,0.01%,0.02%/0.005%,...,ZINCOXIDE,ZINCOXIDE8%,ZINCOXIDESUNSCREEN,ZINCOXIDESUNSREEN,ZINCSULFATE,ZINCUMBROMATUM,ZINCUMMETALLICUM,ZINCVALERATEDIHYDRATE,ZIPRASIDONEHCL,ZIPRASIDONEHYDROCHLORIDE
safetyreportid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4154661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5946536,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6056280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6075952,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6089495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6108212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6117053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6140824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6158684,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6179494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Association Analysis

In [153]:
frequent_items = apriori(table, min_support=0.01, use_colnames=True)

In [155]:
# Count number of itemsets

frequent_items_list = frequent_items
frequent_items_list['itemsets'] = frequent_items_list['itemsets'].apply(lambda x: list(x))
frequent_items_list['cnt'] = frequent_items_list['itemsets'].apply(lambda x: len(x))

In [156]:
# Check the result having two or more itemsets sorted by a value of support

frequent_items_list[frequent_items_list['cnt'] > 1].sort_values('support', ascending = False)

Unnamed: 0,support,itemsets,cnt
91,0.069959,"[SACUBITRIL, VALSARTAN]",2
64,0.067445,"[AVOBENZONE, OCTOCRYLENE]",2
63,0.067383,"[OCTISALATE, AVOBENZONE]",2
89,0.067366,"[OCTISALATE, OCTOCRYLENE]",2
96,0.067366,"[OCTISALATE, AVOBENZONE, OCTOCRYLENE]",3
87,0.049562,"[METHOTREXATESODIUM, METHOTREXATE]",2
78,0.040119,"[ESOMEPRAZOLESODIUM, ESOMEPRAZOLEMAGNESIUM]",2
84,0.026827,"[IBUPROFEN, IBUPROFEN200MG]",2
82,0.026827,"[IBUPFROFEN, IBUPROFEN]",2
83,0.026827,"[IBUPFROFEN, IBUPROFEN200MG]",2


In [158]:
# Association Rules to check Lift and Confidence

rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values('lift', ascending = False).head(30)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
21,(CITALOPRAMHYDROBROMIDE),(CITALOPRAM),0.010023,0.010023,0.010023,1.0,99.774632,0.009922,inf
20,(CITALOPRAM),(CITALOPRAMHYDROBROMIDE),0.010023,0.010023,0.010023,1.0,99.774632,0.009922,inf
67,(SILDENAFILCITRATE),(SILDENAFIL),0.010516,0.010516,0.010516,1.0,95.090124,0.010406,inf
66,(SILDENAFIL),(SILDENAFILCITRATE),0.010516,0.010516,0.010516,1.0,95.090124,0.010406,inf
50,(LAMOTRIGINECHEWABLEDISPERSIBLE),(LAMOTRIGINE),0.010868,0.010868,0.010868,1.0,92.011488,0.01075,inf
51,(LAMOTRIGINE),(LAMOTRIGINECHEWABLEDISPERSIBLE),0.010868,0.010868,0.010868,1.0,92.011488,0.01075,inf
12,(LEVODOPA),(CARBIDOPA),0.011203,0.011203,0.011203,1.0,89.261398,0.011078,inf
13,(CARBIDOPA),(LEVODOPA),0.011203,0.011203,0.011203,1.0,89.261398,0.011078,inf
61,(RANITIDINE),(RANITIDINEHYDROCHLORIDE),0.011243,0.01122,0.01122,0.997981,88.945987,0.011094,489.693257
60,(RANITIDINEHYDROCHLORIDE),(RANITIDINE),0.01122,0.011243,0.01122,1.0,88.945987,0.011094,inf
