In [1]:
!pip install --quiet tinydb

[33mYou are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
from tinydb import TinyDB
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

#  Raw data
According to 'https://open.fda.gov/apis/drug/event/searchable-fields', there are two fields associated with country information:
- occurcountry : The name of the country where the event occurred
- primarysourcecountry : Country of the reporter of the event

Here we use the `occurcountry` field, but future analyses could also look at `primarysourcecountry` and at the relationship between the two fields.

# Read every stored record, using TinyDB as a context manager so the
# underlying JSON file handle is closed once the records are in memory.
with TinyDB('db_parsed_reaction_by_country.json') as db_reaction_by_country:
    records_reaction_by_country = db_reaction_by_country.all()

# Build a DataFrame from the records and drop exact duplicate rows.
records_pd_reaction_by_country_pd = pd.DataFrame(records_reaction_by_country).drop_duplicates()
# The printed value is the (rows, columns) shape of the de-duplicated frame.
print('number of unique records', records_pd_reaction_by_country_pd.shape)

#  Process data
Convert the collection of `reactionmeddrapt` documents (field defined in `https://open.fda.gov/apis/drug/event/searchable-fields`) to a matrix of token counts.
Setting `min_df=3000` keeps only the terms that appear in at least 3000 records, which gives us the most frequently reported adverse effects.

In [8]:
# Binary presence/absence vectoriser over the adverse-effect terms.
# - min_df=3000 drops terms that occur in fewer than 3000 documents.
# - token_pattern keeps runs of word characters, spaces and hyphens together,
#   so multi-word terms such as "back pain" remain a single token.
# - dtype uses the builtin `bool`: the `np.bool` alias was deprecated in
#   NumPy 1.20 and removed in 1.24, where it raises AttributeError.
vectoriser_adverse_effect = CountVectorizer(min_df=3000,
                                            binary=True,
                                            token_pattern=r"(?u)[\w -]+",
                                            dtype=bool)
matrix1_reactionmeddrapt = vectoriser_adverse_effect.fit_transform(
    records_pd_reaction_by_country_pd['reactionmeddrapt'])
# Densify the sparse result so single columns can be sliced out later.
X_reactionmeddrapt = matrix1_reactionmeddrapt.toarray()

#  Results
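Using filtering, list adverse effects commonly reported in different countries: an effect is kept when it appears in more than 5% of a country's records for at least one but fewer than five countries.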

In [21]:
# Thresholds used by the filter below.
MIN_COUNTRY_SHARE = 0.05  # an effect is "frequent" in a country above this share of its records
MAX_COUNTRIES = 5         # keep effects that are frequent in fewer than this many countries

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back to
# the old one on older installations. Either way the result is a plain list.
if hasattr(vectoriser_adverse_effect, 'get_feature_names_out'):
    adverse_effect = list(vectoriser_adverse_effect.get_feature_names_out())
else:
    adverse_effect = vectoriser_adverse_effect.get_feature_names()

adverse_effect_filt = []
countries = []
for i, effect_name in enumerate(adverse_effect):
    # Row-normalised cross-tabulation: for each country, the share of its
    # records in which this adverse effect is absent (False) / present (True).
    cross_tab = pd.crosstab(records_pd_reaction_by_country_pd['occurcountry'],
                            X_reactionmeddrapt[:, i],
                            normalize='index')
    # Select the "present" column by its boolean label instead of relying on
    # the integer key 1 happening to compare equal to True.
    share_present = cross_tab[True]
    frequent_ind = share_present > MIN_COUNTRY_SHARE
    n_frequent = frequent_ind.sum()
    # Keep effects that are frequent in at least one country but not in many.
    if 0 < n_frequent < MAX_COUNTRIES:
        countries.append(' '.join(share_present[frequent_ind].index))
        adverse_effect_filt.append(effect_name)

In [22]:
# Summary table: one row per retained adverse effect, alongside the
# space-separated codes of the countries in which it is frequent.
df = pd.DataFrame(
    data=dict(countries=countries, adverse_effect=adverse_effect_filt),
)

In [23]:
# Display the summary table as the cell's rich output.
df

Unnamed: 0,countries,adverse_effect
0,AL KH,abdominal discomfort
1,SD,abdominal distension
2,VE,adverse event
3,BD ME,alopecia
4,HT MT,atrial fibrillation
5,AD PY,back pain
6,AZ PS,blood glucose increased
7,AZ PA,blood pressure increased
8,PF,bronchitis
9,BF,chest discomfort


In [25]:
# Persist the summary table next to the notebook. With the default settings
# the row index is written too, as an unnamed first column.
df.to_csv(path_or_buf='adverse_effects_by_countries.csv')