In [1]:
import os
import shutil
import sys
import warnings
from io import BytesIO
from urllib.parse import urljoin
from urllib.request import urlopen
from urllib.request import urlretrieve
from zipfile import ZipFile

import boto3
import numpy as np
import pandas as pd
import requests
from boto3.s3.transfer import S3Transfer
from bs4 import BeautifulSoup
from bs4 import FeatureNotFound
# import luigi

In [2]:
def extractZip(url, source_dir, data_dir):
    """Download a ZIP archive, unpack it, and collect its CSV files.

    Parameters
    ----------
    url : str
        Location of the ZIP archive to download.
    source_dir : str
        Directory the archive is extracted into.
    data_dir : str
        Directory that ``moveFile`` moves the extracted ``.csv`` files into.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the connection or a read stalls beyond the configured timeout.
    zipfile.BadZipFile
        If the downloaded payload is not a valid ZIP archive.
    """
    # (connect, read) timeout: without one a stalled server blocks the notebook forever.
    response = requests.get(url, timeout=(10, 300))
    # Fail on HTTP errors here instead of feeding an error page to ZipFile.
    response.raise_for_status()

    # The whole archive is held in memory; the context manager releases it promptly.
    with ZipFile(BytesIO(response.content)) as archive:
        archive.extractall(source_dir)

    moveFile(source_dir, data_dir)

In [3]:
def moveFile(source_dir, data_dir):
    """Move every ``.csv`` file found under ``source_dir`` into ``data_dir``.

    The source tree is walked recursively; files are flattened into
    ``data_dir`` (sub-directory structure is not preserved). Relative paths
    are resolved against the current working directory.

    Parameters
    ----------
    source_dir : str
        Directory tree to search for ``.csv`` files.
    data_dir : str
        Destination directory; created if it does not exist.

    Notes
    -----
    A file already present in ``data_dir`` under the same name is replaced,
    so the function can be re-run after a fresh extraction.
    """
    # os.path.abspath uses the platform separator; a hard-coded '\\' only works on Windows.
    source_root = os.path.abspath(source_dir)
    target_dir = os.path.abspath(data_dir)
    os.makedirs(target_dir, exist_ok=True)

    for root, _dirs, files in os.walk(source_root, topdown=False):
        for name in files:
            if name.endswith('.csv'):
                # Passing the full destination file path (not just the directory)
                # lets an existing copy be overwritten instead of raising shutil.Error.
                shutil.move(os.path.join(root, name), os.path.join(target_dir, name))

In [4]:
def getFileLinks():
    """Download the FAERS CSV archives linked from the NBER listing page.

    Creates the ``FAERSsrc`` and ``FAERSdata`` directories if needed, scans
    each listing page for anchors whose text contains ``"csv"``, and calls
    ``extractZip`` for every link whose URL mentions one of the wanted years
    (2014-2018) and one of the wanted tables (demo, drug, reac, outc).

    Network errors from ``urlopen`` propagate to the caller.
    """
    source_dir = "FAERSsrc"
    data_dir = "FAERSdata"
    target_pages = ["http://www.nber.org/data/fda-adverse-event-reporting-system-faers-data.html"]
    wanted_years = ("2014", "2015", "2016", "2017", "2018")
    wanted_tables = ("demo", "drug", "reac", "outc")

    for directory in (source_dir, data_dir):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    for page_url in target_pages:
        # Fetch once and close the connection; the parser fallback below
        # re-parses the same bytes rather than hitting the network again.
        with urlopen(page_url) as response:
            html = response.read()

        try:
            page_bs = BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            # lxml is not installed: use the parser bundled with Python.
            page_bs = BeautifulSoup(html, "html.parser")

        for anchor in page_bs.find_all("a"):
            if "csv" not in str(anchor.string):
                continue

            href = anchor.get("href")
            if not href:
                # Anchor without a target; nothing to download.
                continue

            # urljoin resolves root-relative, relative and absolute hrefs alike,
            # whereas plain string concatenation only works for root-relative ones.
            file_url = urljoin(page_url, href)
            has_year = any(year in file_url for year in wanted_years)
            has_table = any(table in file_url for table in wanted_tables)
            if has_year and has_table:
                extractZip(file_url, source_dir, data_dir)

In [12]:
def _readFaersCsv(path):
    """Read one FAERS CSV, skipping (with a warning) lines that cannot be parsed."""
    try:
        # pandas >= 1.3 spelling.
        return pd.read_csv(path, low_memory=False, sep=",", on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know ``on_bad_lines``; use the legacy keyword.
        return pd.read_csv(path, low_memory=False, sep=",", error_bad_lines=False)


def _keepSingletonIds(frame):
    """Keep only the rows whose ``primaryid`` occurs exactly once in ``frame``."""
    keyed = frame[pd.notnull(frame['primaryid'])]
    # Vectorised equivalent of groupby('primaryid').filter(lambda g: len(g) == 1).
    return keyed[~keyed.duplicated(subset='primaryid', keep=False)]


def _combineFrames(frames, required_columns):
    """Concatenate ``frames`` once and guarantee that ``required_columns`` exist."""
    if not frames:
        return pd.DataFrame(columns=required_columns)
    combined = pd.concat(frames, ignore_index=True, sort=False)
    for column in required_columns:
        if column not in combined.columns:
            # Object dtype so the later string fillna() does not need an upcast.
            combined[column] = pd.Series(index=combined.index, dtype=object)
    extras = [column for column in combined.columns if column not in required_columns]
    return combined[list(required_columns) + extras]


def getCombinedFile():
    """Merge the FAERS demo/drug/reaction/outcome CSVs into ``MergedFile.csv``.

    Every file in ``./FAERSdata`` whose name contains ``demo``, ``drug``,
    ``reac`` or ``outc`` is read and filtered:

    * demo: rows with ``wt_cod == 'KG'`` and a non-null ``age``;
    * drug: rows with non-null ``dose_amt`` and ``dose_unit`` and
      ``role_cod == 'PS'``;
    * reaction / outcome: rows whose ``primaryid`` appears exactly once in
      the file.

    Drug rows are left-joined to demo rows and then inner-joined to reaction
    and outcome rows on ``(primaryid, caseid)``. Missing values in selected
    categorical columns are replaced with fixed defaults, and the result is
    written to ``MergedFile.csv`` in the current working directory.

    Notes
    -----
    Earlier revisions sliced each filtered frame with ``[1:]``, which dropped
    the first valid record of every file (``read_csv`` already consumes the
    header row). That slice has been removed, so the output contains those
    records. Files are processed in sorted name order for reproducible output.
    """
    directoryPath = os.path.join(os.getcwd(), "FAERSdata")

    demo_columns = ['primaryid', 'caseid', 'mfr_dt', 'init_fda_dt', 'rept_cod', 'mfr_num', 'mfr_sndr', 'age',
                    'sex', 'wt', 'wt_cod', 'occp_cod', 'occr_country']
    drug_columns = ['primaryid', 'caseid', 'role_cod', 'drugname', 'route', 'dose_amt', 'dose_unit',
                    'dose_form', 'dose_freq']
    reaction_columns = ['primaryid', 'caseid', 'pt']
    outcome_columns = ['primaryid', 'caseid', 'outc_cod']

    # Collect per-file frames and concatenate once per table: DataFrame.append
    # in a loop is quadratic and no longer exists in pandas 2.x.
    demo_frames, drug_frames, reaction_frames, outcome_frames = [], [], [], []

    print("Reading files and creating dataframes for each file type!!", "\n")
    for filename in sorted(os.listdir(directoryPath)):
        file_path = os.path.join(directoryPath, filename)
        if "demo" in filename:
            demo_df = _readFaersCsv(file_path).drop(
                ['caseversion', 'i_f_code', 'lit_ref', 'event_dt', 'auth_num', 'fda_dt', 'age_cod', 'age_grp',
                 'e_sub', 'rept_dt', 'to_mfr', 'reporter_country'], axis=1, errors='ignore')
            demo_df = demo_df.loc[demo_df['wt_cod'] == 'KG']
            demo_df = demo_df[pd.notnull(demo_df['age'])]
            demo_frames.append(demo_df)
        if "drug" in filename:
            drug_df = _readFaersCsv(file_path).drop(
                ['drug_seq', 'val_vbm', 'dose_vbm', 'cum_dose_chr', 'prod_ai', 'cum_dose_unit', 'dechal',
                 'rechal', 'lot_num', 'exp_dt', 'nda_num'], axis=1, errors='ignore')
            drug_df = drug_df[pd.notnull(drug_df['dose_amt'])]
            drug_df = drug_df[pd.notnull(drug_df['dose_unit'])]
            drug_df = drug_df.loc[drug_df['role_cod'] == 'PS']
            drug_frames.append(drug_df)
        if "reac" in filename:
            reaction_frames.append(_keepSingletonIds(_readFaersCsv(file_path)))
        if "outc" in filename:
            outcome_frames.append(_keepSingletonIds(_readFaersCsv(file_path)))

    demo = _combineFrames(demo_frames, demo_columns)
    drug = _combineFrames(drug_frames, drug_columns)
    reaction = _combineFrames(reaction_frames, reaction_columns)
    outcome = _combineFrames(outcome_frames, outcome_columns)

    print("Dataframes created. Starting wrangling and combining files!!", "\n")
    join_keys = ['primaryid', 'caseid']
    merged = pd.merge(drug, demo, on=join_keys, how='left')
    merged['sex'] = merged['sex'].fillna('NS')
    merged = pd.merge(merged, reaction, on=join_keys, how='inner')
    merged = pd.merge(merged, outcome, on=join_keys, how='inner')
    merged = merged.drop(['drug_rec_act'], axis=1, errors='ignore')

    # Placeholder written wherever the corresponding column has no value.
    fill_defaults = {
        'occp_cod': 'OT',
        'rept_cod': 'EXP',
        'mfr_sndr': 'Others',
        'route': 'Unknown',
        'dose_form': 'Others',
        'dose_freq': 'Others',
    }
    for column, default in fill_defaults.items():
        merged[column] = merged[column].fillna(default)

    merged.to_csv('MergedFile.csv', header=True, index=False)
    print("Combined file created", "\n")

In [11]:
def fileUploadToS3(AWS_ACCESS_KEY, AWS_SECRET_KEY):
    """Upload every CSV in the current directory to the project's S3 bucket.

    Parameters
    ----------
    AWS_ACCESS_KEY : str
        Access key id passed to ``boto3.client``.
    AWS_SECRET_KEY : str
        Secret access key passed to ``boto3.client``.

    The bucket ``team1finalproject-faers`` is created first when its name is
    not among those returned by ``list_buckets()``. Each file in ``./`` whose
    name contains ``.csv`` is uploaded under its own file name as the key.
    """
    s3_client = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
    uploader = S3Transfer(s3_client)

    known_buckets = [entry['Name'] for entry in s3_client.list_buckets()["Buckets"]]

    bucket_name = 'team1finalproject-faers'
    target_dir = './'
    csv_names = [entry for entry in os.listdir(target_dir) if '.csv' in entry]

    # Only the announcement and the bucket creation differ between the two
    # cases; the upload itself is the same.
    if bucket_name in known_buckets:
        print('Bucket already exists!!', '\n')
        print('Combined File upload started to s3!!!!!', '\n')
    else:
        print('Bucket not present. Creating bucket!!', '\n')
        s3_client.create_bucket(Bucket=bucket_name)
        print('File upload started to s3!!!!!', '\n')

    for csv_name in csv_names:
        uploader.upload_file(os.path.join(target_dir, csv_name), bucket_name, csv_name)
    print('File uploaded to s3!!!!!','\n')

In [13]:
if __name__ == '__main__':
    # Pipeline stages, in order. Only the merge step is enabled; uncomment the
    # other calls to download the source archives or to upload the results.
    #getFileLinks()
    getCombinedFile()
    # SECURITY: never write AWS credentials into the notebook. Read them from
    # the environment instead. A literal key pair was previously embedded on
    # this line; it should be treated as compromised and rotated.
    #fileUploadToS3(os.environ['AWS_ACCESS_KEY_ID'], os.environ['AWS_SECRET_ACCESS_KEY'])

Reading files and creating dataframes for each file type!! 



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Dataframes created. Starting wrangling and combining files!! 

Combined file created 

