In [1]:
import os
import shutil
import sys
import warnings
from io import BytesIO
from urllib.parse import urljoin
from urllib.request import urlopen
from urllib.request import urlretrieve
from zipfile import ZipFile

import boto3
import numpy as np
import pandas as pd
import requests
from boto3.s3.transfer import S3Transfer
from bs4 import BeautifulSoup
from bs4 import FeatureNotFound

In [10]:
def downloadFiles(download_url, source_dir, data_dir):
    """Download one zip archive, unpack it, and relocate the CSV files it contains.

    Parameters
    ----------
    download_url : str
        URL of the zip archive to fetch.
    source_dir : str
        Directory the archive is extracted into.
    data_dir : str
        Directory the extracted ``.csv`` files are moved to (via ``moveFile``).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    response = requests.get(download_url, timeout=60)
    # Fail on HTTP errors here; otherwise an HTML error page would be handed to
    # ZipFile and surface as a confusing BadZipFile.
    response.raise_for_status()

    # The whole archive is held in memory; the context manager releases it.
    with ZipFile(BytesIO(response.content)) as archive:
        archive.extractall(source_dir)

    moveFile(source_dir, data_dir)

In [3]:
def moveFile(source_dir, data_dir):
    """Move every ``.csv`` file found under ``source_dir`` into ``data_dir``.

    Both directories are resolved against the current working directory. The
    source tree is searched recursively; the files land flat (no
    sub-directories) in the target directory.

    Parameters
    ----------
    source_dir : str
        Directory tree to search for ``.csv`` files.
    data_dir : str
        Directory that receives the files. Created if it does not exist.
    """
    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash yields a non-existent path on POSIX, so os.walk finds nothing.
    source_root = os.path.normpath(os.path.join(os.getcwd(), source_dir))
    target_root = os.path.normpath(os.path.join(os.getcwd(), data_dir))

    # Without an existing target directory shutil.move would rename the first
    # CSV to a *file* called `data_dir`.
    os.makedirs(target_root, exist_ok=True)

    for root, _dirs, files in os.walk(source_root, topdown=False):
        for name in files:
            if name.endswith('.csv'):
                # Naming the destination file explicitly lets a re-run replace
                # an older copy; moving into a directory raises shutil.Error
                # when the name already exists there.
                shutil.move(os.path.join(root, name), os.path.join(target_root, name))

In [8]:
def getFileLinks():
    """Scrape the NBER FAERS page and download the wanted CSV zip archives.

    Ensures the ``FAERSsrc`` (extraction) and ``FAERSdata`` (final CSV)
    directories exist, then walks every anchor on the target page. An anchor
    is downloaded when its text contains ``csv`` and its URL mentions one of
    the years 2014-2018 and one of the tables demo/drug/reac/outc. Each match
    is handed to ``downloadFiles``.
    """
    source_dir = "FAERSsrc"
    data_dir = "FAERSdata"
    target_page = ["http://www.nber.org/data/fda-adverse-event-reporting-system-faers-data.html"]
    wanted_years = ("2018", "2017", "2016", "2015", "2014")
    wanted_tables = ("demo", "drug", "reac", "outc")

    if not os.path.isdir(source_dir):
        print('Creating Source Directory!!', '\n')
        os.makedirs(source_dir)
        print('Source Directory Created!!', '\n')
    else:
        # Only report "already present" when nothing had to be created.
        print('Source Directory already present!!', '\n')

    if not os.path.isdir(data_dir):
        print('Creating Target Directory!!', '\n')
        os.makedirs(data_dir)
        print('Target Directory Created!!', '\n')
    else:
        print('Target Directory already present!!', '\n')

    for page_url in target_page:
        print('Opening target page!!', '\n')
        # Fetch once; the parser fallback below then needs no second request.
        with urlopen(page_url) as page:
            html = page.read()
        try:
            soup = BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            # lxml is not installed: use the parser bundled with Python.
            # Network and other errors are deliberately not caught here.
            soup = BeautifulSoup(html, "html.parser")

        print('Downloading Zip files!!', '\n')

        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            # Anchors without an href (named anchors) cannot be downloaded.
            if not href or "csv" not in str(anchor.string):
                continue

            # urljoin resolves root-relative, relative and absolute hrefs alike.
            download_url = urljoin(page_url, href)
            if (any(year in download_url for year in wanted_years)
                    and any(table in download_url for table in wanted_tables)):
                downloadFiles(download_url, source_dir, data_dir)
        print('Files Downloaded, Unzipped and moved to Target Directory!!', '\n')

In [5]:
def _read_faers_csv(path):
    """Read one comma-separated FAERS file, skipping malformed lines."""
    try:
        # `on_bad_lines` replaced `error_bad_lines` (removed in pandas 2.0).
        return pd.read_csv(path, low_memory=False, sep=",", on_bad_lines="skip")
    except TypeError:
        # Older pandas releases do not know `on_bad_lines`.
        return pd.read_csv(path, low_memory=False, sep=",", error_bad_lines=False)


def _clean_demo(frame):
    """Drop unused demographic columns; keep rows weighed in KG with a known age."""
    frame = frame.drop(columns=['caseversion', 'i_f_code', 'lit_ref', 'event_dt', 'auth_num', 'fda_dt',
                                'age_cod', 'age_grp', 'e_sub', 'rept_dt', 'to_mfr', 'reporter_country'],
                       errors='ignore')
    frame = frame.loc[frame['wt_cod'] == 'KG']
    return frame[pd.notnull(frame['age'])]


def _clean_drug(frame):
    """Drop unused drug columns; keep role 'PS' rows with dose amount and unit."""
    frame = frame.drop(columns=['drug_seq', 'val_vbm', 'dose_vbm', 'cum_dose_chr', 'prod_ai', 'cum_dose_unit',
                                'dechal', 'rechal', 'lot_num', 'exp_dt', 'nda_num'],
                       errors='ignore')
    frame = frame[pd.notnull(frame['dose_amt'])]
    frame = frame[pd.notnull(frame['dose_unit'])]
    return frame.loc[frame['role_cod'] == 'PS']


def _keep_single_row_cases(frame):
    """Keep only the rows whose ``primaryid`` occurs exactly once in the file."""
    return frame.groupby('primaryid').filter(lambda group: len(group) == 1)


def _stack_frames(frames, columns):
    """Concatenate per-file frames once, guaranteeing that ``columns`` exist."""
    if not frames:
        return pd.DataFrame(columns=columns)
    stacked = pd.concat(frames, ignore_index=True, sort=False)
    extras = [col for col in stacked.columns if col not in columns]
    # reindex adds any expected column a file lacked (as NaN), so the merges
    # and fillna calls downstream never hit a missing column.
    return stacked.reindex(columns=list(columns) + extras)


def getCombinedFile():
    """Merge the FAERS demo/drug/reac/outc CSV files into ``MergedFile.csv``.

    Reads every file in ``<cwd>/FAERSdata`` whose name contains ``demo``,
    ``drug``, ``reac`` or ``outc``, cleans each one, stacks the files of each
    type, and joins them on ``primaryid`` and ``caseid``: drug left-joined
    with demo, then inner-joined with reaction and outcome. Missing
    categorical values are replaced with fixed placeholders and the result is
    written to ``MergedFile.csv`` in the current working directory.

    Returns
    -------
    None
    """
    directoryPath = os.path.join(os.getcwd(), "FAERSdata")
    merge_keys = ['primaryid', 'caseid']

    schemas = {
        "demo": ['primaryid', 'caseid', 'mfr_dt', 'init_fda_dt', 'rept_cod', 'mfr_num', 'mfr_sndr', 'age',
                 'sex', 'wt', 'wt_cod', 'occp_cod', 'occr_country'],
        "drug": ['primaryid', 'caseid', 'role_cod', 'drugname', 'route', 'dose_amt', 'dose_unit',
                 'dose_form', 'dose_freq'],
        "reac": ['primaryid', 'caseid', 'pt'],
        "outc": ['primaryid', 'caseid', 'outc_cod'],
    }
    cleaners = {
        "demo": _clean_demo,
        "drug": _clean_drug,
        "reac": _keep_single_row_cases,
        "outc": _keep_single_row_cases,
    }
    collected = {kind: [] for kind in cleaners}

    print("Reading files and creating dataframes for each file type!!", "\n")

    # Sorted so the row order of the output does not depend on the file system.
    for filename in sorted(os.listdir(directoryPath)):
        for kind, clean in cleaners.items():
            if kind in filename:
                cleaned = clean(_read_faers_csv(os.path.join(directoryPath, filename)))
                # NOTE(review): read_csv already consumes the header line, so
                # this slice discards the first surviving data row of every
                # file. Kept to preserve the existing output - confirm intent.
                collected[kind].append(cleaned[1:])

    # One concat per table instead of DataFrame.append inside the loop
    # (append was removed in pandas 2.0 and re-copied the data on every call).
    demo = _stack_frames(collected["demo"], schemas["demo"])
    drug = _stack_frames(collected["drug"], schemas["drug"])
    reaction = _stack_frames(collected["reac"], schemas["reac"])
    outcome = _stack_frames(collected["outc"], schemas["outc"])

    print("Dataframes created. Starting wrangling and combining files!!", "\n")

    demo_drug_df = pd.merge(drug, demo, on=merge_keys, how='left')
    demo_drug_df['sex'] = demo_drug_df['sex'].fillna('NS')
    demodrugreac_df = pd.merge(demo_drug_df, reaction, on=merge_keys, how='inner')
    demodrugreacout_df = pd.merge(demodrugreac_df, outcome, on=merge_keys, how='inner')

    demodrugreacout_df = demodrugreacout_df.drop(columns=['drug_rec_act'], errors='ignore')

    # Placeholder used for each categorical column when the value is missing.
    placeholders = {
        'occp_cod': 'OT',
        'rept_cod': 'EXP',
        'mfr_sndr': 'Others',
        'route': 'Unknown',
        'dose_form': 'Others',
        'dose_freq': 'Others',
    }
    for column, placeholder in placeholders.items():
        demodrugreacout_df[column] = demodrugreacout_df[column].fillna(placeholder)

    demodrugreacout_df.to_csv('MergedFile.csv', header=True, index=False)
    print("Combined file created!!", "\n")

In [3]:
def fileUploadToS3(AWS_ACCESS_KEY=None, AWS_SECRET_KEY=None):
    """Upload every ``.csv`` file in the current directory to the project S3 bucket.

    The bucket ``team1finalproject-faers`` is created first when it is not in
    the account's bucket list. Each file is uploaded under its own file name.

    Parameters
    ----------
    AWS_ACCESS_KEY : str, optional
        AWS access key id. When ``None``, boto3 resolves credentials itself
        (environment variables, shared credentials file, instance role).
    AWS_SECRET_KEY : str, optional
        AWS secret access key matching ``AWS_ACCESS_KEY``.
    """
    conn = boto3.client('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
    transfer = S3Transfer(conn)

    existent = [bucket['Name'] for bucket in conn.list_buckets()["Buckets"]]

    bucket_name = 'team1finalproject-faers'
    target_dir = './'
    # endswith + isfile: a substring test would also pick up names such as
    # 'x.csv.bak' or a directory whose name contains '.csv'.
    filenames = [name for name in os.listdir(target_dir)
                 if name.endswith('.csv') and os.path.isfile(os.path.join(target_dir, name))]

    if bucket_name in existent:
        print('Bucket already exists!!', '\n')
        print('Combined File upload started to s3!!!!!', '\n')
    else:
        print('Bucket not present. Creating bucket!!', '\n')
        # NOTE(review): no LocationConstraint is passed; confirm the client's
        # region accepts a bare create_bucket call.
        conn.create_bucket(Bucket=bucket_name)
        print('File upload started to s3!!!!!', '\n')

    # The upload itself is identical whether or not the bucket pre-existed.
    for name in filenames:
        # NOTE(review): the 'public-read' ACL makes every uploaded object
        # world-readable - confirm that is intended for this data.
        transfer.upload_file(os.path.join(target_dir, name), bucket_name, name,
                             extra_args={'ACL': 'public-read'})
    print('File uploaded to s3!!!!!','\n')

In [4]:
if __name__ == '__main__':

    # Credentials must never live in the notebook: they are read from the
    # standard AWS environment variables. When those are unset, None is passed
    # and boto3 falls back to its own credential lookup.
    # The key pair that used to be hard-coded here has been exposed in source
    # and must be treated as compromised: revoke and rotate it.
    aws_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
    aws_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

    getFileLinks()
    getCombinedFile()
    fileUploadToS3(aws_access_key, aws_secret_key)

Bucket already exists!! 

Combined File upload started to s3!!!!! 

File uploaded to s3!!!!! 

