In [3]:
import requests
import zipfile
import pandas as pd
import logging
import os
import shutil
import glob
import boto
from boto.s3.connection import Location
from boto.s3.key import Key
import time
import datetime

In [4]:
# Creating a log file 'problem2_log.log'

root = logging.getLogger()
root.setLevel(logging.DEBUG)
ch1 = logging.FileHandler('problem2_log.log')
ch1.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch1.setFormatter(formatter)
root.addHandler(ch1)

In [5]:

#input the parameters

print ("Please input the S3 Access Key")
accessKey = input()
logging.info("Access Key = %s" % accessKey)


Please input the S3 Access Key
AKIAJXHKVBG5ZPESTMLQ


In [6]:
print ("Please input the S3 Secret Access Key")
secretAccessKey = input()
logging.info("Secret Access Key = %s" % secretAccessKey)

Please input the S3 Secret Access Key
rYA1AlHu5gdS7qlW4SjXcvhhHjZRrOhfQaX1fw1z


In [7]:

print ("Please input your location")
location = input()
if location not in ['APNortheast', 'APSoutheast', 'APSoutheast2', 'EU', 'EUCentral1', 'SAEast', 'USWest', 'USWest2']:
    location = 'Default'
logging.info("Location = %s" % location)


Please input your location
us-east


In [8]:
year_range = range(2003, 2018)

print ("Please input the Year")
year = input()
if int(year) not in year_range:
    logging.error("Invalid year. Please enter a valid year between 2003 and 2017.")
    exit()
logging.info("Year = %s", year)


Please input the Year
2010


In [9]:
#Verifying the AWS account

AWS_ACCESS_KEY_ID = accessKey
AWS_SECRET_ACCESS_KEY = secretAccessKey

try:
    s3_connection = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    region = s3_connection.get_all_regions()
    print ('connected to S3!')

except:
    logging.info("Amazon keys are invalid!")
    exit()

In [None]:
#Directory CLeansing  

zip_dir = year + '_zips'
unzipped_dir = year + '_unzipped'
try:
    if not os.path.exists(zip_dir):
        os.makedirs(zip_dir, mode=0o777)
    else:
        shutil.rmtree(os.path.join(os.path.dirname(__file__), zip_dir), ignore_errors=False)
        os.makedirs(zip_dir, mode=0o777)

    if not os.path.exists(unzipped_dir):
        os.makedirs(unzipped_dir, mode=0o777)
    else:
        shutil.rmtree(os.path.join(os.path.dirname(__file__), unzipped_dir), ignore_errors=False)
        os.makedirs(unzipped_dir, mode=0o777)
    logging.info('Directories cleanup completed.')
except Exception as e:
    logging.error(str(e))
    exit()

In [None]:
#Generation of URLs
domain = "http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/"
urls = []
year_range = range(2003, 2017)
month_quarter = {'Qtr1': ['01', '02', '03'], 'Qtr2': ['04', '05', '06'],
                 'Qtr3': ['07', '08', '09'], 'Qtr4': ['10', '11', '12']}

for key, value in month_quarter.items():
    for v in value:
        url = domain + str(year) + '/' + str(key) + '/' + 'log' + str(year) + str(v) + '01.zip'
        logging.info('url to download zip %s', url)
        urls.append(url)

In [None]:

# Download the 12 zip files from the URLs

try:
    for i in range(0, 12):
        month_zip_dir = zip_dir + '/' + str(i) + '.zip'
        month_unzipped_dir = unzipped_dir + '/' + str(i)
        r = requests.get(urls[i],allow_redirects=True)
        open(month_zip_dir, 'wb').write(r.content)
        if os.path.getsize(month_zip_dir) <= 4515: #catching empty file
            os.remove(month_zip_dir)
            logging.warning('Log file %s is empty.', i)
        else:
            logging.info('Log file %s successfully downloaded', i)
            try:
                zip_ref = zipfile.ZipFile(month_zip_dir, 'r')
                for file in zip_ref.namelist():
                    if file.endswith('.csv'):
                        zip_ref.extract(file, unzipped_dir)
                        zip_ref.close()
                        logging.info('Log file %s was successfully unzipped', i)
            except Exception as e:
                logging.error(str(e))
                exit()
except Exception as e:  # Catching file not found
    logging.warning('Log %s not found...Skipping ahead!', i)
    exit()

In [13]:

file_lists = glob.glob(unzipped_dir + "/*.csv")

all_csv_df_dict = {period: pd.read_csv(period) for period in file_lists}
logging.info('All the csv read into individual dataframes')


In [14]:
try:
    for k, v in all_csv_df_dict.items():
        st = all_csv_df_dict[k]
        for key, value in st.items():
            key_drop = {'cik', 'accession', 'ip', 'date', 'time'}
            key_max = {'idx', 'browser', 'code', 'find', 'extention', 'zone'}
            df = pd.DataFrame(st[key])
            null_count = df.isnull().sum()
            logging.info("count of null in  %s is %s" % (key, null_count))
            most_used_value = pd.DataFrame(df.groupby(key).size().rename('cnt')).idxmax()[0]
            if key == "idx":
                incorrect_idx = (~df.isin([0.0, 1.0])).sum()
                logging.info("count of incorrect idx is %s" % incorrect_idx)
                st[key] = st[key].fillna(most_used_value)
                logging.info("fill the null value in column %s with the most used value" % key)
            elif key == "norefer":
                incorrect_norefer = (~df.isin([0.0, 1.0])).sum()
                logging.info("count of incorrect norefer is %s" % incorrect_norefer)
                st[key] = st[key].fillna('1')
                logging.info("fill the null value in column %s with 1" % key)
            elif key == "noagent":
                incorrect_noagent = (~df.isin([0.0, 1.0])).sum()
                logging.info("count of incorrect noagent is %s" % incorrect_noagent)
                st[key] = st[key].fillna('1')
                logging.info( "fill the null value in column %s with 1" % key)
            elif key in key_drop:
                st[key] = st.dropna(subset=[key])
                logging.info("the null in %s is dropped" % key)
            elif key in key_max:
                st[key] = st[key].fillna(most_used_value)
                logging.info("fill the null value in column %s with the most used value" % key)
            elif key == "crawler":
                st[key] = st[key].fillna('0')
                logging.info("fill the null value in column %s with 0" % key)
            elif key == "size":
                st[key] = st[key].fillna(st[key].mean(axis=0))
                logging.info("fill the null value in column %s with the average value" % key)
except Exception as e:
    logging.error(str(e))
    exit()


In [None]:

#zip the csv and log

try:
    dfs = pd.concat(all_csv_df_dict)
    dfs.to_csv('login_data.csv')
    logging.info('All dataframes of csvs are combined and exported as csv: master_csv.csv.')
except Exception as e:
    logging.error(str(e))
    exit()


In [None]:
def zipdir(path, ziph):
    ziph.write(os.path.join('login_data.csv'))
    ziph.write(os.path.join('problem2_log.log'))

In [None]:
zipf = zipfile.ZipFile('Problem2.zip', 'w', zipfile.ZIP_DEFLATED)
zipdir('/', zipf)
zipf.close()
logging.info("csv and log files successfully zipped!")

In [None]:
#Upload the zip file to Amazon S3 Bucket

try:
    zipfile = 'Problem2.zip'
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts)
    bucket_name = AWS_ACCESS_KEY_ID.lower() + str(st).replace(" ", "").replace("-", "").replace(":", "").replace(".", "")
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.create_bucket(bucket_name, location=location)
    print ('bucket created')
    print "Uploading %s to Amazon S3 bucket %s" % (zipfile, bucket_name)

    k = Key(bucket)
    k.key = 'Problem2'
    k.set_contents_from_filename(zipfile)
    print("Zip File successfully uploaded to S3")
except:
    logging.info("AWS credentials are invalid!")
    exit()