In [1]:
# Import all libraries

In [2]:
# Standard library
import glob
import os
import re #Regular Expression
import urllib as ulib
import urllib.request  # makes sure the `ulib.request` submodule is actually loaded
# To read and open zipfile, zipfile module is used
import zipfile
from io import BytesIO
from urllib.parse import urljoin
from zipfile import ZipFile

# Third-party
import pandas as pd
import requests
# To parse the document, BeautifulSoup library is used.
from bs4 import BeautifulSoup


In [3]:
# Fetch the SEC landing page that links to the per-year EDGAR log-file index pages.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error page be
# parsed by the cells below.
page=requests.get("https://www.sec.gov/dera/data/edgar-log-file-data-set.html", timeout=30)
page.raise_for_status()

In [4]:
# The raw bytes of the response are available through the `content` property.
# Only the first 500 bytes are displayed so the whole HTML document is not dumped
# into the notebook output.
page.content[:500]

b'\n\n<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">\n  <head>\n    <meta charset="utf-8" /><script type="text/javascript">window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var o=n[t]={exports:{}};e[t][0].call(o.exports,function(n){var o=e[t][1][n];return r(o||n)},o,o.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<t.length;o++)r(t[o]);return r}({1:[function(e,n,t){function r(){}function o(e,n,t){return function(){return i(e,[c.now()].concat(u(arguments)),n?null:this,t),n?void 0:this}}var i=e("handle"),a=e(3),u=e(4),f=e("ee").get("tracer"),c=e("l

In [5]:
# The page has been downloaded; parse its HTML so the links can be extracted.
# `soup` is the BeautifulSoup instance built from the response body with Python's
# built-in 'html.parser'; generate_link_list() below reads this module-level name.
soup=BeautifulSoup(page.content,'html.parser')

In [6]:
def generate_link_list(page_soup=None):
    """Return the absolute URLs of the per-year index pages linked from the landing page.

    The year links are the <a> tags inside the first <div id="asyncAccordion">
    element of the parsed page.

    Parameters
    ----------
    page_soup : optional
        Parsed document to search. It must offer ``find_all`` the way a
        BeautifulSoup object does. When omitted, the module-level ``soup``
        built earlier in the notebook is used.

    Returns
    -------
    list of str
        One absolute ``https://www.sec.gov/...`` URL per anchor that carries an
        ``href``. The list is empty when the accordion div is not present.
    """
    if page_soup is None:
        page_soup = soup

    accordions = page_soup.find_all('div', attrs={'id': 'asyncAccordion'})
    if not accordions:
        # Without this guard, indexing the first match raises IndexError.
        return []

    year_links = []
    for anchor in accordions[0].find_all('a'):
        href = anchor.get('href')
        if not href:
            # Anchors without an href cannot be turned into a URL.
            continue
        # urljoin copes with root-relative, relative and already-absolute hrefs.
        year_links.append(urljoin('https://www.sec.gov', href))
    return year_links

In [7]:
# Display the list of year-page URLs built from the parsed landing page.
generate_link_list()

['https://www.sec.gov/files/edgar2017_1.html',
 'https://www.sec.gov/files/edgar2016_5.html',
 'https://www.sec.gov/files/edgar2015.html',
 'https://www.sec.gov/files/edgar2014.html',
 'https://www.sec.gov/files/edgar2013.html',
 'https://www.sec.gov/files/edgar2012.html',
 'https://www.sec.gov/files/edgar2011.html',
 'https://www.sec.gov/files/edgar2010.html',
 'https://www.sec.gov/files/edgar2009.html',
 'https://www.sec.gov/files/edgar2008.html',
 'https://www.sec.gov/files/edgar2007.html',
 'https://www.sec.gov/files/edgar2006.html',
 'https://www.sec.gov/files/edgar2005.html',
 'https://www.sec.gov/files/edgar2004.html',
 'https://www.sec.gov/files/edgar2003.html']

In [8]:
def valid_year(user_year_input, first_year=2003, last_year=2017):
    """Report whether a user-supplied year lies in the supported range.

    The rest of the notebook passes years as strings (for example ``'2017'``),
    so numeric strings are accepted as well as integers.

    Parameters
    ----------
    user_year_input : int or str
        Year to check. Strings are stripped and converted with ``int``.
    first_year, last_year : int, optional
        Inclusive bounds of the supported range (defaults: 2003 and 2017).

    Returns
    -------
    str
        ``'The year is valid'`` or ``'The year is invalid'``.
    """
    candidate = user_year_input
    if isinstance(candidate, str):
        try:
            candidate = int(candidate.strip())
        except ValueError:
            # Not a whole number, so it cannot be a supported year.
            return 'The year is invalid'

    if candidate in range(first_year, last_year + 1):
        return 'The year is valid'
    return 'The year is invalid'
        

In [9]:
def user_input_year_link(year):
    """Return the zip links containing '01.zip' from the index page of ``year``.

    The index page is the first URL from ``generate_link_list()`` that contains
    the year as a substring. Every <a> tag on that page is inspected and the
    hrefs containing ``'01.zip'`` are kept.

    Parameters
    ----------
    year : str or int
        Year to look up, e.g. ``'2017'``. It is converted to ``str`` before
        matching.

    Returns
    -------
    list of str
        The matching hrefs, in page order.

    Raises
    ------
    ValueError
        If no year page URL contains ``year``.
    """
    year_text = str(year)

    index_url = None
    for year_link in generate_link_list():
        if year_text in year_link:
            index_url = year_link
            break
    if index_url is None:
        # Fail with a clear message instead of passing the bare year to urlopen.
        raise ValueError('No EDGAR log index page found for year %r' % (year,))

    # The context manager closes the HTTP response once the page is parsed.
    with ulib.request.urlopen(index_url) as response:
        year_soup = BeautifulSoup(response, 'html.parser')

    hrefs = (anchor.get('href') for anchor in year_soup.find_all('a'))
    # Anchors without an href yield None and are skipped.
    return [href for href in hrefs if href and '01.zip' in href]
    

In [10]:
# Display the links containing '01.zip' found on the 2017 index page.
user_input_year_link('2017')

['http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr2/log20170601.zip',
 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr2/log20170501.zip',
 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr2/log20170401.zip',
 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr1/log20170301.zip',
 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr1/log20170201.zip',
 'http://www.sec.gov/dera/data/Public-EDGAR-log-file-data/2017/Qtr1/log20170101.zip']

In [24]:
def zip_csv_on_local(year):
    """Download the archives for ``year`` and extract them into ``<cwd>/<year>``.

    The links come from ``user_input_year_link(year)``. Each archive is read
    into memory and extracted into a folder named after the year inside the
    current working directory.

    Parameters
    ----------
    year : str or int
        Year whose archives are fetched; also used as the folder name.

    Returns
    -------
    str
        Path of the folder the archives were extracted into.
    """
    # Resolve the links first so an unknown year does not leave an empty folder.
    zip_links = user_input_year_link(year)

    path = os.path.join(os.getcwd(), str(year))
    # exist_ok lets the cell be re-run without FileExistsError.
    os.makedirs(path, exist_ok=True)

    for zip_url in zip_links:
        with ulib.request.urlopen(zip_url) as response:
            with ZipFile(BytesIO(response.read())) as archive:
                archive.extractall(path)
    return path

In [25]:
# Download and extract the 2017 archives into a '2017' folder under the current working directory.
zip_csv_on_local('2017')

In [32]:
def create_dataframes(path):
    """Load every ``log*.csv`` file in ``path`` into its own DataFrame.

    Parameters
    ----------
    path : str
        Folder that holds the extracted CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per CSV file, ordered by file name so the result is
        deterministic. With exactly two files this can still be unpacked as
        ``df1, df2 = create_dataframes(path)``.

    Raises
    ------
    ValueError
        If the folder contains no ``log*.csv`` file.
    """
    csv_files = sorted(glob.glob(os.path.join(path, 'log*.csv')))
    if not csv_files:
        raise ValueError('No log*.csv files found in %r' % (path,))
    return tuple(pd.read_csv(csv_file) for csv_file in csv_files)

In [33]:
# Load the extracted CSV files into DataFrames.
# NOTE(review): the folder is a hard-coded absolute Windows path. zip_csv_on_local('2017')
# extracts into <current working directory>/2017, so this only lines up when the notebook
# is run from that folder. The two-name unpacking also needs exactly two frames back.
a,b=create_dataframes('C:\\Users\\DELL\\Desktop\\ADS\\Untitled Folder\\2017')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [34]:
# Preview the first rows of the first loaded frame.
a.head()

Unnamed: 0,ip,date,time,zone,cik,accession,extention,code,size,idx,norefer,noagent,find,crawler,browser
0,104.197.32.ihd,2017-01-01,00:00:00,0.0,1111711.0,0001193125-12-324016,-index.htm,200.0,7627.0,1.0,0.0,0.0,10.0,0.0,
1,104.197.32.ihd,2017-01-01,00:00:00,0.0,720005.0,0000720005-16-000144,-index.htm,200.0,12014.0,1.0,0.0,0.0,10.0,0.0,
2,104.197.32.ihd,2017-01-01,00:00:00,0.0,1111711.0,0001193125-12-386591,-index.htm,200.0,6581.0,1.0,0.0,0.0,10.0,0.0,
3,104.197.32.ihd,2017-01-01,00:00:00,0.0,720005.0,0000720005-16-000121,-index.htm,200.0,11255.0,1.0,0.0,0.0,10.0,0.0,
4,107.178.195.aea,2017-01-01,00:00:00,0.0,799288.0,0000799288-16-000197,le-20161028_cal.xml,301.0,624.0,0.0,0.0,0.0,10.0,0.0,


In [36]:
# Preview the first rows of the second loaded frame.
b.head()

Unnamed: 0,ip,date,time,zone,cik,accession,extention,code,size,idx,norefer,noagent,find,crawler,browser
0,107.23.85.jfd,2017-02-01,00:00:00,0.0,1013454.0,0001539497-13-000934,-index.htm,200.0,2792.0,1.0,0.0,0.0,10.0,0.0,
1,107.23.85.jfd,2017-02-01,00:00:00,0.0,1407200.0,0001193125-16-541564,-index.htm,200.0,2880.0,1.0,0.0,0.0,10.0,0.0,
2,107.23.85.jfd,2017-02-01,00:00:00,0.0,1013454.0,0001539497-13-000938,-index.htm,200.0,2792.0,1.0,0.0,0.0,10.0,0.0,
3,107.23.85.jfd,2017-02-01,00:00:00,0.0,1013454.0,0001539497-13-000900,-index.htm,200.0,2791.0,1.0,0.0,0.0,10.0,0.0,
4,107.23.85.jfd,2017-02-01,00:00:00,0.0,1407200.0,0001193125-16-535603,-index.htm,200.0,2670.0,1.0,0.0,0.0,10.0,0.0,


In [35]:
import boto
import boto.s3
import sys
from boto.s3.key import Key


# Upload a local file to an S3 bucket with boto.
#
# The credentials come from the environment so that no secret is stored in the
# notebook; a missing variable raises KeyError naming the variable.
# NOTE(review): this cell previously embedded a literal AWS key pair. Treat that
# pair as compromised and rotate it.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# The bucket name is derived from the lower-cased access key id.
bucket_name = AWS_ACCESS_KEY_ID.lower() + '-dump'
conn = boto.connect_s3(AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY)


bucket = conn.create_bucket(bucket_name,
    location=boto.s3.connection.Location.DEFAULT)

# NOTE(review): absolute, machine-specific path to the file being uploaded.
testfile = "C:\\Users\\DELL\\Desktop\\ADS\\Untitled.zip"

def percent_cb(complete, total):
    """Progress callback: write one dot to stdout each time it is called."""
    sys.stdout.write('.')
    sys.stdout.flush()


k = Key(bucket)
k.key = 'my test file'
# num_cb=10 is the callback count passed to boto for the upload.
k.set_contents_from_filename(testfile,
    cb=percent_cb, num_cb=10)

..........

214093