In [1]:
import datetime as dt
import time, os
import pandas as pd

In [2]:
def download(source, target, filename):
    """
    Download a file through an http request unless it is already on disk.

    Parameters
    ----------
    source : str
        Base url the file is served from (with or without trailing slash).
    target : str
        Local directory the file is written to.
    filename : str
        Name of the file, appended to both ``source`` and ``target``.

    Returns
    -------
    None
        Progress and errors are reported with ``print``. Request and file
        errors are caught and printed, not raised; in that case no file is
        left at the target path.
    """
    # construct full url; urls always use '/', so do not rely on os.path.join
    full_source_path = source.rstrip('/') + '/' + filename

    # local path
    full_target_path = os.path.join(target, filename)

    # whether downloading file exist
    if os.path.exists(full_target_path):
        print('{} already exist - Skipping downloading.'.format(full_target_path))
        return

    # imported lazily: only needed when something really has to be fetched
    import requests, io

    # data is written here first and renamed once complete, so an aborted
    # transfer is never mistaken for a finished download on the next run
    partial_path = full_target_path + '.part'

    try:
        print('Downloading {}'.format(filename))

        # make request; stream=True keeps the body out of memory until iterated
        req = requests.get(full_source_path, stream=True, timeout=60)
        try:
            # a 4xx/5xx answer must not be stored as if it were the file
            req.raise_for_status()

            # write downloaded file to local path
            with io.open(partial_path, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            req.close()

        os.rename(partial_path, full_target_path)
        print('Done Downloading {}'.format(full_target_path))

    except Exception as e:
        # drop the incomplete file, keep the original best-effort behaviour
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print("There was an error: {}; {}".format(e, filename))
        

def unzip(full_target_path, *args, **kwargs):
    """
    Extract all members of a zip archive, skipping those already on disk.

    Parameters
    ----------
    full_target_path : str
        Path of the zip archive.
    *args, **kwargs
        Forwarded unchanged to ``zipfile.ZipFile.extract``. The destination
        directory is taken from the first positional argument or from the
        ``path`` keyword; when neither is given (or it is None) the current
        working directory is used.

    Returns
    -------
    list of str or None
        One path per archive member, in archive order (freshly extracted or
        already existing). None when the file is not a valid zip archive.
    """
    import zipfile

    # destination may arrive positionally or by keyword;
    # if path not specified then extract in current working directory
    if args:
        unzip_path = args[0]
    else:
        unzip_path = kwargs.get('path')
    if unzip_path is None:
        unzip_path = os.getcwd()

    try:
        # context manager guarantees the archive handle is closed
        with zipfile.ZipFile(file=full_target_path, mode='r') as z:
            print('Attempting to unzip {}'.format(full_target_path))
            unzipped = []
            for fl in z.namelist():
                full_unzip_path = os.path.join(unzip_path, fl)

                # whether unzipping file exist
                if not os.path.exists(full_unzip_path):
                    unzipped.append(z.extract(fl, *args, **kwargs))
                    print('Done unzipping {}'.format(fl))
                else:
                    print('{} already exist - Skip unzipping.'.format(full_unzip_path))
                    unzipped.append(full_unzip_path)

            return unzipped

    except zipfile.BadZipfile:
        print('Bad zip file for {}, passing.'.format(full_target_path))
        return None

In [3]:
# Per-table settings: every table has the same four entries, with
# table-specific values (header sheet, file name part, index and date column).
# NOTE(review): the key 'mentiones' looks like a typo of 'mentions'; it is kept
# unchanged here because other code may look it up by this exact name.
tables = {
    'events': {
        'sheetname': 'events',
        'filename': 'export',
        'index_col': 'GLOBALEVENTID',
        'date_col': 'DATEADDED',
    },
    'gkg': {
        'sheetname': 'GKG',
        'filename': 'gkg',
        'index_col': None,
        'date_col': 'V2.1DATE',
    },
    'mentiones': {
        'sheetname': 'mentions',
        'filename': 'mentions',
        'index_col': 'GLOBALEVENTID',
        'date_col': 'EVENTTIMEDATE',
    },
}

In [4]:
# remote base location the files are fetched from
URL = 'http://data.gdeltproject.org/gdeltv2/'

# local directory (relative to the working directory) for downloaded data
PATH = 'data/'

In [5]:
# get column names from file: the header workbook has one sheet per table,
# of which only the first two columns are read ('Column ID' as index and
# 'Field Name' as values)
headers = os.path.join(os.getcwd(), PATH, "header.xlsx")

# `sheet_name` / `usecols` replace the removed `sheetname` / `parse_cols`
# keywords; parse_cols=1 meant "columns 0 through 1", i.e. usecols=[0, 1]
colnames = pd.read_excel(
    headers,
    sheet_name = tables['events']['sheetname'],
    index_col = 'Column ID',
    usecols = [0, 1]
)['Field Name']

In [6]:
start_date = dt.datetime(2016, 8, 10, 10, 0, 0)
end_date = dt.datetime(2016, 8, 10, 12, 0, 0)
date = start_date

df_list = []

# iterate within dates with 15-mins step and download, unzip and save csv-file
# each csv-file content save in pandas dataframe
while date <= end_date:
    # filename examples:
    # 20161027054500.export.CSV.zip / 20161027054500.translation.export.CSV.zip
    # 20161027054500.mentions.CSV.zip / 20161027054500.translation.mentions.CSV.zip
    # 20161027054500.gkg.csv.zip / 20161027054500.translation.gkg.csv.zip
    filename = date.strftime("%Y%m%d%H%M%S") + ".translation." + tables['events']['filename'] + ".CSV.zip"
    zip_path = os.path.join(PATH, filename)

    # remember whether the server is actually going to be contacted
    fetched = not os.path.exists(zip_path)
    download(URL, PATH, filename)

    # download() only prints on failure, so check the file before unzipping;
    # unzip() returns None for a corrupt archive
    unzipped = None
    if os.path.exists(zip_path):
        unzipped = unzip(zip_path, path=os.path.join(PATH, 'tmp/'))

    if unzipped:
        df_list.append(pd.read_csv(
            unzipped[0],
            sep='\t',
            header = None,
            dtype = str,
            names = colnames,
            index_col = [tables['events']['index_col']],
            parse_dates = [tables['events']['date_col']])
        )
    else:
        # a missing or broken interval should not abort the whole run
        print('No usable data for {} - skipping.'.format(filename))

    if fetched:
        time.sleep(15) # 15 secs pause to avoid server overloading
    date += dt.timedelta(minutes=15) # 15 minutes step

# concatenate dataframes (raises ValueError if no interval could be loaded)
df = pd.concat(df_list)

data/20160810100000.translation.export.CSV.zip already exist - Skipping downloading.
Attempting to unzip data/20160810100000.translation.export.CSV.zip
data/tmp/20160810100000.translation.export.CSV already exist - Skip unzipping.
data/20160810101500.translation.export.CSV.zip already exist - Skipping downloading.
Attempting to unzip data/20160810101500.translation.export.CSV.zip
data/tmp/20160810101500.translation.export.CSV already exist - Skip unzipping.
data/20160810103000.translation.export.CSV.zip already exist - Skipping downloading.
Attempting to unzip data/20160810103000.translation.export.CSV.zip
data/tmp/20160810103000.translation.export.CSV already exist - Skip unzipping.
data/20160810104500.translation.export.CSV.zip already exist - Skipping downloading.
Attempting to unzip data/20160810104500.translation.export.CSV.zip
data/tmp/20160810104500.translation.export.CSV already exist - Skip unzipping.
data/20160810110000.translation.export.CSV.zip already exist - Skipping down

In [7]:
# persist the combined frame as "<start>-<end>_<table file name>.pkl" inside PATH
storing_name = '{start:%Y.%m.%d_%H.%M}-{end:%Y.%m.%d_%H.%M}_{table}.pkl'.format(
    start=start_date,
    end=end_date,
    table=tables['events']['filename'],
)
df.to_pickle(os.path.join(PATH, storing_name))