In [23]:
# Notes / experiments still to try:
# - Watch for sample weight
# - Test with LSTM
# - Test with shuffle=False

In [24]:
from urllib.request import urlopen
from datetime import datetime
from sagemaker import get_execution_role
import boto3
import pandas as pd
import numpy as np
from tensorflow.python.keras._impl.keras.layers import Dense, Input, LSTM, BatchNormalization, Flatten, TimeDistributed, Dropout, Reshape
from tensorflow.python.keras._impl.keras.models import Model
from tensorflow.python.keras._impl.keras.callbacks import CSVLogger, ModelCheckpoint

In [25]:
def all_divisor(num: int) -> list:
    """
    List every positive integer that divides ``num`` without remainder
    :param num: The number whose divisors are wanted
    :return: The divisors in ascending order (empty when ``num`` is below 1)
    """
    return [candidate for candidate in range(1, num + 1) if num % candidate == 0]


def normalize_nums(num: int, minimum: int, maximum: int) -> float:
    """
    Min-max scale a number relative to the [minimum, maximum] range
    :param num: The num to normalize
    :param minimum: The lower bound of the range
    :param maximum: The upper bound of the range (must differ from ``minimum``,
                    otherwise the division below has a zero denominator)
    :return: ``(num - minimum) / (maximum - minimum)``, or 0 when ``num`` is 0 or NaN
    """
    # NaN never compares equal to anything (not even to itself), so it has to be
    # detected with np.isnan: a plain ``num == np.nan`` test is always False.
    # A zero input is mapped to 0 regardless of ``minimum``.
    if num == 0 or np.isnan(num):
        return 0
    return (num - minimum) / (maximum - minimum)


def normalize_date(date: str, starting_date: str, ending_date: str, date_format: str) -> float:
    """
    Express a date as a fraction of the way from the starting date to the ending date
    :param date: The date to position
    :param starting_date: The lower date boundary (maps to 0.0)
    :param ending_date: The upper date boundary (maps to 1.0)
    :param date_format: The strptime format shared by the three date strings
    :return: Whole days elapsed since the start divided by the whole days in the range
    """
    upper = datetime.strptime(ending_date, date_format)
    lower = datetime.strptime(starting_date, date_format)
    elapsed = datetime.strptime(date, date_format) - lower
    return elapsed.days / (upper - lower).days

In [26]:
def read_gdelt(to_time_series=False):
    """
    Load the normalized GDELT CSV parts from S3 and split them for training

    Every object under the ``normalized_gdelt_2008_2018.csv`` prefix of the
    ``gdelt.4ibd.flo`` bucket whose second path segment contains ``.csv`` is read
    as a gzip-compressed CSV with float32 columns. The parts are concatenated,
    sorted by ``date`` and split 80 / 20 by row position.

    :param to_time_series: Currently unused
    :return: A dict with the keys ``train`` / ``test`` (values of the columns whose
             name does not contain ``event``), ``train_val`` / ``test_val`` (values
             of the columns whose name contains ``event``) and ``weight_classes``
             (event-column position -> largest ``value_counts()[1]`` among the event
             columns divided by that column's own ``value_counts()[1]``)
    """
    print('Starting reading GDELT')
    bucket = boto3.resource('s3').Bucket('gdelt.4ibd.flo')
    parts = [
        pd.read_csv(f's3://gdelt.4ibd.flo/{object_summary.key}', compression='gzip',
                    engine='c', low_memory=True, na_values=[''], dtype=np.float32)
        for object_summary in bucket.objects.filter(Prefix='normalized_gdelt_2008_2018.csv')
        if '.csv' in object_summary.key.split('/')[1]
    ]
    events = pd.concat(parts, ignore_index=True).sort_values('date')

    print('Getting class Weight')
    label_cols = [name for name in events.columns if 'event' in name]
    # Occurrences of the value 1 per event column, keyed by the column's position.
    positives = {idx: events[name].value_counts()[1] for idx, name in enumerate(label_cols)}
    largest = max(positives.values())
    weight_classes = {idx: largest / count for idx, count in positives.items()}

    print('Divising in Train / Test set')
    feature_cols = [name for name in events.columns if 'event' not in name]
    features = events[feature_cols].values
    labels = events[label_cols].values

    # The first 80% of the date-sorted rows train the model, the rest evaluate it.
    cut = int(len(events) * 0.8)
    return {
        'train': features[:cut],
        'train_val': labels[:cut],
        'test': features[cut:],
        'test_val': labels[cut:],
        'weight_classes': weight_classes,
    }

In [34]:
def regression_model(input_shape: tuple, num_classes: int) -> Model:
    """
    Softmax-regression baseline for the GDELT: one Dense softmax layer on the raw inputs
    :param input_shape: The input shape of the data (without the batch dimension)
    :param num_classes: The number of classes (width of the softmax output)
    :return: A model compiled with Adam, categorical cross-entropy and categorical accuracy
    """
    print('Designing model')
    inputs = Input(input_shape)
    outputs = Dense(num_classes, activation='softmax')(inputs)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()
    return model

In [35]:
def linear_model(input_shape: tuple, num_classes: int, neurons=256, layers=4, dr=0.3) -> Model:
    """
    Fully-connected classifier for The GDELT: a stack of Dense -> BatchNormalization ->
    Dropout blocks followed by a softmax output layer
    :param input_shape: The input shape of the data (without the batch dimension)
    :param num_classes: The number of classes (width of the softmax output)
    :param neurons: The number of units in every hidden Dense layer
    :param layers: The number of hidden blocks (at least one is always built)
    :param dr: The dropout rate applied after every hidden block
    :return: A model compiled with Adam, categorical cross-entropy and categorical accuracy
    """
    print('Designing model')
    inputs = Input(input_shape)
    hidden = inputs
    # max(..., 1) keeps the historical behaviour of always building one block,
    # even when ``layers`` is below 1.
    for _ in range(max(layers, 1)):
        hidden = Dense(neurons, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(dr)(hidden)
    outputs = Dense(num_classes, activation='softmax')(hidden)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()
    return model

In [36]:
def lstm_model(input_shape: tuple, num_classes: int, neurons=256, layers=4, dr=0.3) -> Model:
    """
    Stacked-LSTM classifier for The GDELT: LSTM -> BatchNormalization blocks followed
    by a softmax output layer
    :param input_shape: The input shape of the data (without the batch dimension);
                        Keras LSTM layers expect ``(timesteps, features)``
    :param num_classes: The number of classes (width of the softmax output)
    :param neurons: The number of units in every LSTM layer
    :param layers: The number of LSTM blocks (at least one is always built)
    :param dr: The input dropout rate of every LSTM layer
    :return: A model compiled with Adam, categorical cross-entropy and categorical accuracy
    """
    print('Designing model')
    inputs = Input(input_shape)
    depth = max(layers, 1)
    lstm = inputs
    for level in range(depth):
        # Every LSTM that feeds another LSTM must emit the full sequence (3-D output);
        # without return_sequences=True the next LSTM receives a 2-D tensor and the
        # model cannot be built. Only the last one collapses to its final state.
        feeds_another_lstm = level < depth - 1
        lstm = LSTM(neurons, activation='relu', dropout=dr,
                    return_sequences=feeds_another_lstm)(lstm)
        lstm = BatchNormalization()(lstm)
    outputs = Dense(num_classes, activation='softmax')(lstm)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    # summary() prints the table itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()
    return model

In [30]:
# Load the GDELT arrays and class weights once; the configuration and training
# cells below read 'train', 'train_val', 'test', 'test_val' and 'weight_classes' from this dict.
gdelt = read_gdelt()

Starting reading GDELT
Getting class Weight
Divising in Train / Test set


In [33]:
# Run name: reused for the CSV log and the checkpoint file names below.
experiment_name = 'LINEAR_4D512_MODEL_ADAM_CAT_DR_BN'

# Training hyper-parameters.
nb_epochs = 128
batch_size = 16384

# Per-sample feature shape and number of label columns, taken from the loaded arrays.
shape = gdelt['train'].shape[1:]
num_classes = gdelt['train_val'].shape[1]

# Callbacks: keep only the weights with the best validation categorical accuracy,
# and write the training log to a CSV file.
checkpoint = ModelCheckpoint(f'./{experiment_name}.h5', monitor='val_categorical_accuracy',
                             verbose=1, save_best_only=True, mode='max')
csv_logger = CSVLogger(f'./{experiment_name}.csv')

In [None]:
print('Starting learning')
# Alternative baseline: regression_model(shape, num_classes)
gdelt_model = linear_model(shape, num_classes, neurons=512)
gdelt_model.fit(
    x=gdelt['train'],
    y=gdelt['train_val'],
    batch_size=batch_size,
    epochs=nb_epochs,
    shuffle=True,
    validation_data=(gdelt['test'], gdelt['test_val']),
    callbacks=[csv_logger, checkpoint],
    class_weight=gdelt['weight_classes'],
)

Starting learning
Designing model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 21)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 512)               11264     
_________________________________________________________________
batch_normalization_9 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 512)               262656    
_________________________________________________________________
batch_normalization_10 (Batc (None, 512)               2048      
_________________________________________________________________
dropout_10 (Dropout)         (None, 512)  

KeyboardInterrupt: 

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 421, in execute_request
    self._abort_queues()
  File "/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6

In [None]:
print('Writing to S3')
# TODO: the upload itself is not implemented yet; this cell only prints the message.