In [None]:
# TODO: Calculate class weights
# TODO: Build a validation dataset

In [1]:
import json

import boto3
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
from tensorflow.python.keras._impl.keras.callbacks import CSVLogger, ModelCheckpoint
from tensorflow.python.keras._impl.keras.layers import Dense, Input, LSTM, BatchNormalization, Flatten, TimeDistributed, Dropout, Reshape
from tensorflow.python.keras._impl.keras.models import Model
from tensorflow.python.keras._impl.keras.utils import Sequence

In [None]:
class GDELTSequencer(Sequence):
    """
    Sequence that reads gzip-compressed GDELT csv files batch by batch and
    splits every file into a (features, labels) pair of DataFrames.

    One batch is made of `batch_size` csv FILES (not rows): all the rows of
    the files of a batch are concatenated together.
    """

    def __init__(self, datas, batch_size, base_uri='s3://gdelt4ibd/'):
        """
        Instantiate the class
        :param datas: The list of keys (paths relative to base_uri) of all the csv files
        :param batch_size: The number of csv files to load per batch
        :param base_uri: The prefix prepended to every key before reading it
        """
        self.datas = datas
        self.batch_size = batch_size
        self.base_uri = base_uri

    def __len__(self) -> int:
        """
        Get the number of batches in the sequence
        :return: The number of batches, the last one being possibly smaller
        """
        return int(np.ceil(len(self.datas) / float(self.batch_size)))

    def __getitem__(self, idx) -> tuple:
        """
        Get a batch item in the Sequencer
        :param idx: The index of the requested batch
        :return: A tuple (features, labels) of DataFrames
        :raises IndexError: If idx is not in [0, len(self))
        """
        if not 0 <= idx < len(self):
            raise IndexError('batch index {} out of range'.format(idx))

        batch_datas = self.datas[idx * self.batch_size:(idx + 1) * self.batch_size]

        features = list()
        labels = list()
        for el in batch_datas:
            dcsv = pd.read_csv(self.base_uri + el, engine='c', low_memory=True,
                               dtype=np.float32, compression='gzip')
            columns = list(dcsv.columns)
            # Each substring needs its own `in` test: `'a' or 'b' in c` is always
            # true because the non-empty literal 'a' is truthy on its own.
            feature_columns = ['date'] + [c for c in columns if 'actor1code' in c or 'actor2code' in c]
            label_columns = [c for c in columns if 'event' in c or 'geo' in c]
            features.append(dcsv[feature_columns])
            labels.append(dcsv[label_columns])

        train = pd.concat(features, ignore_index=True)
        test = pd.concat(labels, ignore_index=True)

        return train, test

In [5]:
def regression_model(input_shape, num_classes):
    """
    Regression model for The GDELT: a single Dense layer with a sigmoid
    activation applied directly on the inputs, compiled with the adam
    optimizer, the binary crossentropy loss and the accuracy metric.
    :param input_shape: The input shape of the data
    :param num_classes: The number of classes (units of the output layer)
    :return: A compiled model
    """
    print('Designing model')
    inputs = Input(input_shape)
    outputs = Dense(num_classes, activation='sigmoid')(inputs)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line).
    model.summary()
    return model

In [None]:
print('Connecting to S3')
s3 = boto3.resource('s3')

# Single definition of the bucket name shared by the metadata object and the bucket handle
bucket_name = 'gdelt4ibd'

# The metadata is a JSON document; the training cell reads its
# 'num_classes' and 'train_shape' entries.
content_object = s3.Object(bucket_name, 'gdelt_metadata.json')
file_content = content_object.get()['Body'].read().decode('utf-8')
gdelt_metadata = json.loads(file_content)

gdelt_bucket = s3.Bucket(bucket_name)

In [6]:
print('Defining Variables')
experiment_name = 'REGRESSION_MODEL_ADAM_BIN_SIG'
# NOTE(review): GDELTSequencer slices its list of csv keys with batch_size, so this
# is the number of FILES loaded per step, not a number of rows - confirm the value.
batch_size = 8096
nb_epochs = 64
num_classes = gdelt_metadata['num_classes']
shape = gdelt_metadata['train_shape']
# NOTE(review): both callbacks are given s3:// paths - confirm that the Keras
# version in use can write there, otherwise write locally and upload afterwards.
csv_logger = CSVLogger('s3://gdelt4ibd/models/' + experiment_name + '.csv')
# No validation data is passed to fit_generator, so 'val_acc' is never produced and
# a save_best_only checkpoint would never be written: monitor the training accuracy
# instead (switch back to 'val_acc' once a validation dataset exists).
checkpoint = ModelCheckpoint('s3://gdelt4ibd/models/' + experiment_name + '.h5',
                             monitor='acc', verbose=1, save_best_only=True, mode='max')

# Drop the leading (row count) dimension; the result must be a flat tuple of ints
gdelt_model = regression_model(tuple(shape[1:]), num_classes)
gdelt_keys = [object_summary.key for object_summary
              in gdelt_bucket.objects.filter(Prefix='normalized_gdelt_2017_2018.csv')]
gen_gdelt = GDELTSequencer(gdelt_keys, batch_size)

print('Starting learning')
# One epoch is exactly one pass over the sequence, hence len(gen_gdelt) steps
gdelt_model.fit_generator(generator=gen_gdelt, epochs=nb_epochs, shuffle=True,
                          callbacks=[csv_logger, checkpoint],
                          steps_per_epoch=len(gen_gdelt), use_multiprocessing=True, workers=12)

Starting reading GDELT
Normalizing GDELT


MemoryError: 