In [24]:
# Install tqdm, which provides the progress bar used while building the dataset files.
# NOTE(review): the install is unpinned and goes through `!pip`; consider pinning a
# version and using `%pip` so the package lands in the kernel's own environment.
!pip install tqdm

You are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [25]:
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.tensorflow import TensorFlow
import boto3
import random
import os
import config
import json
import pandas as pd
import datetime
import gzip
import io
import random as rand
from tqdm import tqdm_notebook

In [26]:
# ---------------------------------------------------------------------------
# Notebook settings
# ---------------------------------------------------------------------------

# Amount of source data to use
NUM_OF_DAYS = 2                      # how many days back (today included) are listed
MAX_DATASET_FILES = 10               # cap on source files picked after shuffling
MAX_DATASET_FILES_FOR_STATS = 10     # file limit used when computing the stats
TRAIN_EVAL_RATIO = 0.9               # share of the picked files that goes to training

# File format
CSV_SEPARATOR = '\t'
WEIGHT_COLUMN = 'weight'

# S3 locations
S3_BUCKET = 'wsbidder'
SRC_BUCKET = 'wsbidder'
DST_BUCKET = 'wsbidder'
SRC_PREFIX = 'tsv/etl/imp-pred-service-v1/imppredservice_training_data'
DST_PREFIX = 'trainer_predict_bid/data'
DATA_STATS_FILE_KEY = 'trainer_predict_bid/data/stats2.json'

# Columns kept in the generated dataset, mapped to the value that replaces a
# missing entry. The key order is also the column order of the output files.
REQUIRED_COLUMNS = {
    'deliveryid': '0',
    'dayofweek': 0,
    'hour': 0,
    'pub_sspid': '0',
    'pub_as_adspaceid': '0',
    'pub_as_domain': '0',
    'pub_as_dimensions': '0',
    'pub_as_position': '0',
    'pub_as_viewrate': 0.0,
    'device_os': '0',
    'device_model': '0',
    'user_ip': '0',
    'user_market': '0',
    'user_city': '0',
    'user_id': '0',
    'pub_as_iabcategoryid': 'IAB24',
    'req_auctiontype': 0,
    'price': 0.0,
    'advcostcpm': 0.0,
    'won': 0,
    'targetbid': 0,
}

# Values seen fewer than MINIMUM_FREQUENCY times in one of the columns listed
# below are replaced by '0' when a file is transformed.
MINIMUM_FREQUENCY = 100
FILTER_LOW_FREQUENCY_COLUMNS = [
    'pub_sspid',
    'pub_accountid',
    'pub_as_siteid',
    'pub_as_adspaceid',
    'pub_as_domain',
    'pub_as_pageurl',
    'pub_as_dimensions',
    'pub_as_iabcategoryid',
    'pub_as_position',
    'device_os',
    'device_model',
    'user_market',
    'user_city',
    'user_id',
]

In [27]:
# helper methods

def delete_files(bucket, prefixes):
    """Delete every object in ``bucket`` whose key starts with one of ``prefixes``.

    Args:
        bucket: Name of the S3 bucket to delete from.
        prefixes: Iterable of key prefixes. A single string is also accepted
            and treated as one prefix; iterating it would otherwise yield
            one-character prefixes and match far more keys than intended.
            An empty string selects nothing.
    """
    if isinstance(prefixes, str):
        prefixes = [prefixes] if prefixes else []

    # One bucket handle is enough for all prefixes.
    s3_bucket = boto3.resource('s3').Bucket(bucket)
    for prefix in prefixes:
        for obj in s3_bucket.objects.filter(Prefix=prefix).all():
            obj.delete()

def get_files(bucket, prefix, days=[], hours=[], ext='.csv'):
    """List the object keys stored under ``<prefix>/d=<day>/h=<hour>/``.

    Every (day, hour) combination is listed in turn, days outermost, and only
    keys ending with ``ext`` are kept.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix in front of the ``d=``/``h=`` partition folders.
        days: Values substituted into the ``d=`` folder name.
        hours: Values substituted into the ``h=`` folder name.
        ext: Required key suffix.

    Returns:
        A list of matching keys in listing order.
    """
    s3_resource = boto3.resource('s3')
    matching_keys = []
    for day in days:
        for hour in hours:
            partition = '{}/d={}/h={}/'.format(prefix, day, hour)
            listing = s3_resource.Bucket(bucket).objects.filter(Prefix=partition).all()
            matching_keys.extend(obj.key for obj in listing if obj.key.endswith(ext))

    return matching_keys

def get_hours(startHour=None, numberOfHours=6):
    """Return zero-padded hour strings counting backwards from ``startHour``.

    Args:
        startHour: First hour of the result; the current local hour when None.
        numberOfHours: How many hours to return. The value is reduced
            modulo 25, so 24 gives a full day while 25 gives an empty list.

    Returns:
        A list such as ``['02', '01', '00', '23']``; hours wrap around midnight.
    """
    count = numberOfHours % 25
    first = datetime.datetime.now().hour if startHour is None else startHour
    return ["{:02d}".format(hour % 24) for hour in range(first, first - count, -1)]

def sanitize_url(row):
    """Return the row's ``pub_as_pageurl`` as text without scheme or ``www.``.

    The value is converted with ``str`` first. Every occurrence of
    ``https://``, ``http://`` and ``www.`` is removed, in that order and
    wherever it appears in the string, not only at the start.
    """
    cleaned = str(row['pub_as_pageurl'])
    for fragment in ('https://', 'http://', 'www.'):
        cleaned = cleaned.replace(fragment, '')

    return cleaned

def clean_dataset_location():
    """Delete the generated train and eval dataset files from DST_BUCKET."""
    # delete_files loops over its second argument, so the prefixes are passed
    # as a list: a bare string would be walked character by character and each
    # character used as a (far broader) key prefix. The trailing slash keeps
    # the match inside the two dataset folders.
    delete_files(DST_BUCKET, [
        '{}/train/'.format(DST_PREFIX),
        '{}/eval/'.format(DST_PREFIX),
    ])

def create_dataset(files, name):
    """Transform every source file and store it under ``DST_PREFIX/<name>/``.

    Files are processed one at a time behind a notebook progress bar; each
    output keeps the basename of its source key.

    Args:
        files: Source object keys to transform.
        name: Sub-folder of DST_PREFIX that receives the results, for example
            'train' or 'eval'.
    """
    # `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`;
    # imported locally so this function does not depend on the old alias.
    from tqdm.notebook import tqdm

    for file in tqdm(files):
        destination = '{}/{}/{}'.format(DST_PREFIX, name, os.path.basename(file))
        # NOTE(review): is_train is left at its default (True) for every
        # `name`, so low-frequency masking also runs for 'eval' - confirm
        # that this is intended.
        transform_and_save_file([file], destination)

def transform_df_columns_for_dataset(df):
    """Normalise text columns and add the ``domain_position`` feature.

    ``pub_as_domain`` and ``user_city`` are converted to lower-case strings,
    and ``domain_position`` is the domain text directly followed by the
    position text. The frame is modified in place and also returned.
    """
    for column in ('pub_as_domain', 'user_city'):
        df[column] = df[column].astype(str).str.lower()

    domain_text = df['pub_as_domain'].astype(str)
    position_text = df['pub_as_position'].astype(str)
    df['domain_position'] = domain_text + position_text

    return df

def transform_df_columns_for_stats(df):
    """Cast the columns summarised as numbers to float.

    The frame is modified in place and also returned.
    """
    # NOTE(review): the WEIGHT_COLUMN column must already be present in df;
    # confirm that the dataset files read for the stats contain it.
    float_columns = ('pub_as_viewrate', 'price', 'advcostcpm', WEIGHT_COLUMN, 'targetbid')
    for column in float_columns:
        df[column] = df[column].astype(float)

    return df

In [28]:

def transform_and_save_file(files, destination, is_train=True):
    """Read source files from S3, clean them and upload one gzip file.

    Steps: read every key in ``files`` from S3_BUCKET, keep only the
    REQUIRED_COLUMNS, fill missing values with the per-column defaults, apply
    ``transform_df_columns_for_dataset`` and, when ``is_train`` is true,
    replace rarely seen values in FILTER_LOW_FREQUENCY_COLUMNS with '0'. The
    result is written as gzip-compressed text to ``destination`` in S3_BUCKET.

    Read and upload failures are reported with ``print`` and do not raise, so
    one bad file does not abort a batch. Only ``Exception`` is caught, so
    interrupting the kernel still stops the run.

    Args:
        files: Source object keys inside S3_BUCKET; nothing happens when empty.
        destination: Object key of the file to create.
        is_train: Whether to apply the low-frequency masking.

    Returns:
        None.
    """
    if len(files) == 0:
        return

    # NOTE(review): both reading and writing use S3_BUCKET rather than
    # SRC_BUCKET / DST_BUCKET - confirm the three are meant to stay identical.
    read_options = dict(sep=CSV_SEPARATOR, compression='gzip', na_values=["null", "\\N"])
    try:
        frames = [
            pd.read_csv('s3://{}/{}'.format(S3_BUCKET, f), **read_options)
            for f in files
        ]
        df = frames[0] if len(frames) == 1 else pd.concat(frames)
    except Exception as err:
        print('Skipping {}: could not read {} ({!r})'.format(destination, files, err))
        return

    df = df[list(REQUIRED_COLUMNS.keys())]
    df = df.fillna(value=REQUIRED_COLUMNS)
    df = transform_df_columns_for_dataset(df)

    if df.empty:
        return

    if is_train:
        # Column by column: a value whose count within this frame is below
        # MINIMUM_FREQUENCY is replaced by '0'; other columns pass through.
        df = df.apply(
            lambda x: x.mask(x.map(x.value_counts()) < MINIMUM_FREQUENCY, '0')
                if x.name in FILTER_LOW_FREQUENCY_COLUMNS
                else x
        )

    csv_text = df.to_csv(sep=CSV_SEPARATOR, index=False, na_rep="null")
    payload = io.BytesIO(gzip.compress(csv_text.encode('utf-8')))
    s3 = boto3.client('s3')
    try:
        s3.upload_fileobj(payload, S3_BUCKET, destination)
    except Exception as err:
        print('Upload of {} failed ({!r}); removing the object'.format(destination, err))
        # Make sure no partially written object is left at the destination.
        s3.delete_object(Bucket=S3_BUCKET, Key=destination)

In [29]:
%%time

s3_resource = boto3.resource('s3')


def _existing_basenames(split):
    """Basenames of the '.gz' objects currently under DST_PREFIX/<split>/."""
    listing = s3_resource.Bucket(DST_BUCKET).objects.filter(
        Prefix='{}/{}/'.format(DST_PREFIX, split)
    ).all()
    return [os.path.basename(obj.key) for obj in listing if obj.key.endswith('.gz')]


# Source partitions: the last NUM_OF_DAYS days (today included), all 24 hours
filter_days = []
for days_back in range(NUM_OF_DAYS):
    filter_days.append(str(datetime.date.today() - datetime.timedelta(days_back)))
filter_hours = get_hours(23, 24)

# Random sample of at most MAX_DATASET_FILES source files, split into train / eval
all_files = get_files(SRC_BUCKET, SRC_PREFIX, filter_days, filter_hours, ext='.gz')
rand.shuffle(all_files)
all_files = all_files[:MAX_DATASET_FILES]

train_length = int(len(all_files) * TRAIN_EVAL_RATIO)
train_files, eval_files = all_files[:train_length], all_files[train_length:]

train_files_basenames = list(map(os.path.basename, train_files))
eval_files_basenames = list(map(os.path.basename, eval_files))
print('Files names separated for training and evaluation')

# What is already stored at the destination
existing_train_files = _existing_basenames('train')
existing_eval_files = _existing_basenames('eval')

# Stored files that are not part of the new selection get removed ...
delete_files_train = []
for file in existing_train_files:
    if file not in train_files_basenames:
        delete_files_train.append("{}/train/{}".format(DST_PREFIX, file))

delete_files_eval = []
for file in existing_eval_files:
    if file not in eval_files_basenames:
        delete_files_eval.append("{}/eval/{}".format(DST_PREFIX, file))

# ... and selected files that are already stored are not created again
train_files = [file for file in train_files if os.path.basename(file) not in existing_train_files]
eval_files = [file for file in eval_files if os.path.basename(file) not in existing_eval_files]

delete_files(DST_BUCKET, delete_files_train)
delete_files(DST_BUCKET, delete_files_eval)

print("Deleted unnecessary files")

Files names separated for training and evaluation
Deleted unnecessary files
CPU times: user 519 ms, sys: 20 ms, total: 539 ms
Wall time: 2.52 s


In [30]:
%%time

# Build the train split first, then the eval split, announcing each step
for split_name, split_files in (('train', train_files), ('eval', eval_files)):
    print('Creating {} files'.format(split_name))
    create_dataset(split_files, split_name)
    print('Finished Creating {} files'.format(split_name))

Creating train files


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


Finished Creating train files
Creating eval files


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Finished Creating eval files
CPU times: user 2min 34s, sys: 2.62 s, total: 2min 36s
Wall time: 2min 41s


In [31]:
def calculate_stats_and_upload_to_s3(all_files):
    """Summarise the given dataset files and store the summary as JSON in S3.

    All files are read with the dtypes from ``config.get_types_of_attributes()``
    and concatenated. Object columns are summarised by ``count`` and
    ``unique``; numeric columns by ``count``, ``mean``, ``std``, ``min`` and
    ``max``. The JSON document lists the column names (all, categorical,
    numeric) next to the merged statistics and is written to
    DATA_STATS_FILE_KEY in S3_BUCKET.

    Args:
        all_files: Paths or URLs of gzip-compressed files readable by
            ``pd.read_csv``.
    """
    column_types = config.get_types_of_attributes()
    frames = (
        pd.read_csv(
            path,
            sep=CSV_SEPARATOR,
            compression='gzip',
            na_values=["null", "\\N"],
            dtype=column_types,
        )
        for path in all_files
    )
    df = transform_df_columns_for_stats(pd.concat(frames))

    def _rows_as_dict(summary, rows):
        # Round-trip through JSON so the values become plain Python types.
        return json.loads(summary.loc[rows].to_json())

    categorical = _rows_as_dict(df.describe(include='O'), ['count', 'unique'])
    numeric = _rows_as_dict(df.describe(), ['count', 'mean', 'std', 'min', 'max'])

    report = {
        'columns': {
            'all': df.columns.values.tolist(),
            'categorical': list(categorical),
            'numeric': list(numeric),
        },
        'stats': {**numeric, **categorical},
    }
    body = json.dumps(obj=report, indent=4)

    boto3.resource('s3').Bucket(S3_BUCKET).put_object(Key=DATA_STATS_FILE_KEY, Body=body)

In [32]:
%%time

s3_resource = boto3.resource('s3')

# Keys of the dataset files currently stored under DST_PREFIX/train/
train_listing = s3_resource.Bucket(DST_BUCKET).objects.filter(
    Prefix='{}/train/'.format(DST_PREFIX)
).all()
new_train_files = [obj.key for obj in train_listing if obj.key.endswith('.gz')]
l = len(new_train_files)

# Random subset of those files for the statistics.
# NOTE(review): max(..., 100) puts a floor of 100 on the limit, so a
# MAX_DATASET_FILES_FOR_STATS below 100 has no effect - confirm this is intended.
rand.shuffle(new_train_files)
stats_file_limit = min(max(MAX_DATASET_FILES_FOR_STATS, 100), l)
new_train_files = new_train_files[:stats_file_limit]

file_paths = [
    's3://{}/{}/train/{}'.format(DST_BUCKET, DST_PREFIX, os.path.basename(filename))
    for filename in new_train_files
]
calculate_stats_and_upload_to_s3(file_paths)

CPU times: user 31.1 s, sys: 1.38 s, total: 32.5 s
Wall time: 32.9 s


In [33]:
# CONFIG = config.get_config()
# dtypes = config.get_types_of_attributes()
# df = pd.concat((pd.read_csv(f, sep=CONFIG['CSV_SEPARATOR'], compression='gzip', na_values=["null", "\\N"], dtype=dtypes) for f in file_paths))

In [34]:
# df['pub_as_viewrate'] = df['pub_as_viewrate'].astype(float)
# print(df['pub_as_viewrate'].std())