In [1]:
from sagemaker import get_execution_role
from sagemaker.session import Session
from sagemaker.tensorflow import TensorFlow
import boto3
import random
import os
import config
import json
import pandas as pd
import datetime
import gzip
import io

In [2]:
LABELS = ['click']
LABEL = 'click'
CSV_SEPARATOR = '\t'
DATA_STATS_FILE_KEY='trainer_predict_imp/data/stats_click.json'
S3_BUCKET = 'wsbidder'

In [3]:
def delete_files(bucket, prefix):
    for obj in boto3.resource('s3').Bucket(bucket).objects.filter(Prefix=prefix).all():
        obj.delete()

def get_files(bucket, prefix, days=[], hours=[], ext='.csv'):
    all_files = []
    s3_resource = boto3.resource('s3')
    for day in days:
        for hour in hours:
            _prefix = '{}/d={}/h={}/'.format(prefix, day, hour)
            new_files = [ obj.key for obj in s3_resource.Bucket(bucket).objects.filter(Prefix=_prefix).all() if obj.key.endswith(ext) ]
            all_files = all_files + new_files
            
    return all_files
             

In [4]:
def calculate_stats_and_upload_to_s3(all_files):
    CONFIG = config.get_config()
    dtypes = config.get_types_of_attributes()
    df = pd.concat((pd.read_csv(f, sep=CSV_SEPARATOR, compression='gzip', na_values=["null", "\\N"], dtype=dtypes) for f in all_files))
    stats_categorical = json.loads(df.describe(include='O').loc[[
        'count', 'unique'
    ]].to_json())
    stats_numeric = json.loads(df.describe().loc[[
        'count', 'mean', 'std', 'min', 'max'
    ]].to_json())

    weights = json.loads(df['deliveryid'].groupby([ df[label] for label in LABELS ]).agg(['count']).to_json())
    columns = df.columns.values
    
    STATS = json.dumps(obj={
            'columns': {
                'all': columns.tolist(),
                'categorical': list(stats_categorical.keys()),
                'numeric': list(stats_numeric.keys())
            },
            'stats': { **stats_numeric , **stats_categorical },
            'weights': { **weights }
        }, indent=4)

    s3 = boto3.resource('s3')
    s3.Bucket(S3_BUCKET).put_object(Key=DATA_STATS_FILE_KEY, Body=STATS)

In [5]:
def get_hours(startHour=None, numberOfHours=6):
    numberOfHours = numberOfHours % 25
    if startHour is None:
        startHour = datetime.datetime.now().hour
    hours = ["{:02d}".format( (hour + 24) % 24 ) for hour in range(startHour, startHour - numberOfHours, -1)]
    
    return hours

# print(get_hours(23, 24))

In [6]:
def sanitize_url(row):
    url = str(row['pub_as_pageurl'])
    url = url.replace('https://', '')
    url = url.replace('http://', '')
    url = url.replace('www.', '')

    return url

def transform_and_save_file(files, destination, is_train=True):
    
    CONFIG = config.get_config()
    dtypes = config.get_types_of_attributes()
    df = pd.concat((pd.read_csv('s3://{}/{}'.format(S3_BUCKET, f), sep=CSV_SEPARATOR, compression='gzip', na_values=["null", "\\N"], dtype=dtypes) for f in files))

    df = df[['deliveryid',
             'dayofweek',
             'hour',
             'pub_sspid',
             'pub_as_adspaceid',
             'pub_as_domain',
             'pub_as_dimensions',
             'pub_as_position',
             'device_os',
             'device_model',
             'user_ip',
             'user_market',
             'user_city',
              LABEL]]
    missing_values = {
        'deliveryid': '0',
        'dayofweek': 0,
        'hour': 0,
        'pub_sspid': '0',
        'pub_as_adspaceid': '0',
        'pub_as_domain': '0',
        'pub_as_dimensions': '0',
        'pub_as_position': '0',
        'device_os': '0',
        'device_model': '0',
        'user_ip': '0',
        'user_market': '0',
        'user_city': '0',
         LABEL: 0
    }
    df = df.fillna(value=missing_values)

    print('Read file')
    df['dayofweek_hour'] = df['dayofweek']*24 + df['hour']
    print('Added dayofweek_hour')
    df['pub_as_domain'] = df['pub_as_domain'].astype(str).str.lower()
    df['user_city'] = df['user_city'].astype(str).str.lower()
    df['domain_position'] = df['pub_as_domain'].astype(str) + df['pub_as_position'].astype(str)
    print('Added domain_position')
#     df['pub_as_pageurl'] = df.apply(lambda row: sanitize_url(row), axis=1)
#     print('Sanitized pub_as_pageurl')
    if is_train:
        df = df.apply(lambda x: x.mask(x.map(x.value_counts()) < 100, '0') if x.name in ['pub_sspid', 'pub_accountid', 'pub_as_siteid', 'pub_as_adspaceid', 'pub_as_domain', 'pub_as_pageurl', 'pub_as_dimensions', 'pub_as_position', 'device_os', 'device_model', 'user_market', 'user_city'] else x)
    print('Removed low frequent')
    total = float(df['deliveryid'].count())
    positive = float(df[df[LABEL] == 1].count()[LABEL])
    positive_weight = 0.5 * (total / positive)
    negative_weight = 0.5 * (total / (total - positive))
    print('Positive weight {}'.format(positive_weight))
    print('Negative weight {}'.format(negative_weight))
    def get_weight(row):
        if int(row[LABEL]) == 1:
            return positive_weight
        return negative_weight
    df['weight'] = df.apply (lambda row: get_weight(row), axis=1)
#     df['weight'] = positive_weight
#     df[df['imp_6'] == 0]['weight'] = negative_weight
    print('Added weight')
#     df.head()

#     df = df.drop(['dayofweek', 'hour', 'pub_as_position', 'pub_as_domain'], axis=1)
#     print('Dropped columns')
    new_data = df.to_csv(sep=CSV_SEPARATOR, index=False, na_rep="null")
    new_data = gzip.compress(bytes(new_data, 'utf-8'))
    io_data = io.BytesIO(new_data)
    s3 = boto3.client('s3')
    s3.upload_fileobj(io_data, S3_BUCKET, destination)

In [7]:
%%time

SRC_BUCKET = 'wsbidder'
DST_BUCKET = 'wsbidder'

SRC_PREFIX = 'tsv/etl/imp-pred-service-v1/imppredservice_training_data'
DST_PREFIX = 'trainer_predict_imp/data'

delete_files(DST_BUCKET, DST_PREFIX + '/train_click')
delete_files(DST_BUCKET, DST_PREFIX + '/eval_click')

eval_files_date = str( datetime.date.today() - datetime.timedelta(1) )
filter_days = [ str( datetime.date.today() - datetime.timedelta(i + 1) ) for i in range(8) ]
filter_hours = get_hours(23, 24)

all_files = get_files(SRC_BUCKET, SRC_PREFIX, filter_days, filter_hours, ext='.gz')

train_files = [ file_path for file_path in all_files if 'd={}'.format(eval_files_date) not in file_path ]
eval_files = [ file_path for file_path in all_files if 'd={}'.format(eval_files_date) in file_path ]
print('Files names separated for training and evaluation')

Files names separated for training and evaluation
CPU times: user 1.32 s, sys: 41.1 ms, total: 1.36 s
Wall time: 7.11 s


In [8]:
%%time

l = len(train_files)
step=1000
for start in range(0, l, step):
    end = min(start + step - 1, l)
    print('Creating training file...{} - {}'.format(start, end))
    transform_and_save_file(train_files[start: end], '{}/train_click/train_{}_{}.csv.gz'.format(DST_PREFIX, start, end))
    print('Finished Creating training file {} - {}'.format(start, end))
print('Creating eval file...')
transform_and_save_file(eval_files, '{}/eval_click/eval.csv.gz'.format(DST_PREFIX), is_train=False)
print('Finished Creating eval file')

Creating training file...0 - 999
Read file
Added dayofweek_hour
Added domain_position
Removed low frequent
Positive weight 403.3071705426357
Negative weight 0.5006206443635629
Added weight
Finished Creating training file 0 - 999
Creating training file...1000 - 1999
Read file
Added dayofweek_hour
Added domain_position
Removed low frequent
Positive weight 419.42261904761904
Negative weight 0.5005967689225479
Added weight
Finished Creating training file 1000 - 1999
Creating training file...2000 - 2016
Read file
Added dayofweek_hour
Added domain_position
Removed low frequent
Positive weight 553.7727272727273
Negative weight 0.5004518567203418
Added weight
Finished Creating training file 2000 - 2016
Creating eval file...
Read file
Added dayofweek_hour
Added domain_position
Removed low frequent
Positive weight 348.11777108433733
Negative weight 0.5007191807231839
Added weight
Finished Creating eval file
CPU times: user 6min 7s, sys: 13.4 s, total: 6min 20s
Wall time: 8min 30s


In [9]:
%%time

s3_resource = boto3.resource('s3')
new_train_files = [ obj.key for obj in s3_resource.Bucket(DST_BUCKET).objects.filter(Prefix='{}/train_click/'.format(DST_PREFIX)).all() if obj.key.endswith('.gz') ]
# print(new_train_files)
calculate_stats_and_upload_to_s3([ 's3://{}/{}/train_click/{}'.format(DST_BUCKET, DST_PREFIX, os.path.basename(filename)) for filename in new_train_files ])

CPU times: user 56.7 s, sys: 2.89 s, total: 59.6 s
Wall time: 1min 1s
