In [None]:
import pandas as pd
import boto3
from bson import json_util
import gzip
import glob
from pandas.io.json import json_normalize
# S3 bucket name constant; no other visible cell of this notebook references it.
S3_BUCKET_NAME = 'einstein-s3-elasticbackup'

from einsteinds import event_processing

In [None]:
# Nested event statistics filled in by update_dict():
# category -> action -> label -> field, where every level keeps its own
# 'count', 'first_seen' and 'last_seen' entries next to its children.
cal_field_dict = {}

In [None]:
# Flattened events grouped per user, filled in by update_user_dict():
# email -> hour bucket string -> list of flattened events.
user_dict = {}

In [None]:
# Every entry exactly four directory levels below ../data/2018, in sorted order.
files = glob.glob('../data/2018/*/*/*/*', recursive=True)
files.sort()

In [None]:
def get_json_from_gzip(path):
    """Read a gzip file of concatenated JSON documents and return them as a list.

    The file is expected to hold UTF-8 encoded JSON documents written one
    after another (``{...}{...}``).  Documents are decoded one at a time
    instead of patching the text with ``replace('}{', '},{')``: that textual
    patch also rewrote any string *value* containing ``}{`` and failed when
    documents were separated by whitespace or newlines.

    Parameters
    ----------
    path : str
        Path of the gzip-compressed file.

    Returns
    -------
    list
        The decoded documents in file order; an empty list for an empty file.
        Values are converted with ``json_util.object_hook``, the same hook
        ``json_util.loads`` applies with its default options.

    Raises
    ------
    json.JSONDecodeError
        If the content is not a sequence of valid JSON documents.
    """
    import json  # local import: only this function needs the stdlib decoder

    with gzip.open(path, 'rb') as f:
        decoded = f.read().decode('utf-8')

    decoder = json.JSONDecoder(object_hook=json_util.object_hook)
    separators = ' \t\r\n,'  # commas stay accepted, as comma-separated input parsed before

    json_list = []
    position = 0
    end = len(decoded)

    while True:
        # Skip whatever sits between two documents.
        while position < end and decoded[position] in separators:
            position += 1
        if position >= end:
            break

        # raw_decode returns the document and the index just past it.
        document, position = decoder.raw_decode(decoded, position)
        json_list.append(document)

    return json_list

def flatten_event(event):
    """Flatten one nested event into a single-level dict.

    The event is run through ``json_normalize`` (nested keys are joined with
    its default separator) and the only resulting row is returned as a dict.
    """
    frame = json_normalize(event)
    rows = frame.to_dict('records')
    return rows[0]

def fix_session(event):
    """Force the label of session events to the literal ``'sessionId'``.

    Events whose ``eventCategory`` is not ``'session'`` are returned untouched.
    The event is modified in place and the same object is returned.
    """
    if event['eventCategory'] != 'session':
        return event

    event['eventLabel'] = 'sessionId'
    return event

def _bump_node_stats(parent, key, created):
    """Create ``parent[key]`` if needed, fold ``created`` into it and return it.

    A node keeps ``count`` plus the smallest (``first_seen``) and largest
    (``last_seen``) ``created`` value it has been given.
    """
    node = parent.get(key)
    if node is None:
        node = parent[key] = {'count': 0, 'first_seen': created, 'last_seen': created}

    node['first_seen'] = min(node['first_seen'], created)
    node['last_seen'] = max(node['last_seen'], created)
    node['count'] += 1
    return node


def _bump_field_stats(label_node, field, value, created):
    """Record one occurrence of ``field`` (with its value's type) under a label node."""
    stats = label_node.get(field)
    if stats is None:
        stats = label_node[field] = {}

    stats['first_seen'] = min(stats.get('first_seen', created), created)
    stats['last_seen'] = max(stats.get('last_seen', created), created)

    # Per-type occurrence counter, keyed by the repr of the value's type.
    type_counts = stats.setdefault('datatypes', {})
    type_name = str(type(value))
    type_counts[type_name] = type_counts.get(type_name, 0) + 1

    stats['count'] = stats.get('count', 0) + 1


def update_dict(event):
    """Fold one raw event into the module-level ``cal_field_dict`` statistics.

    The event is flattened and passed through ``fix_session``; its category,
    action and label then select a path of nested nodes keyed
    ``'eventCategory(<value>)'`` -> ``'eventAction(<value>)'`` ->
    ``'eventLabel(<value>)'``.  Each node on the path gets its ``count``
    incremented and its ``first_seen`` / ``last_seen`` widened with the
    event's ``created`` value.  Every other key of the flattened event
    (``created`` included) is recorded under the label node with its own
    ``first_seen``, ``last_seen``, ``datatypes`` counter and ``count``.

    ``created`` is read before anything is modified, so an event without it
    fails without leaving a half-initialised node behind.

    NOTE(review): field nodes share a dict with the label node's own
    ``count`` / ``first_seen`` / ``last_seen`` entries, so a flattened field
    with one of those names would clash with them.

    Raises
    ------
    KeyError
        If ``eventCategory``, ``eventAction``, ``eventLabel`` or ``created``
        is missing from the flattened event.
    TypeError
        If category, action or label is not a string.
    """
    event = fix_session(flatten_event(event))

    category_key = 'eventCategory(' + event['eventCategory'] + ')'
    action_key = 'eventAction(' + event['eventAction'] + ')'
    label_key = 'eventLabel(' + event['eventLabel'] + ')'
    created = event['created']

    category_node = _bump_node_stats(cal_field_dict, category_key, created)
    action_node = _bump_node_stats(category_node, action_key, created)
    label_node = _bump_node_stats(action_node, label_key, created)

    path_keys = ('eventCategory', 'eventLabel', 'eventAction')
    for field in [key for key in event if key not in path_keys]:
        _bump_field_stats(label_node, field, event[field], created)
        
def process_file_events(events):
    """Pass every event of one file, in order, to ``update_dict``."""
    for raw_event in events:
        update_dict(raw_event)
        
def process_files(files):
    """Parse each gzip file in ``files`` and fold its events into the statistics.

    A running ``Done N files`` line is printed after every file.
    """
    for done, path in enumerate(files, start=1):
        process_file_events(get_json_from_gzip(path))
        print('Done {} files'.format(done))
        

def update_user_dict(event):
    """Append one flattened event to the module-level ``user_dict``.

    Events are grouped as ``user_dict[email][hour_bucket]`` where ``email`` is
    the flattened ``metadata.email`` field and ``hour_bucket`` is derived from
    the event's ``created`` value.  Events whose email is missing, ``None`` or
    the empty string are ignored.

    The bucket key is zero-padded (``YYYYMMDD_HH``).  The previous
    non-padded concatenation was ambiguous: 11 January and 1 November both
    produced ``2018111_<hour>``, so events of different days ended up in the
    same bucket.

    Raises
    ------
    KeyError
        If the flattened event has no ``eventCategory`` or ``created`` key.
    """
    event = fix_session(flatten_event(event))

    created = event['created']
    year_month_day_hour = '{}{:02d}{:02d}_{:02d}'.format(
        created.year, created.month, created.day, created.hour)

    email = event.get('metadata.email')
    if email is None or email == '':
        return

    user_dict.setdefault(email, {}).setdefault(year_month_day_hour, []).append(event)


def files_to_user_dict(files):
    """Parse each gzip file in ``files`` and add its events to ``user_dict``.

    A running ``Done N files`` line is printed after every file.
    """
    for done, path in enumerate(files, start=1):
        for raw_event in get_json_from_gzip(path):
            update_user_dict(raw_event)

        print('Done {} files'.format(done))

In [None]:
# Populate the module-level user_dict from every path in `files`
# (prints one progress line per file).
files_to_user_dict(files)

In [None]:
# Write one JSON file per user under ../data.
# NOTE(review): the file name is the sanitised email, lower-cased and cut to
# 30 characters, so two distinct emails can map to the same path; the later
# write then replaces the earlier file.
for user, user_data in user_dict.items():
    user_valid = user.replace('@', 'at').replace('/', '').replace("\\", '')
    user_path = '../data/user_data_{}.json'.format(user_valid.lower()[0:30])

    # Serialise before opening: mode 'w' truncates the target immediately, so
    # a failing dumps() would otherwise leave an empty file behind.
    user_json = json_util.dumps(user_data)

    with open(user_path, 'w') as outfile:
        outfile.write(user_json)

In [None]:
# Populate the module-level cal_field_dict from every path in `files`
# (prints one progress line per file).
process_files(files)

In [None]:
# Serialise before opening: mode 'w' truncates the target immediately, so a
# failing dumps() would otherwise leave an empty event_info.json behind.
event_info_json = json_util.dumps(cal_field_dict)

with open('../data/event_info.json', 'w') as outfile:
    outfile.write(event_info_json)

In [None]:
# Print every (level-1, level-2, level-3) key path of cal_field_dict.
# Level-2 entries that are not dicts (the category's own statistics) are skipped.
for key1, level1 in cal_field_dict.items():
    for key2, level2 in level1.items():
        if not isinstance(level2, dict):
            continue
        for key3 in level2:
            print(key1, key2, key3)

In [None]:
# Paths of the per-user JSON dumps under ../data; glob returns them in
# arbitrary (unsorted) order.
user_files = glob.glob('../data/user_data*.json')

In [None]:
def load_bson_from_file(filepath):
    """Read a text file, drop its newline characters and parse it with ``json_util.loads``."""
    with open(filepath, 'r') as handle:
        raw_text = handle.read()

    single_line = raw_text.replace('\n', '')
    return json_util.loads(single_line)


def load_all_events(user_dict):
    """Concatenate all bucket lists of ``user_dict`` into one flat list.

    Buckets are visited in the mapping's iteration order; the input is not
    modified.
    """
    merged = []
    for bucket in user_dict.values():
        merged.extend(bucket)
    return merged


def get_all_user_events(fp):
    """Load the file at ``fp`` and return the events of all its buckets as one list."""
    buckets = load_bson_from_file(fp)
    return load_all_events(buckets)

In [None]:
# Load the raw events of the first file only (files[:1]) as a small sample.
events = []

for file in files[:1]:
    events.extend(get_json_from_gzip(file))

In [None]:
# Run every sampled event through event_processing.clean_event_minimal and
# collect the results in a DataFrame.
clean_records = [event_processing.clean_event_minimal(event) for event in events]
clean_events = pd.DataFrame(clean_records)

In [None]:
# Bare expression: shows the column index of clean_events as the cell output.
clean_events.columns

In [None]:
# Keys that hold a node's own statistics rather than a child node.
_stat_keys = ['first_seen', 'last_seen', 'count']

# First/last seen per category.
categories = [
    {'event_category': cat,
     'first': cat_node['first_seen'],
     'last': cat_node['last_seen']}
    for cat, cat_node in cal_field_dict.items()
]

# First/last seen per (category, action).
actions = [
    {'event_category': cat, 'event_action': act,
     'first': act_node['first_seen'],
     'last': act_node['last_seen']}
    for cat, cat_node in cal_field_dict.items()
    for act, act_node in cat_node.items() if act not in _stat_keys
]

# First/last seen per (category, action, label).
labels = [
    {'event_category': cat, 'event_action': act, 'event_label': lab,
     'first': lab_node['first_seen'],
     'last': lab_node['last_seen']}
    for cat, cat_node in cal_field_dict.items()
    for act, act_node in cat_node.items() if act not in _stat_keys
    for lab, lab_node in act_node.items() if lab not in _stat_keys
]

# First/last seen per (category, action, label, field).
fields = [
    {'event_category': cat, 'event_action': act, 'event_label': lab, 'field': fld,
     'first': fld_node['first_seen'],
     'last': fld_node['last_seen']}
    for cat, cat_node in cal_field_dict.items()
    for act, act_node in cat_node.items() if act not in _stat_keys
    for lab, lab_node in act_node.items() if lab not in _stat_keys
    for fld, fld_node in lab_node.items() if fld not in _stat_keys
]

In [None]:
# One row per (category, action, label, field) record of `fields`, with its
# 'first' and 'last' values.
dates = pd.DataFrame(fields)

In [None]:
# Earliest 'first' and latest 'last' per (event_category, field), shown as the
# cell output ordered by category, first appearance and field name.
(
    dates
    .groupby(['event_category', 'field'])[['first', 'last']]
    .agg({'first': 'min', 'last': 'max'})
    .reset_index()
    .sort_values(['event_category', 'first', 'field'])
)