In [1]:
import os, glob, gzip
import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

> #### Iterate through the directory recursively and extract the tar.gz files

In [2]:
def get_gzip_file_paths(root_dir):
    
    '''
        Gzipped files are where the clickthrough data is stored.
        Collect all the gzip paths recursively starting the traversal
        from the root_directory
        
        @return List of gzips (clickthrough files)
    '''
    
    gzip_file_paths = []
    for subdir, dirs, files in tqdm(os.walk(root_dir)):

        for file in files:
            filepath = os.path.join(subdir, file)
            if filepath.endswith(".gz"):
                gzip_file_paths.append(filepath)
                
    return gzip_file_paths

In [3]:

def parse_clickthrough_gzip(path):
    
    try:
        df = pd.read_json(path, lines=True, compression='gzip')
        df["filepath"] = path.split("clickstream_data_2016")[1][1:]
        df["request_keys"] = df["request"].apply(lambda x: x.keys())
        df["server_request_keys"] = df["server_request"].apply(lambda x: x.keys())

        request_attributes_all = [list(x) for x in df["request_keys"].tolist()]
        request_attributes = list(set([item for sublist in request_attributes_all for item in sublist]))
        for attribute in request_attributes:
            df[attribute] = df["request"].apply(lambda x: x[attribute] if attribute in x else None)

        serv_request_attributes_all = [list(x) for x in df["server_request_keys"].tolist()]
        serv_request_attributes = list(set([item for sublist in serv_request_attributes_all for item in sublist]))
        for attribute in serv_request_attributes:
            df[attribute] = df["server_request"].apply(lambda x: x[attribute] if attribute in x else None)


        del df["request_keys"]
        del df["server_request_keys"]
        del df["request"]
        del df["server_request"]
        return df
    except:
        return None
        

def read_gzip_files(gzip_file_paths):
    
    '''
    Read the gzipped files into a dataframe and 
    construct the clickthrough attributes.
    
    @return List of dataframes with clickthrough attributes, 
            one for each file
    @return List of dataframes which have
    '''

    json_data = list(map(parse_clickthrough_gzip, gzip_file_paths))
    failed_files = [gzip_file_paths[i] for i, df in enumerate(json_data) if df is None]
    
    return json_data, failed_files

### Validation

In [4]:
def get_all_headers(dfs):
    
    '''
    Most dataframes might not have the full set of keys.
    Get every key which was present in request/ server_request.
    '''
    
    headers = [x.columns.tolist() for x in dfs]
    headers = list(set([item for sublist in headers for item in sublist]))
    return headers


def format_clickthrough_dataframes(df, headers_to_include):

    
    missing_cols = [x for x in headers_to_include if x not in df.columns.tolist()]
    if len(missing_cols) > 0:
        for col in missing_cols:
            df[col] = None
    df = df[headers_to_include]
    return df

In [5]:
# Ideally this has to be read in from a configuration file. 
# However, reading from a variable for now.
root_directory = '/home/sree/code/apptio-data/clickstream_data_2016'


gzip_paths = get_gzip_file_paths(root_directory)
clickthrough_dfs, failed_files = read_gzip_files(gzip_paths[:4])


headers = get_all_headers(clickthrough_dfs)

# Using all headers for now. Later depending on the usecase, we can use a subset of these
# columns for our analysis
clickthrough_dfs = [format_clickthrough_dataframes(df, headers) for df in clickthrough_dfs]

input_data = pd.concat(clickthrough_dfs)
input_data.head()

205it [00:00, 36503.18it/s]


Unnamed: 0,eventId,responseHeaders,parentFrameId,is_online,user_guid,error,windowTitle,tabId,method,fromCache,url,mainFrameRequestId,filepath,requestHeaders,software_id,partner_id,user_agent,type,statusLine,windowName,documentReferer,request_unixtime,redirectUrl,timeStamp,country_code,statusCode,frameId,navigationId,requestType,ip,x_forwarded_for,user_map,accept_language,openerTabId,requestId
0,,"{'Vary': 'User-Agent', 'Date': 'Wed, 07 Sep 20...",0,,2f8b23ca273de94a51281b0697a126d7,,,335,GET,False,http://s.amazon-adsystem.com/iu3?d=imdb.com&a1...,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",d2c091f86191954cdf6e24beb1d2092a,1697b2090acdf06d,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,sub_frame,HTTP/1.1 200 OK,,,1473207176,,1473207000000.0,US,200,3807,,main,54.239.31.89,69.207.104.248,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},"en-US,en;q=0.8",,46079
1,,"{'Date': 'Wed, 07 Sep 2016 00:12:37 GMT', 'Tra...",-1,,2f8b23ca273de94a51281b0697a126d7,,"""NCIS"" (2003)",335,GET,False,http://www.imdb.com/title/tt0364845/eprate?ref...,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",d2c091f86191954cdf6e24beb1d2092a,1697b2090acdf06d,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,main_frame,HTTP/1.1 200 OK,,http://www.imdb.com/title/tt0364845/episodes?r...,1473207176,,1473207000000.0,US,200,0,,main,207.171.162.180,69.207.104.248,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},"en-US,en;q=0.8",,46030
2,,"{'content-type': 'text/html', 'status': '200',...",0,,2f8b23ca273de94a51281b0697a126d7,,,335,GET,False,https://www.facebook.com/widgets/like.php?widt...,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",d2c091f86191954cdf6e24beb1d2092a,1697b2090acdf06d,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,sub_frame,HTTP/1.1 200,,,1473207159,,1473207000000.0,US,200,3799,,main,31.13.71.36,69.207.104.248,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},"en-US,en;q=0.8",,46001
3,,"{'Vary': 'User-Agent', 'Date': 'Wed, 07 Sep 20...",0,,2f8b23ca273de94a51281b0697a126d7,,,335,GET,False,http://s.amazon-adsystem.com/iu3?d=imdb.com&a1...,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",d2c091f86191954cdf6e24beb1d2092a,1697b2090acdf06d,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,sub_frame,HTTP/1.1 200 OK,,,1473207159,,1473207000000.0,US,200,3801,,main,54.239.31.89,69.207.104.248,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},"en-US,en;q=0.8",,46002
4,,"{'Content-Language': 'en-US', 'Date': 'Wed, 07...",-1,,2f8b23ca273de94a51281b0697a126d7,,NCIS (TV Series 2003â ) - Episodes - IMDb,335,GET,True,http://www.imdb.com/title/tt0364845/episodes?r...,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",d2c091f86191954cdf6e24beb1d2092a,1697b2090acdf06d,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,main_frame,HTTP/1.1 200 OK,,http://www.imdb.com/title/tt0364845/?ref_=nv_sr_1,1473207159,,1473207000000.0,US,200,0,,main,207.171.162.180,69.207.104.248,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},"en-US,en;q=0.8",,45987


    0. Has gzip files been corrupted
    1. No empty files
    2. Has all mandatory columns
    3. See if the filepath url is valid
    4. check for a filter condition

In [6]:
input_data.shape

(5484, 35)

# Make sure that all columns exist in all the dataframes

## Convert the timestamps to an understandable format

In [6]:
input_data["request_datetime"] = input_data["request_unixtime"].apply(lambda x: datetime.fromtimestamp(x))
input_data.head()

Unnamed: 0,accept_language,country_code,documentReferer,error,eventId,filepath,frameId,fromCache,ip,is_online,mainFrameRequestId,method,navigationId,openerTabId,parentFrameId,partner_id,redirectUrl,requestHeaders,requestId,requestType,request_unixtime,responseHeaders,software_id,statusCode,statusLine,tabId,timeStamp,type,url,user_agent,user_guid,user_map,windowName,windowTitle,x_forwarded_for,request_datetime
0,"en-US,en;q=0.8",US,,,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,3807,False,54.239.31.89,,,GET,,,0,1697b2090acdf06d,,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",46079,main,1473207176,"{'Vary': 'User-Agent', 'Date': 'Wed, 07 Sep 20...",d2c091f86191954cdf6e24beb1d2092a,200,HTTP/1.1 200 OK,335,1473207000000.0,sub_frame,http://s.amazon-adsystem.com/iu3?d=imdb.com&a1...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2f8b23ca273de94a51281b0697a126d7,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},,,69.207.104.248,2016-09-06 20:12:56
1,"en-US,en;q=0.8",US,http://www.imdb.com/title/tt0364845/episodes?r...,,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,0,False,207.171.162.180,,,GET,,,-1,1697b2090acdf06d,,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",46030,main,1473207176,"{'Date': 'Wed, 07 Sep 2016 00:12:37 GMT', 'Tra...",d2c091f86191954cdf6e24beb1d2092a,200,HTTP/1.1 200 OK,335,1473207000000.0,main_frame,http://www.imdb.com/title/tt0364845/eprate?ref...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2f8b23ca273de94a51281b0697a126d7,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},,"""NCIS"" (2003)",69.207.104.248,2016-09-06 20:12:56
2,"en-US,en;q=0.8",US,,,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,3799,False,31.13.71.36,,,GET,,,0,1697b2090acdf06d,,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",46001,main,1473207159,"{'content-type': 'text/html', 'status': '200',...",d2c091f86191954cdf6e24beb1d2092a,200,HTTP/1.1 200,335,1473207000000.0,sub_frame,https://www.facebook.com/widgets/like.php?widt...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2f8b23ca273de94a51281b0697a126d7,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},,,69.207.104.248,2016-09-06 20:12:39
3,"en-US,en;q=0.8",US,,,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,3801,False,54.239.31.89,,,GET,,,0,1697b2090acdf06d,,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",46002,main,1473207159,"{'Vary': 'User-Agent', 'Date': 'Wed, 07 Sep 20...",d2c091f86191954cdf6e24beb1d2092a,200,HTTP/1.1 200 OK,335,1473207000000.0,sub_frame,http://s.amazon-adsystem.com/iu3?d=imdb.com&a1...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2f8b23ca273de94a51281b0697a126d7,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},,,69.207.104.248,2016-09-06 20:12:39
4,"en-US,en;q=0.8",US,http://www.imdb.com/title/tt0364845/?ref_=nv_sr_1,,,09/07/00/US/CR/1697b2090acdf06d/part-00014-000...,0,True,207.171.162.180,,,GET,,,-1,1697b2090acdf06d,,"{'Upgrade-Insecure-Requests': '1', 'Accept-Lan...",45987,main,1473207159,"{'Content-Language': 'en-US', 'Date': 'Wed, 07...",d2c091f86191954cdf6e24beb1d2092a,200,HTTP/1.1 200 OK,335,1473207000000.0,main_frame,http://www.imdb.com/title/tt0364845/episodes?r...,Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.3...,2f8b23ca273de94a51281b0697a126d7,{'tiger': '0b3cefeb-182c-9f91-b3ae-836baaa448c8'},,NCIS (TV Series 2003â ) - Episodes - IMDb,69.207.104.248,2016-09-06 20:12:39


#### Profile this data (pandas profiling works better here, unfortunately a system error stops me from doing this)