In [None]:
%matplotlib inline

import math
import datetime
from enum import IntEnum

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import graphviz

from sklearn import tree, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, Imputer

import subprocess
import json

In [None]:
# The three input files share one layout: tab-separated with a header row,
# parsed with pandas' Python engine.
tsv_options = dict(header=0, sep='\t', engine="python")

md = pd.read_csv("meta_data", **tsv_options)           # meta data
cd = pd.read_csv("clicking_data", **tsv_options)       # clicking data
ed = pd.read_csv("experiment_details", **tsv_options)  # experiment details

In [None]:
# Inner SQL joins: metadata and experiment details are matched on `user_id`;
# the click data is matched through its `user_session` column.
joined = pd.merge(md, ed, how='inner', on='user_id')
joined = pd.merge(joined, cd, how='inner', left_on='user_id', right_on='user_session')

# Columns removed before mining (each name listed exactly once).
dropped_columns = [
    # replaced by the boolean `condition_1` / `clicked` columns derived below
    'condition', 'action',

    # irrelevant
    'user_id', 'experiment_id',
    'timestamp', 'action_label', 'action_type',
    'tstamp', 'user_session',
    'platform', 'etl_tstamp', 'collector_tstamp', 'dvce_created_tstamp',
    'domain_userid', 'domain_sessionid',
    'page_referrer', 'page_title', 'page_url',

    # geo_region provides enough insight, should be more precise than country
    'geo_city', 'geo_country',
    'geo_region_name', 'geo_timezone',
    'os_timezone',

    # only needed for the page_view row filter
    'event',
]

# Preprocess data for EMM: keep page-view rows only, derive the two boolean
# target columns and shorten the user agent to its first whitespace-separated
# token.
df = (
    joined[joined['event'] == 'page_view']
    .assign(
        condition_1=lambda d: d['condition'] == '1-Control',
        # NOTE(review): the literal 'clic' looks like it could be a typo for
        # 'click' -- confirm against the values in `action`.
        clicked=lambda d: d['action'] == 'clic',
        # `.str` accessors leave missing/blank user agents as NaN instead of
        # raising like `x.split()[0]` does.
        useragent=lambda d: d['useragent'].str.split().str[0],
    )
    .drop(columns=dropped_columns)
    # drop=True: do not keep the old row labels as an extra `index` column,
    # which would otherwise end up in the exported rows as a fake attribute.
    .reset_index(drop=True)
)

# Columns with fewer than two distinct values cannot split the data, so they
# are removed as well.
unique_counts = df.nunique()
constant_columns = [col for col in df.columns if unique_counts[col] < 2]
for col in constant_columns:
    print('Dropping {0} column: {1} unique values'.format(col, unique_counts[col]))
df = df.drop(columns=constant_columns)

print('Row columns: {0}'.format(df.columns.values))

In [None]:
# Number of distinct values in each remaining column of the preprocessed frame.
distinct_per_column = df.nunique()
print(distinct_per_column)

In [None]:
# Serialise the frame with pandas and parse it back, so the rows become a
# list of plain dicts (one per record) that `json.dumps` can embed.
rows = json.loads(df.to_json(orient="records"))

# Request passed to the external beam-search program.
# NOTE(review): width/depth/results/bins are interpreted by that program;
# their exact meaning is not visible in this notebook -- see beam.go.
beam_request = {
    "rows": rows,
    "width": 3,
    "depth": 3,
    "results": 20,
    "bins": 30,
    "targets": {
        "Clicked": True,
        "Condition1": True,
    },
}

encoded = json.dumps(beam_request)

In [None]:
# Save the serialised request to disk as input.json (overwriting any
# previous copy).
with open("input.json", mode="w") as request_file:
    request_file.write(encoded)

In [None]:
# Run the external beam-search binary: the JSON request goes in on stdin and
# both output streams are captured as text.
p = subprocess.run("./beam", stdout=subprocess.PIPE, stderr=subprocess.PIPE, input=encoded, universal_newlines=True)

# Surface a failed run explicitly; the captured output is still shown below.
if p.returncode != 0:
    print('./beam exited with status {0}'.format(p.returncode))

# Show at most the last 1000 characters of stderr and the first 1000 of
# stdout, one list entry per line.
# A plain negative slice is required here: `len(s) - 1000` becomes a negative
# start index for outputs shorter than 1000 characters and then cuts off the
# beginning of the text (e.g. a 999-character stderr showed only 1 character).
display(p.stderr[-1000:].split('\n'))
display(p.stdout[:1000].split('\n'))

I worked alone on the assignment.
Beam search implementation is located in `beam.go`, which is also submitted.
I tried optimizing the implementation in various ways, but execution is still very computationally intensive, so I did not manage to experiment much with different algorithm parameters (a run simply takes too long, even after extensive profiling).