In [None]:
import numpy as np 
import pandas as pd
import os
from csv import DictReader
from vowpalwabbit import pyvw
from datetime import datetime

In [None]:
class CFG:
    """Notebook-wide settings, accessed as plain class attributes."""

    # how many of the f_* columns (starting at f_0) are used as numerical inputs
    nof_cols = 300
    # number of decimal places kept when a numerical feature value is written out
    roundoff = 4
    # NOTE(review): not referenced in the visible cells - confirm purpose or remove
    cutoff_point = 1200

# Functions

Our workhorse function: it takes a single row of the train / test data, already parsed into a dictionary,
and builds a string matching the VW input format:
https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Input-format

In [None]:

def process_line(row, train = True):
    """Convert one data row into a line in Vowpal Wabbit input format.

    Parameters
    ----------
    row : dict
        Column name -> value for a single row. Must contain 'row_id', and
        'target' when ``train`` is True. Values may be strings (as produced
        by csv.DictReader) or numbers (as produced by a pandas row's to_dict()).
    train : bool, default True
        If True, the label is taken from row['target']; otherwise the
        placeholder label '-1' is written.

    Returns
    -------
    str
        '<label> <row_id>|<numerical features>|<categorical features>', where
        each numerical feature is ' <name>:<value>' and each categorical
        feature is ' invid_<value>'.

    Relies on the module-level ``num_cols``, ``cat_cols`` and ``CFG.roundoff``.
    """
    # collect the pieces of the numerical and categorical sections separately
    # and join once at the end (avoids repeated string re-allocation)
    numeric_parts = []
    categorical_parts = []

    # loop over entries in our dictionary representing a row
    for k, v in row.items():
        if k in num_cols:
            # builtin float() replaces the np.float alias, which was removed
            # from NumPy; np.round is kept so values are formatted as before
            value = np.round(float(v), CFG.roundoff)
            # underscores are stripped from the column name (f_0 -> f0)
            numeric_parts.append(" %s:%s" % (k.replace('_', ''), value))
        if k in cat_cols:
            categorical_parts.append(" invid_%s" % (v,))

    # str() keeps the concatenation below valid when the row holds
    # non-string values (e.g. a pandas row converted with to_dict())
    label = str(row['target']) if train else '-1'

    return (label + ' ' + str(row['row_id'])
            + '|' + ''.join(numeric_parts)
            + '|' + ''.join(categorical_parts))

# Data


In [None]:
# peek at the first ten rows of the training file to see which columns exist
xdat = pd.read_csv(
    '../input/ubiquant-market-prediction/train.csv',
    nrows=10,
)
xdat.head()

The Vowpal Wabbit input format requires us to define which columns are numerical and which are categorical. For faster iteration, the number of numerical columns used can be restricted via `CFG.nof_cols` (currently 300, i.e. all of f_0, ..., f_299) - checking if we can get away with using everything and staying within Kaggle kernel execution time is on my TODO list ;-) 

In [None]:
# numerical inputs: f_0, f_1, ... up to (but excluding) f_<CFG.nof_cols>;
# with CFG.nof_cols = 300 this is the full set f_0, ..., f_299
num_cols = ['f_%d' % idx for idx in range(CFG.nof_cols)]

# categorical inputs: investment_id is the only one that makes sense as model
# input - time_id does not overlap anyway, since we are dealing with a
# time series problem
cat_cols = ['investment_id']


# Model

For a full overview of command line arguments: https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Command-Line-Arguments

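We use a linear model (no interactions), optimising MSE. The training code is kept in the cell below, commented out; what actually runs is the loading of a previously saved model from `../input/vw-models/xmodel_full_v1.vw`: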

In [None]:
# Training code, currently disabled and kept for reference: it streams
# train.csv row by row and updates the model one example at a time.
# NOTE(review): presumably this is what produced xmodel_full_v1.vw - confirm.
#
# vw = pyvw.vw(b=21, random_seed=17, loss_function='squared', passes = 1, learning_rate=0.7, k=True, 
#              c=True,  quiet= False)

# start_time = datetime.now()
# for e, row in enumerate( DictReader(open('../input/ubiquant-market-prediction/train.csv')) ):
#     if e % 1e5 == 0: print("{}\t{} passed.".format(e, datetime.now() - start_time))
#     # training Vowpal Wabbit with current example
#     vw.learn(process_line(row))
    
    
# what actually runs: load a previously saved regressor from disk
vw = pyvw.vw(initial_regressor = '../input/vw-models/xmodel_full_v1.vw')    

# Eval and submit

In [None]:
# we skip evaluation for now ;-) 

import ubiquant


In [None]:
# iterate over the test batches handed out by the competition API; each batch
# comes with a prediction frame that is filled in and passed to env.predict
env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:

    # build predictions for this segment using the same logic as for training,
    # i.e. creating VW-formatted lines on the fly; the only change is that the
    # rows come from a pandas dataframe (as one dict per row) instead of being
    # read line-by-line from a file
    predictions = [
        vw.predict(process_line(row, train = False))
        for row in test_df.to_dict(orient = 'records')
    ]

    sample_prediction_df['target'] = predictions
    # reassign instead of fillna(..., inplace=True) on the selected column:
    # the chained in-place form does not update the frame under pandas
    # Copy-on-Write and triggers a warning on pandas 2.x
    sample_prediction_df['target'] = sample_prediction_df['target'].fillna(0)
    env.predict(sample_prediction_df)