<h1>Using Vowpal Wabbit for Recommendations</h1>

This notebook uses [Vowpal Wabbit](https://github.com/VowpalWabbit/vowpal_wabbit) (`vw`) to predict MovieLens ratings with the following approaches:

- linear / logistic regression
- matrix factorization

Background on low-rank matrix factorization with Vowpal Wabbit: https://stackoverflow.com/questions/39040721/vowpal-wabbit-low-rank-matrix-factorization

In [90]:
import sys
sys.path.append('..')

In [47]:
import os
from subprocess import run
from tempfile import TemporaryDirectory

import pandas as pd

from reco_utils.dataset.movielens import load_pandas_df
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import rmse, mae, exp_var, rsquared

In [12]:
def to_vw(df, output):
    """Convert Pandas DataFrame to vw input format

    Every row is written as one line of the form
    ``<rating> <index>|u <userID> |i <itemID>``, where ``<index>`` is the
    row's index label in ``df``.

    Args:
        df (pd.DataFrame): input DataFrame with 'rating', 'userID' and 'itemID' columns
        output (str): path to output file
    """
    line = '{rating} {index}|u {userID} |i {itemID}\n'
    # Walk the needed columns side by side instead of using iterrows():
    # iterrows() packs each row into a single Series, which upcasts all values
    # to one dtype (integer ids turn into '1.0' as soon as any column is float)
    # and is considerably slower on large frames.
    rows = zip(df.index, df['rating'], df['userID'], df['itemID'])
    with open(output, 'w') as f:
        f.writelines(
            line.format(rating=rating, index=index, userID=user, itemID=item)
            for index, rating, user, item in rows
        )

In [74]:
def evaluate(test, prediction_path):
    """Convenience function to show metrics of interest

    Reads a whitespace-separated prediction file whose first column is the
    prediction and whose second column is the row label, joins it to ``test``
    on that label and prints RMSE, MAE, R squared and explained variance.

    Args:
        test (pd.DataFrame): held-out ratings, indexed by the labels found in
            the second column of the prediction file
        prediction_path (str): path to the prediction file
    """
    # Name both columns explicitly and use sep=r'\s+' (delim_whitespace is
    # deprecated in pandas) so the parse does not depend on implicit handling
    # of an unnamed second column.
    preds = pd.read_csv(
        prediction_path,
        sep=r'\s+',
        header=None,
        names=['prediction', 'tag'],
        index_col='tag',
    ).rename_axis(None).join(test)
    # ensure results are integers, clamped to the range 1..5
    preds['prediction'] = preds['prediction'].apply(lambda x: int(max(1, min(5, round(x)))))

    for metric in [rmse, mae, rsquared, exp_var]:
        print('{name}: {metric}'.format(name=metric.__name__.upper(), metric=metric(test, preds)))

In [92]:
# temporary working directory holding the model, the vw-formatted data and the predictions
tmpdir = TemporaryDirectory()

model_path, train_path, test_path, prediction_path = (
    os.path.join(tmpdir.name, filename)
    for filename in ('vw.model', 'train.dat', 'test.dat', 'prediction.dat')
)

In [96]:
# load movielens data (defaults to 100k dataset)
df = load_pandas_df()

# store ratings as integers, which keeps the vw formatting simple
df = df.assign(rating=df['rating'].astype('int64'))

# stratified split: default values take 75% of each users ratings as train, and 25% as test
train, test = python_stratified_split(df)

# write both splits to disk in vw format
to_vw(df=train, output=train_path)
to_vw(df=test, output=test_path)

In [97]:
# logistic loss with --oaa 5
# Arguments are passed as lists so that paths containing spaces stay intact.
train_params = ['vw', '--loss_function', 'logistic', '--oaa', '5', '-f', model_path, '-d', train_path]
test_params = ['vw', '--loss_function', 'logistic', '-i', model_path, '-d', test_path, '-t', '-p', prediction_path]

# check=True raises CalledProcessError when vw exits with an error, so a failed
# run is never scored against a prediction file left over from an earlier run
run(train_params, check=True)
run(test_params, check=True)
evaluate(test=test, prediction_path=prediction_path)

RMSE: 1.096127511484472
MAE: 0.7486004478566859
RSQUARED: 0.0619844518503353
EXP_VAR: 0.09174928195842325


In [98]:
# vw with no extra learner options
# Arguments are passed as lists so that paths containing spaces stay intact.
train_params = ['vw', '-f', model_path, '-d', train_path]
test_params = ['vw', '-i', model_path, '-d', test_path, '-t', '-p', prediction_path]

# check=True raises CalledProcessError when vw exits with an error, so a failed
# run is never scored against a prediction file left over from an earlier run
run(train_params, check=True)
run(test_params, check=True)
evaluate(test=test, prediction_path=prediction_path)

RMSE: 1.0062584072124479
MAE: 0.727447216890595
RSQUARED: 0.2094908073935564
EXP_VAR: 0.20961863654797408


In [101]:
# https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Matrix-factorization-example
# Arguments are passed as lists so that paths containing spaces stay intact.
train_params = ['vw', '--rank', '5', '-qui', '-f', model_path, '-d', train_path]
test_params = ['vw', '-i', model_path, '-d', test_path, '-t', '-p', prediction_path]

# check=True raises CalledProcessError when vw exits with an error, so a failed
# run is never scored against a prediction file left over from an earlier run
run(train_params, check=True)
run(test_params, check=True)
evaluate(test=test, prediction_path=prediction_path)

RMSE: 1.0417354423872618
MAE: 0.7565978886756238
RSQUARED: 0.15276719934656535
EXP_VAR: 0.1679807175013488


In [100]:
# https://github.com/VowpalWabbit/vowpal_wabbit/tree/master/demo/movielens
# Arguments are passed as lists so that paths containing spaces stay intact.
train_params = ['vw', '--lrq', 'ui5', '-f', model_path, '-d', train_path]
test_params = ['vw', '-i', model_path, '-d', test_path, '-t', '-p', prediction_path]

# check=True raises CalledProcessError when vw exits with an error, so a failed
# run is never scored against a prediction file left over from an earlier run
run(train_params, check=True)
run(test_params, check=True)
evaluate(test=test, prediction_path=prediction_path)

RMSE: 1.022910147347139
MAE: 0.7394033909149073
RSQUARED: 0.18311136391545657
EXP_VAR: 0.18355330450522944


In [88]:
# TODO: show example of get_top_k

In [89]:
# remove the temporary directory together with the files written into it
tmpdir.cleanup()