Let's install `h2o4gpu` and the system packages it needs.

In [1]:
# Install system packages via apt.
# NOTE(review): presumably native dependencies of h2o4gpu — confirm against its install docs.
!apt-get install -y libopenblas-dev pbzip2
# Pin tabulate to exactly 0.8.2.
# NOTE(review): presumably required for compatibility with h2o4gpu — confirm.
!pip install -U tabulate==0.8.2
# NOTE(review): h2o4gpu itself is installed unpinned, so a re-run may pick up a different release.
!pip install h2o4gpu
# Import in the same cell as the install; this fails here if the package is not importable.
import h2o4gpu

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libopenblas-dev is already the newest version (0.2.19-3).
pbzip2 is already the newest version (1.1.9-1+b1).
0 upgraded, 0 newly installed, 0 to remove and 46 not upgraded.
Requirement already up-to-date: tabulate==0.8.2 in /opt/conda/lib/python3.6/site-packages (0.8.2)


Read the Netflix dataset and convert it into a SciPy sparse matrix.

In [2]:
import gc
from array import array

import numpy as np
import scipy
import scipy.sparse
from sklearn.model_selection import train_test_split

# Input files. As parsed below, each file is a sequence of sections: a header
# line "<movie_id>:" followed by one "<user_id>,<rating>,..." line per rating.
files = [
    '../input/combined_data_1.txt',
    '../input/combined_data_2.txt',
    '../input/combined_data_3.txt',
    '../input/combined_data_4.txt',
]

# Typed array buffers hold raw C values instead of one boxed Python object per
# entry, which keeps peak memory far lower than plain lists when accumulating
# a very large number of ratings.
row_buf = array('q')  # raw user ids (64-bit signed)
col_buf = array('i')  # zero-based movie index
val_buf = array('f')  # rating value

for file_name in files:
    print('processing {0}'.format(file_name))
    with open(file_name, "r") as f:
        movie = -1  # no movie header seen yet in this file
        for line in f:
            # Strip so that CRLF line endings and a final line without a
            # trailing newline are handled the same as "\n"-terminated lines.
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            if line.endswith(':'):
                # Header line: switch to the next movie (ids are 1-based in the file).
                movie = int(line[:-1]) - 1
                continue
            if movie < 0:
                raise ValueError(
                    'rating line found before any movie header in {0}'.format(file_name))
            fields = line.split(',')
            row_buf.append(int(fields[0]))
            col_buf.append(movie)
            val_buf.append(float(fields[1]))

print('transformation...')

coo_val = np.array(val_buf, dtype=np.float32)
coo_col = np.array(col_buf, dtype=np.int32)
coo_row = np.array(row_buf)
# The numpy arrays are copies, so the temporary buffers can be released now.
del row_buf, col_buf, val_buf

# Map the raw user ids onto dense row indices 0..n_users-1:
# `user[i]` is the original id of matrix row i, `indices` is the row of each rating.
user, indices = np.unique(coo_row, return_inverse=True)
user = user.astype(np.int32)

gc.collect()

# Shape is inferred from the largest row / column index present in the data.
coo_matrix = scipy.sparse.coo_matrix((coo_val, (indices, coo_col)))
shape = coo_matrix.shape
print('R matrix size', shape)

gc.collect()

print('splitting into training and validation set')
# Individual ratings (not whole users or movies) are split 80/20, reproducibly.
train_row, test_row, train_col, test_col, train_data, test_data = train_test_split(
    coo_matrix.row, coo_matrix.col, coo_matrix.data, test_size=0.2, random_state=42)

# Both parts keep the full matrix shape so that row / column indices stay aligned.
train = scipy.sparse.coo_matrix(
    (train_data, (train_row, train_col)), shape=shape)
test = scipy.sparse.coo_matrix(
    (test_data, (test_row, test_col)), shape=shape)


processing ../input/combined_data_1.txt
processing ../input/combined_data_2.txt
processing ../input/combined_data_3.txt
processing ../input/combined_data_4.txt
transformation...
R matrix size (480189, 17770)
splitting into training and validation set


Let's factorize the matrix R.

In [6]:
# Number of latent factors learned per user and per movie.
n_components = 40
# Second positional argument of FactorizationH2O.
# NOTE(review): presumably the regularization strength — confirm in the h2o4gpu docs.
_lambda = 0.01
# Raise BATCHES if the GPU runs out of memory. Per the original author's note,
# n_components / BATCHES has to be a multiple of 10.
BATCHES = 1

# Handed to fit(); presumably filled with the per-iteration scores — TODO confirm.
scores = []

factorization = h2o4gpu.solvers.FactorizationH2O(n_components, _lambda, max_iter=100)
factorization.fit(
    train,
    X_test=test,
    X_BATCHES=BATCHES,
    THETA_BATCHES=BATCHES,
    scores=scores,
    verbose=True,
    early_stopping_rounds=5,
)
print('best iteration:', factorization.best_iteration)

iteration 0 train: 0.831149160861969 cv: 0.9896033406257629
iteration 1 train: 0.7535563111305237 cv: 0.9313276410102844
iteration 2 train: 0.7214264869689941 cv: 0.9011292457580566
iteration 3 train: 0.7067406177520752 cv: 0.8872774243354797
iteration 4 train: 0.6985356211662292 cv: 0.8795573711395264
iteration 5 train: 0.6933938264846802 cv: 0.8747438192367554
iteration 6 train: 0.6899409294128418 cv: 0.8715165853500366
iteration 7 train: 0.6874918937683105 cv: 0.8692240715026855
iteration 8 train: 0.685676634311676 cv: 0.8675171732902527
iteration 9 train: 0.6842845678329468 cv: 0.8662049174308777
iteration 10 train: 0.6831879019737244 cv: 0.8651642203330994
iteration 11 train: 0.6823055148124695 cv: 0.8643186092376709
iteration 12 train: 0.6815824508666992 cv: 0.8636147975921631
iteration 13 train: 0.6809810400009155 cv: 0.8630224466323853
iteration 14 train: 0.6804736256599426 cv: 0.8625165820121765
iteration 15 train: 0.6800405383110046 cv: 0.8620761632919312
iteration 16 train: 

Now `factorization.XT` and `factorization.thetaT` contain the dense representations of users and movies, respectively.

In [12]:
# Report the shape of each learned factor matrix, users first and movies second.
for caption, factors in (
    ('X shape:', factorization.XT),
    ('ThetaT shape:', factorization.thetaT),
):
    print(caption, factors.shape)


X shape: (480189, 40)
ThetaT shape: (17770, 40)
