In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import functools
from tqdm import tqdm
from torch import nn
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDatase

# Movielens 100k dataset. Collaborative filtering

In [2]:
!head ml-100k/u.item

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0
5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0
6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0
7|Twelve Monkeys (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|1|0|0|0
8|Babe (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Babe%20(1995)|0|0|0|0|1

In [3]:
!head ml-1m/movies.dat

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller


In [4]:
!head ml-100k/u.data

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013


In [5]:
with open('ml-100k/u.item', errors='ignore') as f:
    movies = (dict([(int(line.split('|')[0]), line.split('|')[1]) for line in f.readlines()]))

In [6]:
with open('ml-1m/movies.dat', errors='ignore') as f:
    movies = (dict([(int(line.split('::')[0]), line.split('::')[1]) for line in f.readlines()]))

In [7]:
df = pd.read_csv('ml-100k/u.data', 
            sep='\t', 
            names=['user_id', 'item_id', 'rating', 'timestamp'], 
            parse_dates=['timestamp'], 
            date_parser=functools.partial(pd.to_datetime, unit='s'))
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [8]:
df = pd.read_csv('ml-1m/ratings.dat', 
            sep='::', 
            names=['user_id', 'item_id', 'rating', 'timestamp'], 
            parse_dates=['timestamp'], 
            date_parser=functools.partial(pd.to_datetime, unit='s'))
df.head()

  """


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [9]:
A = df.drop('timestamp', axis=1).pivot(index='user_id', columns='item_id', values='rating')
A.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


# Singular value decomposition

SVD is a matrix factorization technique that factorizes $A = USV^h$. 

Where $U$ is user matrix. The rows can represent users, and the columns some latent (orthogonal) embedding.
$V$ is an item matrix, The columns represent items and the rows a latent embedding.
$S = \text{diag}(\Sigma)$ are scaling factors (**singular values**), ordered from high to low.

By truncating the data, (discarding the lower scaling factors), we can reconstruct a low-rank approximation of $A$. 
Note that the latent columns/ rows are orthogonal so a truncated SVD at $k$ is a $k$-rank approximation of A.


<img src="img/svd.png" width="400px">

First we show this low-rank-approximation properties of SVD. Below, we'll replace the NaNs with zeros. This influences our SVD result. 
In actual collaborative filtering we know that the ratings are not zero, and we need to find a way to ignore the NaNs.

In [10]:
# Decompose the zero filled matrix
u, s, v = np.linalg.svd(np.nan_to_num(A))

Prove that reconstructing the factorization is equal to $A$. Note that we need, to truncate to $n$, as this is the rank of the matrix. The $V$ matrix is actually **M x M**, where $M > N$.

In [16]:
n = s.shape[-1]

np.allclose(np.nan_to_num(A), (u[:, :n] * s[:n]) @ v[:n, :])

True

Show different reconstruction matrices at differen $k$.

In [17]:
for k in [1, 10, 100]:
    print(f'k={k}\n\n', np.dot(u[:, :k] * s[:k], v[:k, :]).astype(int))

k=1

 [[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]]
k=10

 [[2 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]
k=100

 [[5 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


# SVD with NaNs
Now we are going the implement SVD ourselfs in a manner that is comparable with Expectation Maximization. 
First we are going to replace the item columns with means and than we are iteratively updating based on a low-rank SVD factorization.

In [22]:
def em_svd(A, k=3, n_iter=10):
    A_original = A
    A = np.array(A)
    mask = np.isnan(A_original)
    
    # Fill the NaNs in the columns (i.e. movies) first with means
    item_means = np.nanmean(A_original, axis=0)
    A[mask] = (item_means[None, :] * np.ones(A.shape))[mask]
    
    # Update the matrix n_iter times
    for i in tqdm(range(n_iter)):
        u, s, v = np.linalg.svd(A)
        A_approx = np.dot(u[:, :k] * s[:k], v[:k, :])
        
        if np.allclose(A[mask], A_approx[mask], rtol=0.01, atol=0.01):
            break

        A[mask] = A_approx[mask]
    return A
        


Some utility functions. One to load k-fold test data. One that given a numpy user-item matrix, and the original pandas user-item dataframe creates a narrow table.
Finally we have a evaluation function that returns MAE.

In [23]:
def load_train_test_1m():

    A = []
    for t in ['base', 'test']:
        A.append((pd.read_csv(f'ml-100k/u{n}.{t}', 
            sep='\t', 
            names=['user_id', 'item_id', 'rating', 'timestamp'])
               .drop('timestamp', axis=1)

              ))
        
    A[0] = A[0].pivot(index='user_id', columns='item_id', values='rating')

    return A

train_A, test_A = load_train_test(1)

In [14]:
def get_prediction_df(A_approx, A):
    assert A.shape == A.shape
    A = A.copy()
    A.iloc[:, :] = A_approx
    
    return A.stack().reset_index().rename({0: 'rating'}, axis=1)

def evaluate(A, train_A, test_A):
    train_A = get_prediction_df(A, train_A)
    merged = test_A.merge(train_A, on=['item_id', 'user_id'])
    
    return (np.abs(merged.rating_x - merged.rating_y)).mean()


In [25]:
def k_fold_eval(k, n_iter=25):
    mae = []
    for i in range(1, 6):
        train_A, test_A = load_train_test(i)
        A_approx = em_svd(train_A.values, k, n_iter=n_iter)
        mae.append(evaluate(A_approx, train_A, test_A))
    return mae

In [21]:
mae = k_fold_eval(None, n_iter=0)

print('Filling with mean; MAE:', np.mean(mae))

NameError: name 'em_svd' is not defined

In [18]:
mae = k_fold_eval(k=2, n_iter=25)

print('Filling with 2 singular values; MAE:', np.mean(mae))

100%|██████████| 25/25 [00:30<00:00,  1.22s/it]
100%|██████████| 25/25 [00:32<00:00,  1.30s/it]
100%|██████████| 25/25 [00:32<00:00,  1.29s/it]
100%|██████████| 25/25 [00:40<00:00,  1.62s/it]
100%|██████████| 25/25 [00:36<00:00,  1.45s/it]


Filling with mean; MAE: 0.7330830721460039


In [19]:
mae = k_fold_eval(k=5, n_iter=25)

print('Filling with 5 singular values; MAE:', np.mean(mae))

100%|██████████| 25/25 [00:32<00:00,  1.30s/it]
100%|██████████| 25/25 [00:53<00:00,  2.15s/it]
100%|██████████| 25/25 [00:52<00:00,  2.12s/it]
100%|██████████| 25/25 [00:54<00:00,  2.18s/it]
100%|██████████| 25/25 [00:46<00:00,  1.84s/it]


Filling with mean; MAE: 0.7336953418060109


# SVD influenced approach
https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf
https://medium.com/@m_n_malaeb/singular-value-decomposition-svd-in-recommender-systems-for-non-math-statistics-programming-4a622de653e9

$$ r_{ui} = \mu + b_i + b_u + q_i^Tp_u $$

Minimize:

$$\min \sum_{ui \in k} (r_{ui} - \hat{r}_{ui})^2$$

And for regularization:

$$ \min \sum_{ui \in k} (r_{ui} - \hat{r}_{ui})^2+ \lambda( ||q_i|| + ||p_u|| + b_i + b_u)^2$$

In [26]:
train_A, test_A = load_train_test(1)

In [11]:
def load_train_test_1m():
        
    A = pd.read_csv('ml-1m/ratings.dat', 
                sep='::', 
                names=['user_id', 'item_id', 'rating', 'timestamp'], 
                parse_dates=['timestamp'], 
                date_parser=functools.partial(pd.to_datetime, unit='s'))
    n = int(A.shape[0] * 0.8)
    
    A = [A.iloc[:n], A.iloc[n:]]
    A[0] = A[0].pivot(index='user_id', columns='item_id', values='rating')

    return A

train_A, test_A = load_train_test_1m()

  import sys


In [17]:
class CFsvd(nn.Module):
    def __init__(self, M_shape, embedding_dim=10):
        super().__init__()
        self.q = nn.Embedding(M_shape[0], embedding_dim)
        self.p = nn.Embedding(M_shape[1], embedding_dim)
        self.b_i = nn.Parameter(torch.zeros(M_shape[0]))
        self.b_u = nn.Parameter(torch.zeros(M_shape[1]))
        self.mu = nn.Parameter(torch.zeros(1))
        
    def forward(self, i=None, u=None):
        if i is None or u is None:
            r_base = torch.matmul(self.q._parameters['weight'], self.p._parameters['weight'].T)
            b_i = self.b_i[:, None]
            b_u = self.b_u[None, :]
        else:
            r_base = torch.dot(self.q(i), self.p(u))
            b_i = self.b_i[i]
            b_u = self.b_u[u]
        
        r_base + self.mu + b_i + b_u
        
        return r_base
    
    def regularization_term(self, lambda_):
        norm_q_i = m.q._parameters['weight'].norm(dim=1)
        norm_p_u = m.p._parameters['weight'].norm(dim=1)
        
        return ((norm_p_u[:, None] + norm_q_i + self.b_i + self.b_u[:, None])**2 * lambda_).mean()
        
    

m = CFsvd(train_A.shape, 1000)
m.forward(torch.tensor(1), torch.tensor(1))
m.regularization_term(10)

tensor(40003.8828, grad_fn=<MeanBackward0>)

In [16]:
def criterion(y_pred, y_true):
    return torch.mean((y_pred - y_true)**2)

def train(A, m):
    optim = torch.optim.Adam(m.parameters(), lr=0.05)
    
    A = torch.tensor(A, dtype=torch.float)
    A = A.cuda()
    m = m.cuda()
    mask = ~torch.isnan(A)
    r = A[mask]
    
    for i in range(1000):
        optim.zero_grad()
        r_pred = m.forward()[mask]
        
        loss = criterion(r_pred, r) + m.regularization_term(1)
        loss.backward()
        optim.step()
        
        if i % 25 == 0:
            A_approx = m.forward().cpu().data.numpy()
            mask = ~np.isnan(train_A.values)
            A_approx[mask] = train_A.values[mask]

            print(f'Epoch: {i}\t loss: {loss.data}\t mae: {evaluate(A_approx, train_A, test_A)}')
        
    
train(train_A.values, m)

Epoch: 0	 loss: 21016.544921875	 mae: 23.651815831661224
Epoch: 25	 loss: 1863.3104248046875	 mae: 4.623928831651424
Epoch: 50	 loss: 113.89006042480469	 mae: 3.8581562154711437
Epoch: 75	 loss: 9.20861530303955	 mae: 2.907867202911383
Epoch: 100	 loss: 3.0643815994262695	 mae: 1.6436229871633725
Epoch: 125	 loss: 1.8411128520965576	 mae: 1.2860527443580139
Epoch: 150	 loss: 1.3686624765396118	 mae: 1.1104936676147656
Epoch: 175	 loss: 1.127641201019287	 mae: 1.012470502119798
Epoch: 200	 loss: 0.9860596656799316	 mae: 0.9509812731009263
Epoch: 225	 loss: 0.8935704827308655	 mae: 0.9022707679332831
Epoch: 250	 loss: 0.8276486396789551	 mae: 0.8629026733911954
Epoch: 275	 loss: 0.7775210738182068	 mae: 0.8327500162980496
Epoch: 300	 loss: 0.7375521659851074	 mae: 0.8083920998451037
Epoch: 325	 loss: 0.7044882774353027	 mae: 0.7878574927647909
Epoch: 350	 loss: 0.6763318181037903	 mae: 0.770408251346686
Epoch: 375	 loss: 0.6517999172210693	 mae: 0.755508792706025
Epoch: 400	 loss: 0.6300

KeyboardInterrupt: 

In [41]:
train_A.shape

(4795, 3685)

In [47]:
class VAE(nn.Module):
    def __init__(self, item_size=3685, z_size=10):
        super().__init__()
        # P(z|x)
        self.encoder_mu = nn.Sequential(
            nn.Linear(item_size, 40),
            nn.ReLU(),
            nn.Linear(40, z_size)
        )
        self.encoder_log_variance = nn.Sequential(
            nn.Linear(item_size, 40),
            nn.ReLU(),
            nn.Linear(40, z_size)
        )
        # P(x|z)
        self.decoder = nn.Sequential(
            nn.Linear(z_size, 512),
            nn.ReLU(),
            nn.Linear(512, 3685),
            nn.Sigmoid()
        )
        
    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + std * eps
        
    def encode(self, user):
        mu = self.encoder_mu(user)
        log_var = self.encoder_log_variance(user)
        return self.reparameterize(mu, log_var), mu, log_var
        
    def forward(self, user):
        z, mu, log_var = self.encode(user)
        return self.decoder(z), mu, log_var
        
        
A = torch.tensor(train_A.values, dtype=torch.float)
mask = torch.isnan(A)
A[mask] = 0

m = VAE()
m(A[0, :].unsqueeze(0))[0].shape

torch.Size([1, 3685])

In [48]:
epochs = 600
batch_size = 32
m = VAE()

train_A, test_A = load_train_test_1m()
mask = np.isnan(train_A)
train_A[mask] = 0
X = torch.tensor(train_A.values, dtype=torch.float)
X /= 5

m.cuda()
X = X.cuda()

ds = TensorDataset(X)
dl = DataLoader(ds, shuffle=True, batch_size=batch_size)

optim = torch.optim.Adam(m.parameters(), lr=0.001, weight_decay=0.25)

def det_loss(x, x_reconstructed, mu, log_var, beta=1):
    mask = torch.tensor((x != 0), dtype=torch.float, device='cuda')
    x = x * mask
    x_reconstructed = x_reconstructed * mask
    
    reconstruction_error = (0.5 * (x - x_reconstructed)**2).sum()
    
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_divergence = (-0.5 * torch.sum(1 + log_var - log_var**2 - log_var.exp())) * beta

    return (reconstruction_error + kl_divergence).sum()



  import sys


In [49]:
i = 0
avg_loss = 0
print_iter = 100

for epoch in range(epochs):
    beta = 1.5
    for [x] in dl:
        i += 1
        optim.zero_grad()
        x_, mu, log_var = m(x)
        loss = det_loss(x, x_, mu, log_var, beta=beta)
        loss.backward()
        optim.step()
        avg_loss += loss.data
       
        if i % print_iter == 0:
            m.eval()
            z, mu, log_var = m.encode(X)
            A_approx = m.decoder(mu)
            A_approx = A_approx.cpu().data.numpy() * 5
            mask = ~(train_A.values == 0)
            A_approx[mask] = train_A.values[mask]
            print(f'Epoch: {epoch}\t loss: {avg_loss / print_iter}\t mae: {evaluate(A_approx, train_A, test_A)}, beta: {beta}')
            avg_loss = 0
            m.train()




Epoch: 0	 loss: 121.6930160522461	 mae: 0.8529034027686486, beta: 1.0
Epoch: 1	 loss: 103.61193084716797	 mae: 0.7673151370806571, beta: 1.0
Epoch: 1	 loss: 108.52708435058594	 mae: 0.737634389828413, beta: 1.0
Epoch: 2	 loss: 100.79552459716797	 mae: 0.7476609975863726, beta: 1.0
Epoch: 3	 loss: 103.6495590209961	 mae: 0.7339995702107748, beta: 1.0
Epoch: 3	 loss: 104.99667358398438	 mae: 0.7138890914427929, beta: 1.0
Epoch: 4	 loss: 101.78014373779297	 mae: 0.7200900285671918, beta: 1.0
Epoch: 5	 loss: 99.49959564208984	 mae: 0.6903639084253556, beta: 1.0
Epoch: 5	 loss: 98.52613067626953	 mae: 0.6729123898041554, beta: 1.0
Epoch: 6	 loss: 99.57936096191406	 mae: 0.6931770459199563, beta: 1.0
Epoch: 7	 loss: 94.38773345947266	 mae: 0.7041273239331368, beta: 1.0
Epoch: 7	 loss: 100.06471252441406	 mae: 0.702087769141564, beta: 1.0
Epoch: 8	 loss: 93.92159271240234	 mae: 0.646308550467858, beta: 1.0
Epoch: 9	 loss: 99.370849609375	 mae: 0.6766748367211758, beta: 1.0
Epoch: 9	 loss: 95.

KeyboardInterrupt: 

In [75]:
m.eval()
A_approx, mu, log_var = m(X)
print(mu)
A_approx * 5

tensor([[ 4.1694,  1.1283, -0.6849,  ...,  1.4611,  4.8079, -4.2967],
        [ 2.6535,  1.9477, -1.0743,  ...,  1.1685,  3.3653, -3.1873],
        [ 1.8414,  2.5392, -1.8258,  ...,  1.6144,  3.3629, -3.2717],
        ...,
        [ 1.2586,  2.4881, -2.0959,  ...,  1.8134,  3.2040, -3.1249],
        [-0.6214,  3.3128, -4.1001,  ...,  3.4142,  3.9148, -3.8488],
        [ 0.2889,  1.6915, -3.1133,  ...,  3.1609,  4.2395, -3.8847]],
       device='cuda:0', grad_fn=<AddmmBackward>)


tensor([[3.6526, 2.8894, 2.7644,  ..., 2.1981, 2.7667, 2.8317],
        [3.7819, 2.8878, 2.8798,  ..., 2.1891, 2.6657, 2.7466],
        [3.7861, 2.9076, 2.9152,  ..., 2.1980, 2.6384, 2.7111],
        ...,
        [3.9412, 3.0924, 3.0326,  ..., 2.1665, 2.6558, 2.7216],
        [4.3998, 3.6585, 3.4516,  ..., 2.0340, 2.7246, 2.7727],
        [3.8131, 3.1653, 2.9686,  ..., 2.2212, 2.7179, 2.7264]],
       device='cuda:0', grad_fn=<MulBackward0>)

In [108]:
(A_approx * 5).mean(1).cpu().data.numpy().astype(int)

array([3, 2, 2, 3, 2, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 3,
       2, 3, 3, 2, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2,
       3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 2, 2, 2, 2, 3, 3,
       3, 2, 3, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2,
       2, 3, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2,
       2, 3, 2, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 2,
       2, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 2, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3,
       2, 3, 2, 3, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2,
       2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 2, 3, 3, 2, 2, 3, 2,
       3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       2, 3, 2, 2, 2, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 2, 3, 3,
       2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 3,
       3, 3, 2, 2, 3, 3, 2, 3, 3, 3, 2, 3, 2, 3, 3,

In [56]:
A_approx.view(-1, 1650)

tensor([[0.7513, 0.6257, 0.5710,  ..., 0.4900, 0.6486, 0.6092],
        [0.6851, 0.5777, 0.5064,  ..., 0.5050, 0.5964, 0.6083],
        [0.6434, 0.5267, 0.4733,  ..., 0.4135, 0.5940, 0.5830],
        ...,
        [0.8031, 0.6532, 0.5978,  ..., 0.4175, 0.6759, 0.6572],
        [0.8731, 0.7549, 0.6394,  ..., 0.4255, 0.7179, 0.7453],
        [0.7500, 0.6268, 0.6381,  ..., 0.4057, 0.6882, 0.6496]],
       device='cuda:0', grad_fn=<ViewBackward>)

In [57]:
A_approx = (m.decoder(mu) * 5).data.numpy()
mask = ~(train_A.values == 0)
A_approx[mask] = train_A.values[mask]
evaluate(A_approx, train_A, test_A)

TypeError: can't convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.