# CDL (Collaborative Deep Learning)

In [1]:
import numpy as np
import pickle
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer, one_hot
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
#init random seed
np.random.seed(5)
from sklearn.model_selection import train_test_split
import os

from sklearn.metrics import mean_squared_error, mean_absolute_error

Using TensorFlow backend.


## 1. Data Preprocessing

In [2]:
# Load the pre-processed review data set into a DataFrame.
# NOTE(review): both paths are machine-specific absolute paths; adjust before running elsewhere.
#df = pd.read_json('/home/neopux/UHH/datasets/Video_Games_5_proc.json')
df = pd.read_json('D:/Datasets/amazon_reviews/Video_Games_5_proc.json')

In [3]:
pd.__version__

'0.23.4'

In [4]:
df.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,instal game struggle game window live bugs).so...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,if like rally car game fun it orient 34;europe...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400


In [5]:
# Hold out 30% of the reviews as the test set. Stratifying on reviewerID keeps each
# reviewer's reviews proportionally represented in both splits; random_state fixes the shuffle.
df_train, df_test = train_test_split(df, test_size = 0.3, stratify=df['reviewerID'], random_state=42)

In [6]:
# One text document per item: all processed training reviews of the same asin, joined by spaces.
# groupby sorts the asin keys, so the resulting Series is ordered by asin.
reviews = df_train.groupby('asin').reviewTextProc.agg(' '.join)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary capped at the 10,000 most frequent terms and vectorise the
# per-item documents in one pass (fit_transform is equivalent to fit followed by
# transform on the same data, without tokenising the corpus twice).
vectorizer = TfidfVectorizer(max_features=10000)
item_infomation_matrix = vectorizer.fit_transform(reviews.values)

In [8]:
print(item_infomation_matrix.shape)

(10668, 10000)


In [9]:
list(vectorizer.vocabulary_.items())[0:20]

[('inspiring', 4598),
 ('gentle', 3770),
 ('solar', 8157),
 ('sixaxis', 8008),
 ('bracket', 1234),
 ('walk', 9628),
 ('matt', 5420),
 ('tim', 8961),
 ('by', 1370),
 ('moving', 5750),
 ('outfits', 6226),
 ('tweak', 9200),
 ('glyph', 3845),
 ('carl', 1449),
 ('obey', 6054),
 ('waist', 9622),
 ('tournament', 9050),
 ('orb', 6177),
 ('ata', 762),
 ('asphalt', 726)]

In [10]:
list(vectorizer.stop_words_)[0:10]

['benders',
 'iterface',
 'repeaditive',
 'darkly',
 'weaknessno',
 'lia',
 'smallest',
 'tghing',
 'dreamcastthough',
 'uns2']

In [11]:
# Convert the sparse TF-IDF matrix to a dense ndarray for the autoencoder feeds.
# toarray() skips the intermediate np.matrix that todense() creates, and the guard
# makes the cell safe to re-run once the matrix is already dense.
if hasattr(item_infomation_matrix, 'toarray'):
    item_infomation_matrix = item_infomation_matrix.toarray()

#### Build the user–item rating matrix

In [12]:
# Ordered categorical dtypes assign every item (asin) and every reviewer a stable
# integer code, following the sorted order of the ids seen in the training split.
asin = CategoricalDtype(categories=sorted(df_train['asin'].unique()), ordered=True)
rev_id = CategoricalDtype(categories=sorted(df_train['reviewerID'].unique()), ordered=True)

# The .cat accessors are kept as globals: their .categories label the prediction frame later.
row_cat = df_train['reviewerID'].astype(rev_id).cat
col_cat = df_train['asin'].astype(asin).cat
row, col = row_cat.codes, col_cat.codes

# Rows = reviewers, columns = items, values = star ratings stored as float64.
sparse_matrix = csr_matrix(
    (df_train['overall'].values, (row, col)),
    shape=(rev_id.categories.size, asin.categories.size),
    dtype=np.float64,
)

In [13]:
rating_matrix = sparse_matrix

#### Save the matrices with pickle

In [None]:
# Persist the dense item-information (TF-IDF) matrix and the sparse rating matrix.
# The first dump previously referenced X_train, which is not defined at this point
# of the notebook; the file name shows the item-information matrix is what belongs here.
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'wb') as handle:
    pickle.dump(item_infomation_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'wb') as handle:
    pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Load the matrices from pickle

In [5]:
# Reload the item-information matrix and the rating matrix from their pickle files.
# NOTE(review): the item matrix is bound to X_train here, while the cells that build and
# train the model read item_infomation_matrix -- confirm which name is intended.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'rb') as handle:
    X_train = pickle.load(handle)  
    
with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'rb') as handle2:
    rating_matrix = pickle.load(handle2)

## 2. Build model

### Matrix factorization model

In [15]:
class MF():
    """Matrix-factorisation half of CDL, solved with alternating least squares (ALS).

    Three solver variants are kept side by side:

    * ``ALS``             -- confidence-weighted updates of ``U`` / ``V`` (np.matrix,
      shape ``k x num_u`` / ``k x num_v``); the item update is pulled towards ``V_sdae``.
    * ``ALS_v2``          -- updates ``U`` / ``V`` from observed ratings only; ``V_sdae``
      is stored but does not enter the update.
    * ``ALS_v3_weighted`` -- ndarray implementation on ``X`` (users x k) and ``Y``
      (k x items); this is the variant ``CDL.training`` calls.
    """

    def __init__(self , rating_matrix, k):
        """Store the ratings and randomly initialise both sets of factor matrices.

        rating_matrix: dense 2-D array, rows = users, columns = items; entries > 0
            are treated as observed ratings.
        k: number of latent factors.
        """
        self.num_u = rating_matrix.shape[0]  # number of users (rows)
        self.num_v = rating_matrix.shape[1]  # number of items (columns)

        self.u_lambda = 0.1
        self.v_lambda = 0.1#10

        self.k = k  # latent dimensionality
        # Confidence values: a for observed ratings, b for unobserved entries.
        self.a = 1
        self.b = 0.01

        # np.asmatrix is the function np.mat used to alias; np.mat was removed in NumPy 2.0.
        self.R = np.asmatrix(rating_matrix)

        # NOTE(review): C is a dense num_u x num_v matrix that none of the solvers reads;
        # it is kept only so the attribute stays available.
        self.C = np.asmatrix(np.ones(self.R.shape)) * self.b
        self.C[np.where(self.R>0)] = self.a

        self.I_U = np.asmatrix(np.eye(self.k) * self.u_lambda)
        self.I_V = np.asmatrix(np.eye(self.k) * self.v_lambda)

        # State used by ALS_v3_weighted (plain ndarrays).
        self.Q = rating_matrix
        self.non_zero_idx = self.Q > 0

        # Binary observation weights: 1.0 where a rating exists, else 0.0.
        self.W = rating_matrix > 0.5
        self.W = self.W.astype(np.float64, copy=False)

        self.lambda_x = 0.1 * np.eye(k)
        self.lambda_y = 10.0 * np.eye(k)

        self.n_factors = k
        self.m, self.n = self.Q.shape

        self.X = 5 * np.random.rand(self.m, self.n_factors)
        self.Y = 5 * np.random.rand(self.n_factors, self.n)

        # State used by ALS / ALS_v2 (np.matrix, one column per user / item).
        self.U = np.asmatrix(np.random.normal(0 , 1/self.u_lambda , size=(self.k, self.num_u)))
        self.V = np.asmatrix(np.random.normal(0 , 1/self.v_lambda , size=(self.k, self.num_v)))

    def test(self, j=0):
        """Debug helper: print the shape of the right-hand side of item ``j``'s ``ALS`` update.

        Requires ``self.V_sdae`` to be set, i.e. ``ALS`` or ``ALS_v2`` must have run once.
        The previous version read ``U_cut`` and ``j`` from undefined names.
        """
        idx_a = np.ravel(np.where(self.R[:,j]>0)[0])
        U_cut = self.U[:,idx_a]
        print( ((U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))).shape)

    def ALS(self , V_sdae):
        """One confidence-weighted ALS sweep; returns ``(U, V)`` as ``k x n`` matrices.

        V_sdae: item encodings from the autoencoder, indexed by item along axis 0.
        """
        self.V_sdae = np.asmatrix(V_sdae)

        # V C_i V^T is assembled as b * V V^T plus an (a - b) correction on the rated items.
        V_sq = self.V * self.V.T * self.b
        for i in range(self.num_u):
            # R[i, :] is a 1 x num_v row, so the item indices are the column part of np.where.
            idx_a = np.ravel(np.where(self.R[i,:]>0)[1])
            V_cut = self.V[:,idx_a]
            self.U[:,i] = np.linalg.pinv(V_sq + V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T)

        U_sq = self.U * self.U.T * self.b
        for j in range(self.num_v):
            # R[:, j] is a num_u x 1 column, so the rater indices are the ROW part of
            # np.where; taking the column part (as before) yields only zeros.
            idx_a = np.ravel(np.where(self.R[:,j]>0)[0])
            U_cut = self.U[:,idx_a]
            self.V[:,j] = np.linalg.pinv(U_sq +  U_cut * U_cut.T * (self.a-self.b)+self.I_V) * (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))

        return self.U ,self.V

    def ALS_v2(self, V_sdae):
        """One ALS sweep using observed ratings only; returns ``(U, V)``.

        ``V_sdae`` is stored on the instance but does not enter the item update here.
        """
        self.V_sdae = np.asmatrix(V_sdae)

        for i in range(0, self.num_u):
            idx_a = np.ravel(np.where(self.R[i,:]>0)[1])  # items rated by user i
            V_cut = self.V[:,idx_a]

            b =  V_cut * self.R[i,idx_a].T
            self.U[:,i] = np.linalg.solve((V_cut * V_cut.T + self.I_U),  b)

        for j in range(0, self.num_v):
            # Row indices of the users who rated item j (see the note in ALS).
            idx_a = np.ravel(np.where(self.R[:,j]>0)[0])
            U_cut = self.U[:,idx_a]

            b = U_cut*self.R[idx_a,j] #+ self.v_lambda * np.resize(self.V_sdae[j],(self.k,1))

            self.V[:,j] = np.linalg.solve((U_cut * U_cut.T +self.I_V), b)

        return self.U ,self.V

    def ALS_v3_weighted(self, V_sdae, print_loss):
        """One weighted ALS sweep on ``X`` / ``Y`` followed by the training RMSE.

        V_sdae: item encodings, ``V_sdae[i]`` being the length-k vector of item i.
        print_loss: when truthy, print the RMSE; it is computed and returned either way.

        Returns ``(X, Y, err)`` with X of shape (users, k), Y of shape (k, items) and
        ``err`` the RMSE over the observed (> 0) ratings.
        """
        for u in range(self.W.shape[0]):
            Wu = self.W[u]
            # Scaling Y's columns by the 0/1 weights equals Y @ diag(Wu) without the n x n matrix.
            c = self.Y * Wu

            a = np.matmul(c, np.transpose(self.Q[u]))
            d = np.matmul(c, np.transpose(self.Y)) + self.lambda_x

            self.X[u] = np.linalg.solve(d, a).T

        for i in range(self.W.shape[1]):
            Wi = self.W.T[i]
            c = self.X.T * Wi

            a = np.matmul(c, self.Q[:, i])
            # lambda_y * V_sdae[i] pulls the item factors towards the autoencoder encoding.
            b = np.matmul(self.lambda_y, V_sdae[i])

            self.Y[:,i] = np.linalg.solve(np.matmul(c, self.X) + self.lambda_y, a + b)

        preds = np.dot(self.X, self.Y)
        err = mean_squared_error(self.Q[self.non_zero_idx], preds[self.non_zero_idx]) ** 0.5

        if print_loss:
            print("ALS LOSS: %f" % err)
        del preds  # full users x items matrix; release it before returning

        return self.X, self.Y, err


In [69]:
rm

array([[1, 2, 3, 7],
       [1, 5, 2, 1],
       [1, 7, 2, 1],
       [1, 2, 3, 4]])

In [72]:
W = rm > 0.5
W = W.astype(np.float64, copy=False)

In [74]:
# Random toy factor matrices (4 x 3 and 3 x 4) for checking the weighting identity below.
# np.asmatrix is the function np.mat aliased; np.mat itself was removed in NumPy 2.0.
X = np.asmatrix(5 * np.random.rand(4, 3))
Y = np.asmatrix(5 * np.random.rand(3, 4))

In [76]:
# Weights of row 1 and the corresponding diagonal matrix, used to compare
# Y * diag(Wu) against the cheaper column-wise broadcast Y * Wu.
# np.asmatrix replaces np.mat (same function; np.mat was removed in NumPy 2.0).
Wu = W[1]
Wu_diag = np.asmatrix(np.diag(Wu))

In [80]:
np.array(Y) * np.array(Wu)

array([[0.04955666, 2.4480058 , 2.42686456, 4.29286235],
       [2.24025944, 0.80691231, 1.8383078 , 1.93557249],
       [1.51707224, 1.24571824, 2.71906757, 3.17993   ]])

In [79]:
Y * Wu_diag

matrix([[0.04955666, 2.4480058 , 2.42686456, 4.29286235],
        [2.24025944, 0.80691231, 1.8383078 , 1.93557249],
        [1.51707224, 1.24571824, 2.71906757, 3.17993   ]])

In [7]:
iim

array([[0.7, 0.8, 0.9],
       [1. , 0.2, 0.3],
       [0.5, 0.6, 0.7]])

In [119]:
cdl = CDL(rm , iim)#item_infomation_matrix)
U, V = cdl.training() #188910

Mask shape: (4, 3)
Noising completed..:(4, 3)
1 / 1000
ALS step 
ALS LOSS: 2.720730
EPOCH 1 MODEL LOSS 110.523331
2 / 1000
ALS step 
EPOCH 2 MODEL LOSS 104.084976
3 / 1000
ALS step 
EPOCH 3 MODEL LOSS 99.165581
4 / 1000
ALS step 
EPOCH 4 MODEL LOSS 95.199127
5 / 1000
ALS step 
EPOCH 5 MODEL LOSS 91.383072
6 / 1000
ALS step 
EPOCH 6 MODEL LOSS 87.388420
7 / 1000
ALS step 
EPOCH 7 MODEL LOSS 83.545349
8 / 1000
ALS step 
EPOCH 8 MODEL LOSS 80.878616
9 / 1000
ALS step 
EPOCH 9 MODEL LOSS 77.144592
10 / 1000
ALS step 
EPOCH 10 MODEL LOSS 73.642151
11 / 1000
ALS step 
EPOCH 11 MODEL LOSS 70.703026
12 / 1000
ALS step 
EPOCH 12 MODEL LOSS 68.055199
13 / 1000
ALS step 
EPOCH 13 MODEL LOSS 65.062660
14 / 1000
ALS step 
EPOCH 14 MODEL LOSS 62.779819
15 / 1000
ALS step 
EPOCH 15 MODEL LOSS 60.966976
16 / 1000
ALS step 
EPOCH 16 MODEL LOSS 57.891499
17 / 1000
ALS step 
EPOCH 17 MODEL LOSS 56.504326
18 / 1000
ALS step 
EPOCH 18 MODEL LOSS 53.332420
19 / 1000
ALS step 
EPOCH 19 MODEL LOSS 51.142727
2

#### Masking noise

In [16]:
def mask(corruption_level, shape):
    """Draw a 0/1 keep-mask of the given shape.

    Each entry is an independent Bernoulli draw that is 1 with probability
    ``1 - corruption_level`` and 0 otherwise.
    """
    keep_probability = 1 - corruption_level
    return np.random.binomial(1, keep_probability, shape)

def add_noise(x , corruption_level ):
    """Return ``x`` with randomly chosen entries zeroed out (masking noise).

    The shapes of the mask and of the corrupted result are printed as progress output.
    """
    keep = mask(corruption_level , x.shape)
    print("Mask shape: " + str(keep.shape))
    corrupted = np.multiply(x, keep)
    print("Noising completed..:" + str(corrupted.shape))
    return corrupted

In [35]:
class CDL():
    """Denoising autoencoder over item text features, trained jointly with ``MF``.

    Every epoch the encoder output (``V_sdae``) is handed to ``MF.ALS_v3_weighted``;
    the item factors it returns become the target of the ``loss_v`` term of the
    autoencoder objective for that epoch's mini-batch updates.
    """

    def __init__(self , rating_matrix , item_infomation_matrix):
        """Create the output directories, set hyper-parameters and build the TF graph.

        rating_matrix: 2-D array, rows = users, columns = items.
        item_infomation_matrix: 2-D array with one row per item; its width is the
            autoencoder input size.
        """
        self.xp_name = 'test5_lr00001_k50_250n_withoutput3'
        self.base_dir = 'D:/Models/master/'

        # makedirs(exist_ok=True) creates missing parents and lets the model be rebuilt
        # for an existing experiment name. Checkpoints of that experiment get overwritten,
        # so change xp_name to keep earlier results.
        for sub_dir in ('', 'tf/', 'pickles/'):
            os.makedirs('%s/%s/%s' % (self.base_dir, self.xp_name, sub_dir), exist_ok=True)

        self.k = 50
        self.n_input = item_infomation_matrix.shape[1]  # width of the item text representation
        self.n_hidden1 = 250 # 200
        self.n_hidden2 = self.k  # the bottleneck doubles as the item latent vector

        self.lambda_w = 1
        self.lambda_n = 1
        self.lambda_u = 1
        self.lambda_v = 1

        self.drop_ratio = 0.1
        self.learning_rate = 0.0001
        self.epochs = 100
        self.batch_size = 32

        self.num_u = rating_matrix.shape[0]
        self.num_v = rating_matrix.shape[1]
        intializer = tf.variance_scaling_initializer()
        self.non_zero_idx = rating_matrix > 0

        # Weights use variance-scaling initialisation (an earlier variant drew them from
        # N(0, 1/lambda_w)); biases are still drawn from N(0, 1/lambda_w).
        self.Weights = {
            'w1' : tf.Variable(intializer([self.n_input, self.n_hidden1]), dtype=tf.float32),
            'w2' : tf.Variable(intializer([self.n_hidden1, self.n_hidden2]), dtype=tf.float32),
            'w3' : tf.Variable(intializer([self.n_hidden2, self.n_hidden1]), dtype=tf.float32),
            'w4' : tf.Variable(intializer([self.n_hidden1, self.n_input]), dtype=tf.float32)
        }
        self.Biases = {
            'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))
        }

        self.item_infomation_matrix = item_infomation_matrix
        self.rating_matrix = rating_matrix

        self.build_model()
        self.saver = tf.train.Saver()

    def encoder(self , x , drop_ratio):
        """Map inputs ``x`` to the k-dimensional code: two ReLU layers, each followed by dropout."""
        w1 = self.Weights['w1']
        b1 = self.Biases['b1']
        L1 = tf.nn.relu(tf.matmul(x,w1) + b1)
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )

        w2 = self.Weights['w2']
        b2 = self.Biases['b2']
        L2 = tf.nn.relu(tf.matmul(L1,w2) + b2)
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)

        return L2

    def decoder(self , x , drop_ratio):
        """Reconstruct the input from code ``x``: two ReLU layers, each followed by dropout.

        NOTE(review): dropout is also applied to the reconstruction itself -- confirm
        this is intended.
        """
        w3 = self.Weights['w3']
        b3 = self.Biases['b3']
        L3 = tf.nn.relu(tf.matmul(x,w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4 = self.Weights['w4']
        b4 = self.Biases['b4']
        L4 = tf.nn.relu(tf.matmul(L3,w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)

        return L4

    def build_model(self):
        """Define placeholders, the autoencoder, the combined loss and the Adam train op."""
        self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))  # corrupted input
        self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))  # clean target
        self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))          # item factors from ALS

        self.model_drop_ratio = tf.placeholder(tf.float32)

        self.V_sdae = self.encoder(self.model_X_0 , self.model_drop_ratio)
        self.y_pred = self.decoder(self.V_sdae , self.model_drop_ratio)

        self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w) + tf.nn.l2_loss(b)
                                             for w,b in zip(self.Weights.values() , self.Biases.values())])
        # Loss = weight decay + reconstruction error + distance between code and ALS item factors.
        loss_r =1/2 * self.lambda_w * self.Regularization
        self.loss_a =1/2 * self.lambda_n * tf.reduce_sum(tf.pow( self.model_X_c - self.y_pred , 2 ))
        loss_v =1/2 * self.lambda_v * tf.reduce_sum(tf.pow( self.model_V - self.V_sdae , 2 ))

        self.Loss = loss_r + self.loss_a + loss_v
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)

    def training(self):
        """Alternate ALS sweeps and autoencoder mini-batch updates for ``self.epochs`` epochs.

        Every 5th epoch (0-based epoch index) the TF checkpoint and the current factor
        matrices are written under ``base_dir/xp_name``. TensorBoard scalars are logged
        every epoch.

        Returns ``(U, V)`` from the last ALS sweep: U is users x k, V is items x k
        (already transposed). Both are ``None`` when ``self.epochs`` is 0.
        """
        # Corrupt the item features once; the same noisy copy is the encoder input every epoch.
        self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)

        sess = tf.Session()
        ## define dirs for tensorboard
        train_writer = tf.summary.FileWriter('%s/%s/train'%(self.base_dir, self.xp_name), sess.graph)
        test_writer = tf.summary.FileWriter('%s/%s/test'%(self.base_dir, self.xp_name))

        U = V = None
        try:
            sess.run(tf.global_variables_initializer())

            mf = MF(self.rating_matrix, self.k)

            for epoch in range(0, self.epochs):
                print("%d / %d"%(epoch + 1, self.epochs))

                # Encode all items with the current autoencoder weights.
                V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise ,
                                                           self.model_drop_ratio : self.drop_ratio})
                # calc and print ALS loss every N epochs
                print_loss = epoch % 1 == 0
                U , V, err = mf.ALS_v3_weighted(V_sdae, print_loss)

                # ALS returns item factors as k x items; the autoencoder needs items x k.
                V = V.T

                auto_losses = []
                model_losses = []
                for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):
                    X_train_batch = self.item_infomation_matrix_noise[i:i+self.batch_size]
                    y_train_batch = self.item_infomation_matrix[i:i+self.batch_size]

                    V_batch = V[i:i+self.batch_size]

                    _ , my_loss, auto_loss = sess.run([self.optimizer, self.Loss, self.loss_a] ,
                                           feed_dict={self.model_X_0: X_train_batch ,
                                                      self.model_X_c: y_train_batch,
                                                      self.model_V: V_batch,
                                                      self.model_drop_ratio : self.drop_ratio})
                    auto_losses.append(auto_loss)
                    model_losses.append(my_loss)

                summary = tf.Summary()
                summary.value.add(tag='Autoencoder Loss', simple_value=np.mean(auto_losses))
                summary.value.add(tag='Model Loss', simple_value=np.mean(model_losses))
                summary.value.add(tag='ALS Loss', simple_value=err)

                train_writer.add_summary(summary, epoch + 1)

                print("EPOCH %i MODEL LOSS %f" % (epoch + 1, np.mean(model_losses)))
                print("EPOCH %i AUTOENCODER LOSS %f" % (epoch + 1, np.mean(auto_losses)))

                if epoch % 5 == 0:
                    os.makedirs('%s/%s/tf/epoch_%s/' % (self.base_dir, self.xp_name, epoch), exist_ok=True)
                    os.makedirs('%s/%s/pickles/epoch_%s/' % (self.base_dir, self.xp_name, epoch), exist_ok=True)

                    # save tensorflow model
                    self.saver.save(sess, '%s/%s/tf/epoch_%s/model_.ckpt' % (self.base_dir, self.xp_name, epoch))

                    # save U and V matrices from ALS
                    with open(r'%s/%s/pickles/epoch_%s/U.pickle'% (self.base_dir, self.xp_name, epoch), 'wb') as handle:
                        pickle.dump(U, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    with open(r'%s/%s/pickles/epoch_%s/V.pickle'% (self.base_dir, self.xp_name, epoch), 'wb') as handle:
                        pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)
        finally:
            # Flush the event files and release the session even if an epoch fails.
            train_writer.close()
            test_writer.close()
            sess.close()

        return U, V

In [18]:
item_infomation_matrix.shape

(10668, 10000)

In [19]:
rating_matrix.shape

(24303, 10668)

In [20]:
# Dense users x items ndarray of the ratings; toarray() avoids the intermediate
# np.matrix copy that np.array(rating_matrix.todense()) creates.
rm = rating_matrix.toarray()

In [20]:
tf.reset_default_graph()

In [114]:
# Toy inputs for quick experiments: a 4 x 4 rating matrix and a 4 x 3 item-feature matrix.
# NOTE(review): this rebinds `rm`, replacing the dense rating matrix built from
# rating_matrix -- re-run that cell before training on the real data.
from sklearn.preprocessing import normalize
rm = np.array(
    [[1, 2, 3, 7],
     [1, 5, 2, 1],
     [1, 7, 2, 1],
     [1, 2, 3, 4]])
#rm = np.mat(normalize(rm, axis=0, norm='l1'))
#print(rm)

iim = np.array([[0.7, 0.8, 0.9],[ 1, 0.2, 0.3],[0.5, 0.6, 0.7],[0.5, 0.6, 0.7]])

In [21]:
# Load the U and V factor matrices pickled at epoch 45 of the 'test5_lr00001_k50_300n' run.
# NOTE(review): that experiment name differs from the xp_name set in CDL.__init__ -- confirm
# this is the run you intend to evaluate.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open(r'D:/Models/master/test5_lr00001_k50_300n/pickles/epoch_45/U.pickle', 'rb') as handle:
    U = pickle.load(handle)  
    
with open(r'D:/Models/master/test5_lr00001_k50_300n/pickles/epoch_45/V.pickle', 'rb') as handle2:
    V = pickle.load(handle2)

In [36]:
# Build the model on the dense rating matrix and the dense TF-IDF item matrix, then train.
# training() returns the user factors U and the item factors V of the last ALS sweep,
# overwriting any U / V loaded from disk earlier.
cdl = CDL(rm, item_infomation_matrix)
U, V = cdl.training() #188910

Mask shape: (10668, 10000)
Noising completed..:(10668, 10000)
1 / 100
ALS LOSS: 2.898589
EPOCH 1 MODEL LOSS 35166.007812
EPOCH 1 AUTOENCODER LOSS 29824.156250
2 / 100
ALS LOSS: 1.448117
EPOCH 2 MODEL LOSS 4426.907227
EPOCH 2 AUTOENCODER LOSS 61.334709
3 / 100
ALS LOSS: 0.817276
EPOCH 3 MODEL LOSS 3961.845703
EPOCH 3 AUTOENCODER LOSS 28.336441
4 / 100
ALS LOSS: 0.730024
EPOCH 4 MODEL LOSS 3677.159424
EPOCH 4 AUTOENCODER LOSS 22.104527
5 / 100
ALS LOSS: 0.714830
EPOCH 5 MODEL LOSS 3470.143311
EPOCH 5 AUTOENCODER LOSS 20.630169
6 / 100
ALS LOSS: 0.697931
EPOCH 6 MODEL LOSS 3300.103271
EPOCH 6 AUTOENCODER LOSS 19.794699
7 / 100
ALS LOSS: 0.684345
EPOCH 7 MODEL LOSS 3154.348877
EPOCH 7 AUTOENCODER LOSS 19.099277
8 / 100
ALS LOSS: 0.676849
EPOCH 8 MODEL LOSS 3020.109375
EPOCH 8 AUTOENCODER LOSS 18.700705
9 / 100
ALS LOSS: 0.663997
EPOCH 9 MODEL LOSS 2896.240723
EPOCH 9 AUTOENCODER LOSS 18.330490
10 / 100
ALS LOSS: 0.650858
EPOCH 10 MODEL LOSS 2777.876953
EPOCH 10 AUTOENCODER LOSS 18.120188
1

In [29]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (0 entries)


In [103]:
# Persist the trained user (U) and item (V) factor matrices.
# NOTE(review): these are Linux paths while the data and model cells use D:/ paths --
# adjust to the machine the notebook runs on.
with open(r'/home/neopux/UHH/datasets/cdl_U_mx_train.pickle', 'wb') as handle:
    pickle.dump(U, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(r'/home/neopux/UHH/datasets/cdl_V_mx_train.pickle', 'wb') as handle:
    pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Reload the user (U) and item (V) factor matrices saved by the previous cell.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open(r'/home/neopux/UHH/datasets/cdl_U_mx_train.pickle', 'rb') as handle:
    U = pickle.load(handle)  
    
with open(r'/home/neopux/UHH/datasets/cdl_V_mx_train.pickle', 'rb') as handle2:
    V = pickle.load(handle2)

In [332]:
U.shape

(24303, 25)

In [333]:
V.shape

(10668, 25)

In [43]:
# Predicted rating for every (user, item) pair: U is users x k and V is items x k,
# so U @ V.T is the full users x items matrix (dense -- large in memory).
preds = np.dot(U, V.T)

In [44]:
preds.shape

(24303, 10668)

In [45]:
rm.shape

(24303, 10668)

## Metrics

### For training set

In [220]:
preds[rm > 0]

array([-1.03274029, -1.32472413, -1.09501641, ...,  1.29053572,
        1.21824796,  2.60169135])

In [46]:
# Training RMSE restricted to the observed ratings (entries of rm greater than 0).
mean_squared_error(rm[rm > 0], preds[rm > 0]) ** 0.5

0.2687775334691775

In [319]:
# RMSE over ALL cells of the matrix, i.e. unobserved pairs are scored against 0;
# this is not comparable to the observed-only RMSE above it in the notebook.
mean_squared_error(rm, preds) ** 0.5

3.826338841575348

### For test set

In [320]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (34 entries)


In [24]:
# Label the prediction matrix with the training ids: rows by reviewerID, columns by asin,
# using the category order that defined the row/column codes of the rating matrix.
preds_df_unmelt = pd.DataFrame(preds, columns = col_cat.categories, index = row_cat.categories)
preds_df_unmelt.index.name = 'reviewerID'
preds_df_unmelt.columns.name = 'asin'
preds_df_unmelt.head(2)

asin,0700099867,6050036071,7100027950,7293000936,8176503290,907843905X,9625990674,9861019731,9882155456,B000003SQQ,...,B00J128FPA,B00J226358,B00J6DLPLK,B00J9P3KBS,B00JM3R6M6,B00JQ8YH6A,B00JQHU9RC,B00JXW6GE0,B00KAI3KW2,B00KHECZXO
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00263941WP7WCIL7AKWL,4.289962,4.479144,4.975905,4.668979,4.249168,4.665921,4.739466,4.839571,4.955299,3.684972,...,5.120315,4.357303,4.797302,4.150864,4.637881,4.048742,5.02645,4.182327,4.909996,3.821161
A005481137I9SCAWEF7ON,2.878864,4.109722,4.398296,2.986301,3.570328,4.110752,4.122372,3.985164,4.01625,3.582355,...,4.355327,2.920714,4.032251,2.717914,4.087327,2.863482,2.80838,3.29596,4.778112,2.329751


In [47]:
df_test_val = df_test.copy()
#preds_df_unmelt.loc['A00263941WP7WCIL7AKWL', '0700099867']

In [48]:
df_test_val.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
193452,B005QA98JS,"[1, 1]",4,This is another set of games that sort of surp...,this set game sort surprise good set this set ...,"10 18, 2012",AFXTKAO0CB354,C. Weaver,Aonther Set Of Games That Surprised Me....,1350518400
166816,B0043QL2FE,"[4, 7]",4,"After playing this game a lot more, I have dec...",after play game lot i decide change review ini...,"03 16, 2011",A4E0I88T1MS4O,Fani,Solid improvement from Top Spin 3 but still ha...,1300233600


In [49]:
# Placeholder for the predicted rating; the column is overwritten by the apply(get_val) cell.
df_test_val['value'] = 0

In [50]:
def get_val(x):
    """Look up the predicted rating for one review row.

    x: mapping / Series with 'reviewerID' and 'asin' entries.
    Returns the matching cell of the global ``preds_df_unmelt`` frame, or ``None``
    when the reviewer or the item has no row / column there.
    """
    if x['reviewerID'] not in preds_df_unmelt.index:
        return None
    if x['asin'] not in preds_df_unmelt.columns:
        return None
    return preds_df_unmelt.loc[x['reviewerID'], x['asin']]

In [51]:
# Row-wise lookup of the predicted rating for every test review; rows for which
# get_val returns None end up as missing values and are filtered out with isnull() later.
df_test_val['value'] = df_test_val.apply(get_val, axis = 1)

In [52]:
df_test_val.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,value
193452,B005QA98JS,"[1, 1]",4,This is another set of games that sort of surp...,this set game sort surprise good set this set ...,"10 18, 2012",AFXTKAO0CB354,C. Weaver,Aonther Set Of Games That Surprised Me....,1350518400,3.935068
166816,B0043QL2FE,"[4, 7]",4,"After playing this game a lot more, I have dec...",after play game lot i decide change review ini...,"03 16, 2011",A4E0I88T1MS4O,Fani,Solid improvement from Top Spin 3 but still ha...,1300233600,3.67552


In [53]:
df_test.shape

(69534, 10)

In [54]:
df_test_val.shape

(69534, 11)

In [55]:
df_test[~df_test_val.value.isnull()].head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
193452,B005QA98JS,"[1, 1]",4,This is another set of games that sort of surp...,this set game sort surprise good set this set ...,"10 18, 2012",AFXTKAO0CB354,C. Weaver,Aonther Set Of Games That Surprised Me....,1350518400
166816,B0043QL2FE,"[4, 7]",4,"After playing this game a lot more, I have dec...",after play game lot i decide change review ini...,"03 16, 2011",A4E0I88T1MS4O,Fani,Solid improvement from Top Spin 3 but still ha...,1300233600


In [56]:
# Test RMSE over the held-out reviews that received a prediction (rows whose reviewer or
# item is absent from the prediction frame have a missing value and are excluded).
# Mask and values now come from the same frame instead of masking df_test with a
# boolean Series built from df_test_val.
scored = df_test_val[df_test_val.value.notnull()]
mean_squared_error(scored.overall, scored.value) ** 0.5

1.1504624368435534

In [57]:
# Test MAE over the held-out reviews that received a prediction; mask and values are
# taken from the same frame (df_test_val) rather than masking df_test with it.
scored = df_test_val[df_test_val.value.notnull()]
mean_absolute_error(scored.overall, scored.value)

0.8833793696260032