# CDL

### import module

In [40]:
import numpy as np
import pickle
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer, one_hot
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
#init random seed
np.random.seed(5)
from sklearn.model_selection import train_test_split
import os

## 1. Data Preprocessing

In [2]:
df = pd.read_json('/home/neopux/UHH/datasets/Video_Games_5_proc.json')

In [3]:
pd.__version__

'0.24.2'

In [4]:
df.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,reviewTextProc
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000,instal game struggle game window live bugs).so...
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400,if like rally car game fun it orient 34;europe...


In [5]:
df_train, df_test = train_test_split(df, test_size = 0.3, random_state=42)

In [6]:
reviews = df_train.groupby('asin').reviewTextProc.agg(' '.join)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(reviews.values)

item_infomation_matrix = vectorizer.transform(reviews.values)

In [8]:
print(item_infomation_matrix.shape)

(10666, 10000)


In [10]:
list(vectorizer.vocabulary_.items())[0:20]

[('instal', 4626),
 ('game', 3718),
 ('struggle', 8522),
 ('window', 9806),
 ('live', 5190),
 ('bugs', 1333),
 ('some', 8180),
 ('championship', 1545),
 ('race', 7004),
 ('car', 1444),
 ('unlocked', 9345),
 ('buy', 1377),
 ('addon', 342),
 ('pay', 6419),
 ('nearly', 5887),
 ('30', 129),
 ('dollar', 2691),
 ('new', 5933),
 ('not', 6010),
 ('like', 5145)]

In [11]:
list(vectorizer.stop_words_)[0:10]

['scense',
 'collider',
 'unleashedrace',
 'clangy',
 'skyalnder',
 '48mbpulse',
 'pandoran',
 'statewide',
 'sleet',
 'preplay']

In [12]:
item_infomation_matrix = np.array(item_infomation_matrix.todense())

#### build rating matrix 

In [13]:
asin = CategoricalDtype(sorted(df_train.asin.unique()), ordered=True)
rev_id = CategoricalDtype(sorted(df_train.reviewerID.unique()), ordered=True)

row_cat = df_train.reviewerID.astype(rev_id).cat
col_cat = df_train.asin.astype(asin).cat

row = row_cat.codes
col = col_cat.codes

sparse_matrix = csr_matrix((df_train["overall"].values, (row, col)), \
                           shape=(rev_id.categories.size, asin.categories.size), dtype = 'd')

In [14]:
rating_matrix = sparse_matrix

#### save matrix by pickle

In [58]:
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'wb') as handle:
    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'wb') as handle:
    pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### load matrix from pickle 

In [5]:
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'rb') as handle:
    X_train = pickle.load(handle)  
    
with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'rb') as handle2:
    rating_matrix = pickle.load(handle2)

## 2. build model

### matrix factorization model

In [15]:
class MF():
    def __init__(self , rating_matrix ):
        self.num_u = rating_matrix.shape[0] #5551
        self.num_v = rating_matrix.shape[1] #16980
        
        self.u_lambda = 100
        self.v_lambda = 0.1
        
        self.k = 50 #latent維度
        self.a = 1
        self.b = 0.01
        
        self.R = np.mat(rating_matrix)
        self.C = np.mat(np.ones(self.R.shape)) * self.b
        self.C[np.where(self.R>0)] = self.a
        
        self.I_U = np.mat(np.eye(self.k) * self.u_lambda)
        self.I_V = np.mat(np.eye(self.k) * self.v_lambda)
        
        self.U = np.mat(np.random.normal(0 , 1/self.u_lambda , size=(self.k,self.num_u)))
        self.V = np.mat(np.random.normal(0 , 1/self.v_lambda , size=(self.k,self.num_v)))
                        

    def test(self):
        print( ((U_cut*self.R[np.ravel(np.where(self.R[:,j]>0)[1]),j] + self.v_lambda * self.V_sdae[j])).shape)
        
    
    def ALS(self , V_sdae):
        self.V_sdae = np.mat(V_sdae)
        
        V_sq = self.V * self.V.T * self.b
        for i in range(self.num_u):
            idx_a = np.ravel(np.where(self.R[i,:]>0)[1])
            V_cut = self.V[:,idx_a]
            self.U[:,i] = np.linalg.pinv( V_sq+ V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T) #V_sq+V_cut*V_cut.T*a_m_b = VCV^T
        
        U_sq = self.U * self.U.T * self.b
        for j in range(self.num_v):
            idx_a = np.ravel(np.where(self.R[:,j]>0)[1])
            U_cut = self.U[:,idx_a]
            self.V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*(self.a-self.b)+self.I_V)* (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))
        
        return self.U ,self.V

#### masking noise 

In [16]:
def mask(corruption_level, shape):
    mask = np.random.binomial(1, 1 - corruption_level, shape)
    return mask

def add_noise(x , corruption_level ):
    mask_ = mask(corruption_level , x.shape)
    print("Mask shape: " + str(mask_.shape))
    x = np.multiply(x, mask_)
    print("Noising completed..:" + str(x.shape))
    return x

In [52]:
class CDL():
    def __init__(self , rating_matrix , item_infomation_matrix):
        # model參數設定
        self.n_input = 10000
        self.n_hidden1 = 500
        self.n_hidden2 = 50
        self.k = 50
        
        self.lambda_w = 1
        self.lambda_n = 1
        self.lambda_u = 1
        self.lambda_v = 1
        
        self.drop_ratio = 0.1
        self.learning_rate = 0.001
        self.epochs = 500
        self.batch_size = 32
        
        self.num_u = rating_matrix.shape[0]
        self.num_v = rating_matrix.shape[1]
        intializer = tf.variance_scaling_initializer()
        
        self.Weights = {
            #'w1' : tf.Variable(tf.random_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'w2' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            #'w3' : tf.Variable(tf.random_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'w4' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))   
            'w1' : tf.Variable(intializer([self.n_input, self.n_hidden1]), dtype=tf.float32),
            'w2' : tf.Variable(intializer([self.n_hidden1, self.n_hidden2]), dtype=tf.float32),
            'w3' : tf.Variable(intializer([self.n_hidden2, self.n_hidden1]), dtype=tf.float32),
            'w4' : tf.Variable(intializer([self.n_hidden1, self.n_input]), dtype=tf.float32)   
        }
        self.Biases = {
            #'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))
            'b1' : tf.Variable(tf.zeros(self.n_hidden1)),
            'b2' : tf.Variable(tf.zeros(self.n_hidden2)),
            'b3' : tf.Variable(tf.zeros(self.n_hidden1)),
            'b4' : tf.Variable(tf.zeros(self.n_input))
        }
        
        self.item_infomation_matrix = item_infomation_matrix
        self.rating_matrix = rating_matrix
    
        self.build_model()
        self.saver = tf.train.Saver()
        
    def encoder(self , x , drop_ratio):
        w1 = self.Weights['w1']
        b1 = self.Biases['b1']
        L1 = tf.nn.relu( tf.matmul(x,w1) + b1 )
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )
        
        w2 = self.Weights['w2']
        b2 = self.Biases['b2']
        L2 = tf.nn.relu( tf.matmul(L1,w2) + b2 )
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)
        
        return L2
    
    def decoder(self , x , drop_ratio):
        w3 = self.Weights['w3']
        b3 = self.Biases['b3']
        L3 = tf.nn.relu(tf.matmul(x,w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4 = self.Weights['w4']
        b4 = self.Biases['b4']
        L4 = tf.nn.relu(tf.matmul(L3,w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)

        return L4
    
    def build_model(self):
        self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))
        self.model_drop_ratio = tf.placeholder(tf.float32)
        
        self.V_sdae = self.encoder( self.model_X_0 , self.model_drop_ratio )
        self.y_pred = self.decoder( self.V_sdae , self.model_drop_ratio )
        
        self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w) + tf.nn.l2_loss(b) 
                                             for w,b in zip(self.Weights.values() , self.Biases.values())])
        loss_r =1/2 * self.lambda_w * self.Regularization
        loss_a =1/2 * self.lambda_n * tf.reduce_sum(tf.pow( self.model_X_c - self.y_pred , 2 ))
        loss_v =1/2 * self.lambda_v * tf.reduce_sum(tf.pow( self.model_V - self.V_sdae , 2 ))
        
        self.Loss = loss_r + loss_a + loss_v
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)
    
    def training(self):
        #np.random.shuffle(self.item_infomation_matrix) #random index of train data
        
        self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)
        
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        
        mf = MF(self.rating_matrix)
        
        for epoch in range(0, self.epochs):
            print("%d / %d"%(epoch+1 , self.epochs))
            
            V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise , self.model_drop_ratio : 0.1})
            
            U , V = mf.ALS(V_sdae)
            V = np.resize(V, (self.num_v , 50))
            
            for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):
                X_train_batch = self.item_infomation_matrix_noise[i:i+self.batch_size]
                y_train_batch = self.item_infomation_matrix[i:i+self.batch_size]
                V_batch = V[i:i+self.batch_size]
                
                _ , my_loss = sess.run([self.optimizer, self.Loss] , feed_dict={self.model_X_0 :X_train_batch , self.model_X_c : y_train_batch , self.model_V:V_batch, self.model_drop_ratio : 0.1})
            
            print("EPOCH %i LOSS %d" % (epoch, my_loss))
            
            if epoch % 50 == 0:
                os.mkdir('./models/%s/' % epoch)
                self.saver.save(sess, './models/%s/model_.ckpt' % epoch)
        
        sess.close()
        return U, V

In [53]:
item_infomation_matrix.shape

(10666, 10000)

In [54]:
rating_matrix.shape

(24280, 10666)

In [56]:
cdl = CDL(rating_matrix.todense() , item_infomation_matrix)
U, V = cdl.training() #188910

Mask shape: (10666, 10000)
Noising completed..:(10666, 10000)
1 / 500
EPOCH 0 LOSS 38256
2 / 500
EPOCH 1 LOSS 37839
3 / 500
EPOCH 2 LOSS 45189
4 / 500
EPOCH 3 LOSS 38831
5 / 500
EPOCH 4 LOSS 45071
6 / 500
EPOCH 5 LOSS 47140
7 / 500
EPOCH 6 LOSS 41313
8 / 500
EPOCH 7 LOSS 51621
9 / 500
EPOCH 8 LOSS 41407
10 / 500
EPOCH 9 LOSS 41800
11 / 500
EPOCH 10 LOSS 36958
12 / 500
EPOCH 11 LOSS 44572
13 / 500
EPOCH 12 LOSS 43631
14 / 500
EPOCH 13 LOSS 41954
15 / 500
EPOCH 14 LOSS 41988
16 / 500
EPOCH 15 LOSS 38905
17 / 500
EPOCH 16 LOSS 41959
18 / 500
EPOCH 17 LOSS 35495
19 / 500
EPOCH 18 LOSS 36717
20 / 500
EPOCH 19 LOSS 38856
21 / 500
EPOCH 20 LOSS 36139
22 / 500
EPOCH 21 LOSS 48009
23 / 500
EPOCH 22 LOSS 42087
24 / 500
EPOCH 23 LOSS 41003
25 / 500
EPOCH 24 LOSS 38418
26 / 500
EPOCH 25 LOSS 39123
27 / 500
EPOCH 26 LOSS 38324
28 / 500
EPOCH 27 LOSS 41622
29 / 500
EPOCH 28 LOSS 36707
30 / 500
EPOCH 29 LOSS 34216
31 / 500
EPOCH 30 LOSS 48535
32 / 500
EPOCH 31 LOSS 37472
33 / 500
EPOCH 32 LOSS 35541


EPOCH 259 LOSS 39885
261 / 500
EPOCH 260 LOSS 41989
262 / 500
EPOCH 261 LOSS 40323
263 / 500
EPOCH 262 LOSS 37295
264 / 500
EPOCH 263 LOSS 41465
265 / 500
EPOCH 264 LOSS 45385
266 / 500
EPOCH 265 LOSS 31862
267 / 500
EPOCH 266 LOSS 35395
268 / 500
EPOCH 267 LOSS 36998
269 / 500
EPOCH 268 LOSS 34485
270 / 500
EPOCH 269 LOSS 31187
271 / 500
EPOCH 270 LOSS 30054
272 / 500
EPOCH 271 LOSS 36077
273 / 500
EPOCH 272 LOSS 34100
274 / 500
EPOCH 273 LOSS 37411
275 / 500
EPOCH 274 LOSS 31523
276 / 500
EPOCH 275 LOSS 32512
277 / 500
EPOCH 276 LOSS 34132
278 / 500
EPOCH 277 LOSS 29298
279 / 500
EPOCH 278 LOSS 37686
280 / 500
EPOCH 279 LOSS 30616
281 / 500
EPOCH 280 LOSS 38028
282 / 500
EPOCH 281 LOSS 35323
283 / 500
EPOCH 282 LOSS 38205
284 / 500
EPOCH 283 LOSS 38449
285 / 500
EPOCH 284 LOSS 33003
286 / 500
EPOCH 285 LOSS 47594
287 / 500
EPOCH 286 LOSS 36285
288 / 500
EPOCH 287 LOSS 41134
289 / 500
EPOCH 288 LOSS 29015
290 / 500
EPOCH 289 LOSS 29896
291 / 500
EPOCH 290 LOSS 34182
292 / 500
EPOCH 29

In [46]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (0 entries)


In [103]:
with open(r'/home/neopux/UHH/datasets/cdl_U_mx_train.pickle', 'wb') as handle:
    pickle.dump(U, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(r'/home/neopux/UHH/datasets/cdl_V_mx_train.pickle', 'wb') as handle:
    pickle.dump(V, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
with open(r'/home/neopux/UHH/datasets/cdl_U_mx_train.pickle', 'rb') as handle:
    U = pickle.load(handle)  
    
with open(r'/home/neopux/UHH/datasets/cdl_V_mx_train.pickle', 'rb') as handle2:
    V = pickle.load(handle2)

In [57]:
U.shape

(50, 24280)

In [58]:
V.shape

(10666, 50)

In [59]:
preds = U.transpose() * V.transpose()

In [60]:
preds.shape

(24280, 10666)

In [61]:
rating_matrix.shape

(24280, 10666)

## Metrics

### For training set

In [62]:
from sklearn.metrics import mean_squared_error

In [63]:
type(rating_matrix)

scipy.sparse.csr.csr_matrix

In [64]:
type(preds)

numpy.matrix

In [65]:
mean_squared_error(rating_matrix.todense(), preds) ** 0.5

0.2900202161173118

In [66]:
df_train_merged = df_train.merge(preds_df, on=['reviewerID', 'asin'])

NameError: name 'preds_df' is not defined

In [49]:
df_train_merged.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,reviewTextProc,value
0,B008CP6MA2,"[0, 0]",5,"Great game, one of the best Playstation games ...","09 5, 2013",A1GFH98ATO6D5I,"Anvibe ""anvibe""",Impressive game,1378339200,great game good playstation game the price inc...,0.002949
1,B0050SVGW8,"[0, 0]",4,"I compare this game to Mario Bros. Wii,and Don...","03 18, 2012",A2NBT073SQ4MXA,JASON,"fun,a bit childish,and a boring soundtrack.",1332028800,i compare game mario bros. wii donkey kong cou...,0.005494


In [51]:
mean_squared_error(df_train_merged.overall, df_train_merged.value) ** 0.5

4.128256555954819

In [58]:
sum((df_train_merged.overall - df_train_merged.value < 1) & (df_train_merged.overall > 4))

5

### For test set

In [69]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (1 entries)


In [68]:
preds_df_unmelt = pd.DataFrame(preds, columns = col_cat.categories, index = row_cat.categories)
preds_df_unmelt.index.name = 'reviewerID'
preds_df_unmelt.columns.name = 'asin'
preds_df_unmelt.head(2)

asin,0700099867,6050036071,7100027950,7293000936,8176503290,907843905X,9625990674,9861019731,9882155456,B000003SQQ,...,B00J128FPA,B00J226358,B00J6DLPLK,B00J9P3KBS,B00JM3R6M6,B00JQ8YH6A,B00JQHU9RC,B00JXW6GE0,B00KAI3KW2,B00KHECZXO
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00263941WP7WCIL7AKWL,0.036217,-0.042771,0.142428,0.034893,-0.035349,0.060498,0.017174,0.067424,-0.027854,0.048216,...,0.032131,0.158985,0.042688,-0.108086,0.086972,-0.053289,-0.04667,-0.090321,0.033316,-0.06604
A005481137I9SCAWEF7ON,0.231518,0.21126,0.122527,0.219639,0.094767,0.090912,0.137666,0.135985,0.019068,0.063811,...,0.206081,0.309379,0.208197,-0.033302,0.167693,0.11397,0.121568,0.041478,-0.107656,0.082672


In [70]:
preds_df = preds_df_unmelt.reset_index().melt('reviewerID', var_name='asin')
preds_df.head(2)

Unnamed: 0,reviewerID,asin,value
0,A00263941WP7WCIL7AKWL,700099867,0.036217
1,A005481137I9SCAWEF7ON,700099867,0.231518


In [71]:
df_merged = df_test.merge(preds_df, on=['reviewerID', 'asin'])
df_merged.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,reviewTextProc,value
0,B0000C6EB4,"[7, 7]",5,If you're looking for a far more rewarding and...,"11 6, 2003",AWXPAJ7VG5D4Y,Philip Lochner,The Definitive WW2 FPS,1068076800,if be look far rewarding enjoyable experience ...,-0.039016
1,B000B6ML1Y,"[0, 2]",5,The game is awesome...using gadgets playing co...,"07 3, 2007",A3L2ORVGVM3UET,"A. Gift For You ""I am THE godman""",Love it,1183420800,the game awesome gadget play co op online frie...,0.003322


In [72]:
df_test.shape

(69534, 10)

In [73]:
df_merged.shape

(69378, 11)

In [74]:
mean_squared_error(df_merged.overall, df_merged.value) ** 0.5

4.186046002615581

In [33]:
sum((df_merged.overall - df_merged.value) < 3)

16418

In [43]:
idx = ((df_merged.overall - df_merged.value < 1) & (df_merged.overall > 1))

In [44]:
df_merged[idx].shape

(106, 11)