# CDL

### import module

In [1]:
import numpy as np
import pickle
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer, one_hot
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
#init random seed
np.random.seed(5)

Using TensorFlow backend.


## 1. Data Preprocessing

In [2]:
df = pd.read_json('D:/Datasets/amazon_reviews/Video_Games_5_proc.json')

In [3]:
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0700099867,"[8, 12]",1,Installing the game was a struggle (because of...,instal game struggle game window live bugs).so...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,0700099867,"[0, 0]",4,If you like rally cars get this game you will ...,if like rally car game fun it orient 34;europe...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400
10,0700099867,"[1, 1]",5,I'm not quite finished with the game's DiRT To...,i be finish game 's dirt tour mode i believe i...,"06 28, 2011",A38NXTZUFB1O2K,FiSH,Best in the series!,1309219200
100,9882155456,"[0, 0]",3,They work fine but i feel like they have a lit...,they work fine feel like little lag control us...,"01 7, 2014",A19IX3U60WJL1V,Edgar,works but flawed.,1389052800
1000,B00000I1BE,"[8, 10]",5,This game in my opinion is the best of all the...,this game opinion good castlevania series far ...,"10 3, 2000",A2U7ABLSDAITGC,D. Lewis,Castlevania Mania,970531200


In [4]:
reviews = df.groupby('asin').reviewTextProc.agg(' '.join)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10000)
vectorizer.fit(reviews.values)

X_train = vectorizer.transform(reviews.values)

In [6]:
print(X_train.shape)

(10672, 10000)


In [7]:
list(vectorizer.vocabulary_.items())[0:10]

[('axel', 850),
 ('nt', 6034),
 ('nfl', 5934),
 ('disapoint', 2555),
 ('everytime', 3112),
 ('understandably', 9274),
 ('interrupt', 4680),
 ('hair', 4056),
 ('museum', 5789),
 ('suit', 8595)]

In [8]:
list(vectorizer.stop_words_)[0:10]

['doubleheader',
 'tarturus',
 'starvation',
 'dimensionsfor',
 '47bundle',
 'whimpers',
 'gumball',
 'anf',
 'quoteable',
 'visaul']

In [9]:
X_train

<10672x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 6711786 stored elements in Compressed Sparse Row format>

#### build rating matrix citeulike-a

In [10]:
asin = CategoricalDtype(sorted(df.asin.unique()), ordered=True)
rev_id = CategoricalDtype(sorted(df.reviewerID.unique()), ordered=True)

row_cat = df.reviewerID.astype(rev_id).cat
col_cat = df.asin.astype(asin).cat

row = row_cat.codes
col = col_cat.codes

sparse_matrix = csr_matrix((df["overall"].values, (row, col)), \
                           shape=(rev_id.categories.size, asin.categories.size), dtype = 'd')

In [11]:
rating_matrix = sparse_matrix

#### save matrix by pickle

In [58]:
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'wb') as handle:
    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'wb') as handle:
    pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### load matrix from pickle 

In [5]:
with open(r'D:/Datasets/amazon_reviews/cdl_item_infomation_matrix.pickle', 'rb') as handle:
    X_train = pickle.load(handle)  
    
with open(r'D:/Datasets/amazon_reviews/cdl_rating_matrix.pickle', 'rb') as handle2:
    rating_matrix = pickle.load(handle2)

## 2. build model

### matrix factorization model

In [12]:
class MF():
    def __init__(self , rating_matrix ):
        self.num_u = rating_matrix.shape[0] #5551
        self.num_v = rating_matrix.shape[1] #16980
        
        self.u_lambda = 100
        self.v_lambda = 0.1
        
        self.k = 50 #latent維度
        self.a = 1
        self.b = 0.01
        
        self.R = np.mat(rating_matrix)
        self.C = np.mat(np.ones(self.R.shape)) * self.b
        self.C[np.where(self.R>0)] = self.a
        
        self.I_U = np.mat(np.eye(self.k) * self.u_lambda)
        self.I_V = np.mat(np.eye(self.k) * self.v_lambda)
        
        self.U = np.mat(np.random.normal(0 , 1/self.u_lambda , size=(self.k,self.num_u)))
        self.V = np.mat(np.random.normal(0 , 1/self.v_lambda , size=(self.k,self.num_v)))
                        

    def test(self):
        print( ((U_cut*self.R[np.ravel(np.where(self.R[:,j]>0)[1]),j] + self.v_lambda * self.V_sdae[j])).shape)
        
    
    def ALS(self , V_sdae):
        self.V_sdae = np.mat(V_sdae)
        
        V_sq = self.V * self.V.T * self.b
        for i in range(self.num_u):
            idx_a = np.ravel(np.where(self.R[i,:]>0)[1])
            V_cut = self.V[:,idx_a]
            self.U[:,i] = np.linalg.pinv( V_sq+ V_cut * V_cut.T * (self.a-self.b) + self.I_U )*(V_cut*self.R[i,idx_a].T) #V_sq+V_cut*V_cut.T*a_m_b = VCV^T
        
        U_sq = self.U * self.U.T * self.b
        for j in range(self.num_v):
            idx_a = np.ravel(np.where(self.R[:,j]>0)[1])
            U_cut = self.U[:,idx_a]
            self.V[:,j] = np.linalg.pinv(U_sq+U_cut*U_cut.T*(self.a-self.b)+self.I_V)* (U_cut*self.R[idx_a,j] + self.v_lambda * np.resize(self.V_sdae[j],(self.k,1)))
        
        return self.U ,self.V

#### masking noise 

In [19]:
def mask(corruption_level, shape):
    mask = np.random.binomial(1, 1 - corruption_level, shape)
    return mask

def add_noise(x , corruption_level ):
    mask_ = mask(corruption_level , x.shape)
    x = np.multiply(x, mask_)
    print("Noising completed..:" + str(x.shape))
    return x

In [15]:
class CDL():
    def __init__(self , rating_matrix , item_infomation_matrix):
        # model參數設定
        self.n_input = 8000
        self.n_hidden1 = 200
        self.n_hidden2 = 50
        self.k = 50
        
        self.lambda_w = 1
        self.lambda_n = 1
        self.lambda_u = 1
        self.lambda_v = 1
        
        self.drop_ratio = 0.1
        self.learning_rate = 0.001
        self.epochs = 10
        self.batch_size = 32
        
        self.num_u = rating_matrix.shape[0]
        self.num_v = rating_matrix.shape[1]
        
        self.Weights = {
            'w1' : tf.Variable(tf.random_normal( [self.n_input , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'w2' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            'w3' : tf.Variable(tf.random_normal( [self.n_hidden2 , self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'w4' : tf.Variable(tf.random_normal( [self.n_hidden1 , self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))   
        }
        self.Biases = {
            'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))
        }
        
        self.item_infomation_matrix = item_infomation_matrix
        self.rating_matrix = rating_matrix
    
        self.build_model()
        
    def encoder(self , x , drop_ratio):
        w1 = self.Weights['w1']
        b1 = self.Biases['b1']
        L1 = tf.nn.sigmoid( tf.matmul(x,w1) + b1 )
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio )
        
        w2 = self.Weights['w2']
        b2 = self.Biases['b2']
        L2 = tf.nn.sigmoid( tf.matmul(L1,w2) + b2 )
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)
        
        return L2
    
    def decoder(self , x , drop_ratio):
        w3 = self.Weights['w3']
        b3 = self.Biases['b3']
        L3 = tf.nn.sigmoid(tf.matmul(x,w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4 = self.Weights['w4']
        b4 = self.Biases['b4']
        L4 = tf.nn.sigmoid(tf.matmul(L3,w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)

        return L4
    
    def build_model(self):
        self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))
        self.model_drop_ratio = tf.placeholder(tf.float32)
        
        self.V_sdae = self.encoder( self.model_X_0 , self.model_drop_ratio )
        self.y_pred = self.decoder( self.V_sdae , self.model_drop_ratio )
        
        self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w) + tf.nn.l2_loss(b) 
                                             for w,b in zip(self.Weights.values() , self.Biases.values())])
        loss_r =1/2 * self.lambda_w * self.Regularization
        loss_a =1/2 * self.lambda_n * tf.reduce_sum(tf.pow( self.model_X_c - self.y_pred , 2 ))
        loss_v =1/2 * self.lambda_v * tf.reduce_sum(tf.pow( self.model_V - self.V_sdae , 2 ))
        
        self.Loss = loss_r + loss_a + loss_v
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)
    
    def training(self):
        #np.random.shuffle(self.item_infomation_matrix) #random index of train data
        
        #self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , 0.3)
        
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        
        mf = MF(self.rating_matrix)
        
        for epoch in range(self.epochs):
            print("%d / %d"%(epoch+1 , self.epochs))
            
            V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise , self.model_drop_ratio : 0.1})
            
            U , V = mf.ALS(V_sdae)
            V = np.resize(V,(16980 , 50))
            for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):
                X_train_batch = self.item_infomation_matrix_noise[i:i+self.batch_size]
                y_train_batch = self.item_infomation_matrix[i:i+self.batch_size]
                V_batch = V[i:i+self.batch_size]
                
                _ , my_loss = sess.run([self.optimizer, self.Loss] , feed_dict={self.model_X_0 :X_train_batch , self.model_X_c : y_train_batch , self.model_V:V_batch, self.model_drop_ratio : 0.1})
            print(my_loss)
        
        sess.close()

In [20]:
X_train.shape

(10672, 10000)

In [21]:
X_train_noised = add_noise(X_train , 0.3)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\irina\Miniconda3\envs\exmc\lib\site-packages\scipy\sparse\base.py", line 461, in __mul__
    M, N = self.shape
  File "C:\Users\irina\Miniconda3\envs\exmc\lib\site-packages\scipy\sparse\base.py", line 86, in get_shape
    def get_shape(self):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\irina\Miniconda3\envs\exmc\lib\site-packages\scipy\sparse\base.py", line 461, in __mul__
    M, N = self.shape
SystemError: PyEval_EvalFrameEx returned a result with an error set

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\irina\Miniconda3\envs\exmc\lib\site-packages\scipy\sparse\base.py", line 461, in __mul__
    M, N = self.shape
SystemError: PyEval_EvalFrameEx returned a result with an error set

During handling of the above exception, another exception occurred:

Traceback (most

KeyboardInterrupt: 

In [None]:
cdl = CDL(rating_matrix , X_train)
cdl.training()

In [16]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (4 entries)
