# collaborative filtering using embedding

#### import libraries

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### data 준비하기

In [3]:
beers = pd.read_csv('D:/stat_team/project/beers-breweries-and-beer-reviews/beers.csv') #맥주 정보
breweries = pd.read_csv('D:/stat_team/project/beers-breweries-and-beer-reviews/breweries.csv') #공장 정보
reviews = pd.read_csv('D:/stat_team/project/beers-breweries-and-beer-reviews/reviews.csv') #평점

In [4]:
reviews.head()

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31


In [5]:
reviews.shape

(9073128, 10)

In [6]:
### 필요한 columns: beer_id, username, score
rating = reviews.drop(['date','text','look','smell','taste','feel','overall'], axis = 1)
rating.columns = ['beer','user','rating']
rating.head()

Unnamed: 0,beer,user,rating
0,271781,bluejacket74,4.03
1,125646,_dirty_,4.5
2,125646,CJDUBYA,4.75
3,125646,GratefulBeerGuy,4.58
4,125646,LukeGude,4.31


In [7]:
### user를 number로 바꾸기
user_uniq = rating['user'].unique()
user_dict = {user_uniq[i]: i+1 for i in range(len(user_uniq))}

rating['user_id'] = rating['user'].map(user_dict)

print('{} username convert to {} unigue number'.format(user_uniq.shape[0],len(user_dict)))
rating.head()

164935 username convert to 164935 unigue number


Unnamed: 0,beer,user,rating,user_id
0,271781,bluejacket74,4.03,1
1,125646,_dirty_,4.5,2
2,125646,CJDUBYA,4.75,3
3,125646,GratefulBeerGuy,4.58,4
4,125646,LukeGude,4.31,5


In [8]:
n_users, n_beers = len(rating.user_id.unique()), len(rating.beer.unique())
print('The dataset includes {} ratings by {} unique users on {} unique beers.'.format(rating.shape[0],n_users,n_beers))

The dataset includes 9073128 ratings by 164935 unique users on 309542 unique beers.


In [9]:
### object 변수 없애고 numeric 변수만 남기기
data = rating.drop('user',axis = 1)
data = data[['user_id','beer','rating']]
data.head()

Unnamed: 0,user_id,beer,rating
0,1,271781,4.03
1,2,125646,4.5
2,3,125646,4.75
3,4,125646,4.58
4,5,125646,4.31


## Data split - train, test

In [10]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size = .2)

print('The training and testing data include {}, {} records.'.format(len(train),len(test)))

The training and testing data include 7258502, 1814626 records.


## Create model

In [11]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, concatenate, multiply, Input
from keras.optimizers import Adam

Using TensorFlow backend.


#### 1-Matrix factorization method

In [20]:
dim_embeddings = 50
bias = 1

beer_input = Input(shape=[1],name='Beer')
beer_embedding = Embedding(n_beers+1, dim_embeddings, name = 'Beer-Embedding')(beer_input)
beer_bias = Embedding(n_beers+1, bias, name = 'Beer-Bias')(beer_input)

user_input = Input(shape=[1], name = 'User')
user_embedding = Embedding(n_users+1,dim_embeddings, name = 'User-Embedding')(user_input)
user_bias = Embedding(n_users+1,bias, name = 'User-Bias')(user_input)

matrix_product = multiply([beer_embedding, user_embedding])
matrix_product = Dropout(0.2)(matrix_product)

input_terms = concatenate([matrix_product, user_bias, beer_bias])
input_terms = Flatten()(input_terms)

dense_1 = Dense(50, activation = 'relu', name = 'Dense1')(input_terms)
dense_1 = Dropout(0.2)(dense_1)
dense_2 = Dense(20, activation = 'relu', name = 'Dense2')(dense_1)
dense_2 = Dropout(0.2)(dense_2)
result = Dense(1, activation = 'relu', name = 'Activation')(dense_2)

model_mf = Model(inputs = [beer_input, user_input], outputs = result)

model_mf.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Beer (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Beer-Embedding (Embedding)      (None, 1, 50)        15477150    Beer[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 50)        8246800     User[0][0]                       
____________________________________________________________________________________________

## Training

In [21]:
opt_adam = Adam(lr = 0.002)

model_mf.compile(optimizer = opt_adam, loss = ['mse'], metrics = ['mean_absolute_error'])

history_mf = model_mf.fit([train['user_id'],train['beer']],train['rating'],batch_size = 50,
                          validation_split = 0.005, epochs = 4, verbose = 0)

InvalidArgumentError: indices[1,0] = 333679 is not in [0, 164936)
	 [[{{node User-Embedding_1/embedding_lookup}}]]

#### 2-Tabular data method

In [22]:
dim_embedding_user = 50
dim_embedding_beer = 50

# beer embedding
beer_input = Input(shape = [1], name = 'Beer')
beer_embedding = Embedding(n_beers+1, dim_embedding_beer+1, name = 'Beer-Embedding')(beer_input)
beer_vec = Flatten(name = 'Beer-Flatten')(beer_embedding)
beer_vec = Dropout(0.2)(beer_vec)

# user embedding
user_input = Input(shape = [1], name = 'User')
user_embedding = Embedding(n_users+1, dim_embedding_user+1, name = 'User-Embedding')(user_input)
user_vec = Flatten(name = 'User-Flatten')(user_embedding)
user_vec = Dropout(0.2)(user_vec)

# concat flattened values
concat = concatenate([beer_vec,user_vec])
concat_dropout = Dropout(0.2)(concat)

# add Dense Layer(point)
dense_1 = Dense(20, name = 'Fully-Connected1', activation = 'relu')(concat)
dense_2 = Dense(30, name = 'Fully-connected2', activation = 'relu')(dense_1)

# output
result = Dense(1, activation = 'relu', name = 'Activaton')(dense_2)
#result = Dense(1, activation = 'sigmoid', name = 'Activation')(dense_2)

# model with 2 input and 1 output
model_tabular = Model([user_input,beer_input],result)

# model summary
model_tabular.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Beer (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Beer-Embedding (Embedding)      (None, 1, 51)        15786693    Beer[0][0]                       
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 51)        8411736     User[0][0]                       
____________________________________________________________________________________________

## Training

In [23]:
np.random.seed(999)

opt_adam = Adam(lr = 0.002)

# compile model
model_tabular.compile(optimizer = opt_adam, loss = ['mse'], metrics = ['mean_absolute_error'])

# history
history_tabular = model_tabular.fit([train['user_id'],train['beer']], train['rating'],
                                   batch_size = 500, validation_split = 0.005, epochs = 4, verbose = 1)

Train on 7222209 samples, validate on 36293 samples
Epoch 1/4


InvalidArgumentError: indices[67,0] = 324162 is not in [0, 309543)
	 [[{{node Beer-Embedding_2/embedding_lookup}}]]

In [None]:
history = pd.DataFrame(history_tabular.history)

### Prediction

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def get_array(series):
    return np.array([[element] for element in series])

predictions = model_tabular.predict([get_array(test['user_id']), get_array(test['beer'])])
print('rmse on test data is',(mean_squared_error(test['rating'],predictions))**.5)