In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import gensim
import warnings
warnings.filterwarnings(action='ignore')



# 1) data load

In [2]:
movie = pd.read_csv('ratings.csv', low_memory = False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [3]:
# 개봉한지 오래된 순서대로 정렬
movie_df = movie.sort_values(by='timestamp', ascending = True).reset_index()
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


영화의 title이 없음. 영화 metadata를 불러와 movieID와 matching 시켜줌

In [4]:
movie_meta = pd.read_csv('movies_metadata.csv', low_memory = False)
movie_meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [5]:
movie_meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [6]:
# metadata의 id column의 이름을 movieId로 바꿈
movie_meta = movie_meta.rename(columns = {'id':'movieId'})

# movieId column의 type을 String으로 바꿈
movie_df['movieId'] = movie_df['movieId'].astype(str)
movie_meta['movieId'] = movie_meta['movieId'].astype(str)

movie_df = pd.merge(movie_df, movie_meta[['movieId', 'original_title']], how = 'left', on = 'movieId')

In [7]:
movie_df.head(3)

Unnamed: 0,index,userId,movieId,rating,timestamp,original_title
0,52635,383,21,3.0,789652009,The Endless Summer
1,52641,383,47,5.0,789652009,
2,52684,383,1079,3.0,789652009,


In [8]:
# 'original_title' column이 null값인 row를 drop함
movie_df = movie_df[movie_df['original_title'].notnull()].reset_index(drop=True)

In [9]:
# user_id 별로 groupby 진행
agg = movie_df.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Jay and Silent Bob Strike Back, Vivement dima..."
2,"[Terminator 3: Rise of the Machines, The Conve..."
3,"[300, The Killing, Shortbus, Finding Neverland..."
4,"[David, The Wedding Planner, Casablanca, Sleep..."
5,"[Gleaming the Cube, Cool Hand Luke, Hidalgo, U..."


In [10]:
movie_df['original_title'].unique()

array(['The Endless Summer', 'Jarhead', '彼女の想いで', ...,
       'The Lonedale Operator', 'Violeta se fue a los cielos',
       'To Kill a Priest'], dtype=object)

# 2) Word2Vec: CBOW

사용자가 시청한 영화 하나하나를 단어로 보고, 영화 간의 유사도 계산

In [352]:
# int 형식은 Word2Vec에서 학습이 진행되지 않으므로 String으로 변경함 (ex. 300)
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

#### Softmax_loss Function

In [353]:
def softmax_loss(score):
    
    ## forward pass
    softmax = np.exp(score) / np.sum(np.exp(score))
    N = score[0].shape[0]
    loss = -np.sum(np.log(softmax), axis=1) #softmax loss
    loss /= N
    
    ## backward pass
    ds = softmax - 1
    
    return loss, ds

#### CBOW class 

In [375]:
class CBOW(object):
    def __init__(
        self,
        hidden_dim,
        input_dim,
        num_classes,
        windows
    ):
        self.windows = windows
        self.params = {}
        for i in range(windows*2):
            idx = str(i)
            W_name = 'W_in_' + idx
            self.params[W_name] = np.random.randn(input_dim, hidden_dim)
        
        self.params['W_out'] = np.random.randn(hidden_dim, num_classes)
    
    def loss(self, X, y, j, mode):
        
        # forward pass
        h, count = 0, self.windows*2
        for i in range(self.windows*2):
            if i < self.windows:
                if j-i<0:
                    count -= 1
                    continue
                idx = j-i
            else:
                if j+i>=X.shape[0]:
                    count -= 1 
                    continue
                idx = j+i
        h += np.matmul(X[idx], self.params['W_in_'+str(i)])
        h /= count
        h = h.reshape(1, -1)
        scores = np.matmul(h, self.params['W_out'])
        
        if mode == "test":
            return scores
        
        grads = {}
        # softmax function
        loss, ds = softmax_loss(scores)
        
        # backward pass
        grads['W_out'] = np.dot(h.T, ds)
        dh = np.dot(ds, self.params['W_out'].T)
        
        for i in range(self.windows*2):
            if i < self.windows:
                if j-i<0:
                    continue
                idx = j-i
            else:
                if j+i>=X.shape[0]:
                    continue
                idx = j+i
            grads['W_in_' + str(i)] = np.dot(X[idx].T.reshape(-1, 1) ,dh/2)
        
        return loss, grads

In [376]:
X = np.array([
                [1, 0, 0, 0, 0, 0], # you
                [0, 1, 0, 0, 0, 0], # say
                [0, 0, 1, 0, 0, 0], # goodbye
                [0, 0, 0, 1, 0, 0], # and
                [0, 0, 0, 0, 1, 0], # I
                [0, 1, 0, 0, 0, 0], # say    
                [0, 0, 0, 0, 0, 1], # hello 
            ])
y = np.array([
                [1, 0, 0, 0, 0, 0], # you
                [0, 1, 0, 0, 0, 0], # say
                [0, 0, 1, 0, 0, 0], # goodbye
                [0, 0, 0, 1, 0, 0], # and
                [0, 0, 0, 0, 1, 0], # I
                [0, 1, 0, 0, 0, 0], # say    
                [0, 0, 0, 0, 0, 1], # hello 
            ])

In [377]:
cbow_model = CBOW(hidden_dim = 3,
                  input_dim = 6, 
                  num_classes = 6,
                  windows = 1)
epochs = 5
learning_rates = 3e-3

In [378]:
# train
for i in range(y.shape[0]):
    for e in range(epochs):
        loss, grads = cbow_model.loss(X, y[i], i, "train")
        print(f"epochs {e+1} | loss: {loss}")
        for name in grads:
            cbow_model.params[name] -= learning_rates * grads[name]
    print("--------------------------------------------")

epochs 1 | loss: [1.84226408]
epochs 2 | loss: [1.84195558]
epochs 3 | loss: [1.84164976]
epochs 4 | loss: [1.84134663]
epochs 5 | loss: [1.84104614]
--------------------------------------------
epochs 1 | loss: [2.25286539]
epochs 2 | loss: [2.25238992]
epochs 3 | loss: [2.25192391]
epochs 4 | loss: [2.2514673]
epochs 5 | loss: [2.25102003]
--------------------------------------------
epochs 1 | loss: [1.81182335]
epochs 2 | loss: [1.81182453]
epochs 3 | loss: [1.81182722]
epochs 4 | loss: [1.8118314]
epochs 5 | loss: [1.81183708]
--------------------------------------------
epochs 1 | loss: [2.50415215]
epochs 2 | loss: [2.50077278]
epochs 3 | loss: [2.49742568]
epochs 4 | loss: [2.49411057]
epochs 5 | loss: [2.49082715]
--------------------------------------------
epochs 1 | loss: [1.84064894]
epochs 2 | loss: [1.84035165]
epochs 3 | loss: [1.84005702]
epochs 4 | loss: [1.83976502]
epochs 5 | loss: [1.83947565]
--------------------------------------------
epochs 1 | loss: [1.9510003

In [379]:
# test: you ? goodbye and I say hello
X_test = np.array([
                   [0, 1, 0, 0, 0, 0], # say
                   [0, 0, 0, 1, 0, 0] # and
                  ])
print(cbow_model.loss(X_test, None, 1, "test"))

[[0.91585752 0.09535075 1.10113623 0.25500447 0.83198836 0.16955488]]
