In [1]:
import torch
import pandas as pd
import numpy as np
import os
from scipy.sparse import coo_matrix
from torch.utils.data.dataset import random_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [2]:
# if not os.path.isfile('dataset.csv'):
#     dataset = open('dataset.csv', mode='w')

# files = ['combined_data_1.txt',
#          'combined_data_2.txt',
#          'combined_data_3.txt',
#          'combined_data_4.txt']

# # Remove the line with movie_id: and add a new column of movie_id
# # Combine all data files into a csv file
# for file in files:
#   print("Opening file: {}".format(file))
#   with open(file) as f:
#     for line in f:
#         line = line.strip()
#         if line.endswith(':'):
#             movie_id = line.replace(':', '')
#         else:
#             dataset.write(movie_id + ',' + line)
#             dataset.write('\n')
# dataset.close()

# # Read all data into a pd dataframe
# data = pd.read_csv('dataset.csv', names=['movie_id', 'user_id','rating','date'])

# data

In [3]:
device = torch.device('cuda:0' if  torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [4]:
data = pd.read_csv('dataset.csv', names=['movie_id', 'user_id','rating','date'])

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   movie_id  int64 
 1   user_id   int64 
 2   rating    int64 
 3   date      object
dtypes: int64(3), object(1)
memory usage: 3.0+ GB


In [6]:
data = data.astype({ 'rating':float })

In [7]:
data.head()

Unnamed: 0,movie_id,user_id,rating,date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03


In [8]:
data.tail()

Unnamed: 0,movie_id,user_id,rating,date
100480502,17770,1790158,4.0,2005-11-01
100480503,17770,1608708,3.0,2005-07-19
100480504,17770,234275,1.0,2004-08-07
100480505,17770,255278,4.0,2004-05-28
100480506,17770,453585,2.0,2005-03-10


## Data Split

In [9]:
coo_val = np.array(data['rating'])
coo_col = np.array(data['movie_id'])
coo_row = np.array(data['user_id'])

In [10]:
coo_col

array([    1,     1,     1, ..., 17770, 17770, 17770])

In [11]:
data.nunique()

movie_id     17770
user_id     480189
rating           5
date          2182
dtype: int64

In [12]:
max(data['user_id'])

2649429

user_id를 그냥 row의 index로 써버리면 너무 큰 matrix가 나온다.

In [13]:
user, indices = np.unique(coo_row, return_inverse=True) # user의 id와 가진 user_id의 index 반환

In [14]:
print(user) # 존재하는 user id
print(indices) # data user_id를 index로 변환

[      6       7       8 ... 2649421 2649426 2649429]
[270045 149546 160878 ...  42442  46235  82582]


In [15]:
max(indices)

480188

In [16]:
a = np.array([5,2,6,2,7,5,6,8,2,9])

u,ids = np.unique(a,return_inverse = True)
print(u)
print(ids)

[2 5 6 7 8 9]
[1 0 2 0 3 1 2 4 0 5]


In [17]:
coo = coo_matrix((coo_val, (indices, coo_col))) #coordinate

In [18]:
shape = coo.shape
print(shape)

(480189, 17771)


In [19]:
from sklearn.model_selection import train_test_split
train_row, test_row, train_col, test_col, train_data, test_data = train_test_split( 
    coo.row, coo.col, coo.data, test_size=0.2, random_state=42)

In [20]:
train_row, dev_row, train_col, dev_col, train_data, dev_data = train_test_split( 
    train_row, train_col, train_data, test_size=0.1, random_state=42)

In [21]:
print(len(train_data))
print(len(test_data))
print(len(dev_data))

72345964
20096102
8038441


## DataSet

In [22]:
class RatingDataSet(Dataset):
    def __init__(self, users, movies, ratings):
        super().__init__()
        self.users = users
        self.movies = movies
        self.ratings = ratings
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx): 
        
        user = self.users[idx]
        movie = self.movies[idx]
        rating = self.ratings[idx]
        
        
        user = torch.as_tensor(np.array(user).astype('long'))
        movie = torch.as_tensor(np.array(movie).astype('long'))
        rating = torch.as_tensor(np.array(rating).astype('float32'))
        
        return user, movie, rating

In [23]:
train_dataset = RatingDataSet(train_row, train_col, train_data)
dev_dataset = RatingDataSet(dev_row, dev_col, dev_data)
test_dataset = RatingDataSet(test_row, test_col, test_data)

In [24]:
train_loader = DataLoader(dataset=train_dataset, batch_size=10000, shuffle=True)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=10000)
test_loader = DataLoader(dataset=test_dataset, batch_size=10000)

In [25]:
# train_movies = torch.LongTensor(train_col)
# train_users = torch.LongTensor(train_row)
# train_ratings = torch.FloatTensor(train_data)
# dev_movies = torch.LongTensor(dev_col)
# dev_users = torch.LongTensor(dev_row)
# dev_ratings = torch.FloatTensor(dev_data)
# test_movies = torch.LongTensor(test_col)
# test_users = torch.LongTensor(test_row)
# test_ratings = torch.FloatTensor(test_data)

In [26]:
# print(train_movies.unique())
# # print(train_movies.max())
# print(train_users.unique())
# # print(train_users.max())

##  Matrix Factorization 

### Model

In [27]:
rank = 50
numMovies = data['movie_id'].nunique()+1 # 영화 개수
numUsers = data['user_id'].nunique()+1 # 유저 수
print(numMovies)
print(numUsers)

17771
480190


In [28]:
class MF(torch.nn.Module):
    def __init__(self, n_users, n_movies, rank=20):
        super(MF, self).__init__()
        self.users = torch.nn.Embedding(n_users, rank)
        self.movies = torch.nn.Embedding(n_movies, rank) 
        
    def forward(self, user, movie):
        return torch.sum(self.users(user) * self.movies(movie), dim= 1)

In [29]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))



In [30]:
model = MF(numUsers, numMovies, rank=20).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= 0.001)
# criterion = torch.nn.MSELoss() #Mean Squared Error
criterion = RMSELoss()


### Embedding이란?
학습을 한다는 것은 행렬과 벡터의 연산으로 가중치를 조절하는 것이기 때문에 **입력시키는 무언가(예를 들면 단어, 문장 등 토큰)를 벡터로 변경시켜주는 작업**

## Train

In [31]:
# ## permutation은 array를 복사해서 셔플을 한다. shuffle은 array를 셔플해서 INPLACE를 한다.
# p = np.random.permutation(len(train_movies)) 

# train_movies, train_users, train_ratings = train_movies[p], train_users[p], train_ratings[p]

In [36]:
accuracys = []
costs = []

In [35]:
for epoch in range(5):

    cost = 0
    batches = 0
#     num_records = len(train_users)
    for user, movie, rating in tqdm(train_loader):
    #for row, col, rating in zip(train_users, train_movies, train_ratings):
#         row = train_users[i]
#         col = train_movies[i]
#         rating = train_ratings[i]
        
#         #Tensor로 변경
#         rating = torch.FloatTensor([rating])
#         row = torch.LongTensor([row])
#         col = torch.LongTensor([col])
        user , movie, rating = user.to(device) , movie.to(device), rating.to(device)
    
        
        # Predict and calculate loss
        optimizer.zero_grad()
        predict = model(user, movie)
#         loss = criterion(predict, rating)
        loss = criterion(predict,rating)

        # Backpropagate
        loss.backward()
        # Update the parameters
        optimizer.step()

        cost += loss.item()
        batches += 1
        
    cost /=batches

    costs.append(cost)
    print('epoch : {}, cost = {:.6f}'.format(epoch+1, cost))
    print("Dev")
    test(dev_loader, model)

  0%|          | 0/7235 [00:00<?, ?it/s]

epoch : 1, cost = 1.498285
Dev


  0%|          | 0/804 [00:00<?, ?it/s]

Accuracy : 2.4880446858333016e-07 ()


  0%|          | 0/7235 [00:00<?, ?it/s]

epoch : 2, cost = 1.026589
Dev


  0%|          | 0/804 [00:00<?, ?it/s]

Accuracy : 2.4880446858333016e-07 ()


  0%|          | 0/7235 [00:00<?, ?it/s]

epoch : 3, cost = 0.964339
Dev


  0%|          | 0/804 [00:00<?, ?it/s]

Accuracy : 0.0 ()


  0%|          | 0/7235 [00:00<?, ?it/s]

epoch : 4, cost = 0.946075
Dev


  0%|          | 0/804 [00:00<?, ?it/s]

Accuracy : 0.0 ()


  0%|          | 0/7235 [00:00<?, ?it/s]

epoch : 5, cost = 0.936340
Dev


  0%|          | 0/804 [00:00<?, ?it/s]

Accuracy : 3.7320671708584996e-07 ()


## Test

In [None]:
print(criterion)

In [46]:
def test(dev_loader, model):

    predictions = torch.tensor([], dtype=torch.float).to(device)
    actual = torch.tensor([], dtype=torch.float).to(device)
    
#     n_predict = 0
#     n_correct = 0
    
    with torch.no_grad():
        model.eval()
        for user, item, rating in tqdm(dev_loader):
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            predicted = model(user, item)
#             print(y_hat.shape)
#             _, predicted = torch.max(y_hat)
            
#             n_predict += len(predicted)
#             n_correct += (rating == predicted).sum()
            predictions = torch.cat((predictions, predicted), 0)
            actual = torch.cat((actual, rating), 0)
    
    predictions = predictions.cpu().numpy()
    actual = actual.cpu().numpy()
    rmse = np.sqrt(mean_squared_error(predictions, actual))
    
    return rmse
#     accuracy = n_correct/n_predict
#     print(f"Accuracy : {accuracy} ()")
    

#     accuracys.append(accuracy)

In [48]:
from sklearn.metrics import mean_squared_error

In [49]:
test_rmse = test(test_loader,model)
print(f'test rmse : {test_rmse}')

  0%|          | 0/2010 [00:00<?, ?it/s]

test rmse : 0.9448485970497131


In [None]:
data.head()

In [None]:
data_822109 = data[(data['user_id']== 822109) & (data['rating'] == 5)]
data_822109

In [None]:
movie_822109 = data_822109.movie_id.reset_index()
movie_822109 = movie_822109.drop('index', axis=1)
movie_822109.head()

In [None]:
np.where(user == 822109)

In [None]:
torch.tensor(149546)

In [None]:
estimate_rate = []
for item in movie_822109['movie_id']:
    user = torch.tensor(149546).to(device)
    item = torch.tensor(item).to(device)
    predicted = model(user, item)
    estimate_rate.append(predicted)

## Save

In [38]:
PATH = 'RMSE.tar'
torch.save({
#             'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'loss': criterion
            }, PATH)

## Load

In [None]:
model = MF(numUsers, numMovies, rank=20).to(device)
checkpoint = torch.load('model.pt')
model.load_state_dict(checkpoint)

In [33]:
checkpoint = torch.load('RMSE.tar')
model = MF(numUsers, numMovies, rank=20).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
# epoch = checkpoint['epoch']
loss = checkpoint['loss']

# model.eval()
# # - or -
# model.train()