In [None]:
# Colab setup: mount Google Drive and work from the project folder stored on it.
from google.colab import drive
drive.mount('/content/drive')

# Add the project folder to the module search path.
import sys
sys.path.append('/content/drive/My Drive/DSAIL')

# Make the project folder the working directory so the relative paths used by
# later cells (e.g. "ml-100k/u1.base") are resolved against it.
import os
os.chdir('/content/drive/My Drive/DSAIL')

Mounted at /content/drive


In [None]:

# Build the training data from u1.base and the test data from u1.test.
# Each file is tab-separated with the columns
# ["user_id", "movie_id", "rating", "timestamp"].
import pandas as pd
import torch

columns = ["user_id", "movie_id", "rating", "timestamp"]


def _read_ratings(path):
  """Read one tab-separated ratings file into a DataFrame labelled with `columns`."""
  return pd.read_csv(path, sep='\t', names=columns)


def _to_user_movie_matrix(ratings):
  """Pivot long-format ratings into a user x movie table (NaN where no rating exists)."""
  return ratings.pivot(index='user_id', columns='movie_id', values='rating')


# Training split: long-format frame plus a dense user x movie table (0 = unrated).
file_path = "ml-100k/u1.base"
train_data = _read_ratings(file_path)
train = _to_user_movie_matrix(train_data).fillna(0)

# Test split, processed the same way.
file_path = "ml-100k/u1.test"
test_data = _read_ratings(file_path)
user_movie_matrix = _to_user_movie_matrix(test_data)
test = user_movie_matrix.fillna(0)

# GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("device", device)

device cuda


In [None]:
import numpy as np

# Each input row is the concatenation
#   [user one-hot | rated-movie one-hot | the user's normalized ratings of all movies |
#    scaled timestamp | one-hot of the movie on the previous row (same user only)]
# and the target y is the rating.

num_user_train  = train_data["user_id"].max()
num_movie_train = train_data["movie_id"].max()
num_user_test  = test_data["user_id"].max()
num_movie_test = test_data["movie_id"].max()

num_features_train = len(train_data)
num_features_test = len(test_data)

# Ratings
test_R = np.load("continous_test.npy")
train_R = np.load("continous_train.npy")

# Pad test_R with zero columns up to the training movie count (1591 -> 1682 here),
# deriving the width from the data instead of hard-coding 1682 - 1591.
# NOTE(review): appending the zero columns at the END assumes the columns already in
# test_R are aligned with movie ids 1..test_R.shape[1] - TODO confirm how
# continous_test.npy was built.
num = num_movie_train - test_R.shape[1]
test_R = np.concatenate([test_R, np.zeros((test_R.shape[0], num))], axis=1)

# Row-normalize the ratings so each user's row sums to (almost) 1; the epsilon
# avoids a division by zero for rows that contain no ratings.
normalized_ratings_test = test_R / (test_R.sum(axis=1, keepdims=True) + 1e-13)
normalized_ratings_train = train_R / (train_R.sum(axis = 1, keepdims=True) +1e-13)

# user & movie one-hot encoding
# pd.get_dummies is not used here: the movie ids that occur are not contiguous, so it
# produced only 1650 columns although there are 1683 movies (original author's note).
# user_id_onehot = pd.get_dummies(train_data['user_id'], prefix='user')
# movie_id_onehot = pd.get_dummies(train_data['movie_id'], prefix='movie')

# Min and (max - min) of each split's timestamp column; they match the values printed
# by the timestamp inspection cell of this notebook.
# NOTE(review): the test split is scaled with its own min/range rather than the
# training split's - confirm this is intended.
TRAIN_TIME_MIN, TRAIN_TIME_RANGE = 874724727, 18561911
TEST_TIME_MIN, TEST_TIME_RANGE = 874724710, 18552992


def _one_hot(ids, width):
  """Return a (len(ids), width) float matrix with a 1 in column `id - 1` of every row."""
  encoded = np.zeros((len(ids), width))
  encoded[np.arange(len(ids)), ids - 1] = 1
  return encoded


def _ratings_rows(normalized_ratings, user_ids, width):
  """Return, for every sample, the normalized rating row of that sample's user."""
  if normalized_ratings.shape[1] != width:
    raise ValueError(
        f"rating matrix has {normalized_ratings.shape[1]} columns, expected {width}")
  return normalized_ratings[user_ids - 1]


def _previous_movie_one_hot(user_ids, movie_ids, width):
  """One-hot of the movie on the preceding row, set only when that row has the same user."""
  encoded = np.zeros((len(user_ids), width))
  # Rows i >= 1 whose predecessor i - 1 belongs to the same user.
  rows = np.flatnonzero(user_ids[1:] == user_ids[:-1]) + 1
  encoded[rows, movie_ids[rows - 1] - 1] = 1
  return encoded


# Vectorized replacement for the per-row `DataFrame.iloc[i][...]` loops: same values,
# but without building a pandas Series for every one of the 80k / 20k rows.
train_user_ids = train_data["user_id"].to_numpy()
train_movie_ids = train_data["movie_id"].to_numpy()

user_train = _one_hot(train_user_ids, num_user_train)
movie_train = _one_hot(train_movie_ids, num_movie_train)
other_ratings_train = _ratings_rows(normalized_ratings_train, train_user_ids, num_movie_train)
time_train = (train_data["timestamp"]-TRAIN_TIME_MIN) / TRAIN_TIME_RANGE
last_movie_train = _previous_movie_one_hot(train_user_ids, train_movie_ids, num_movie_train)

# The test matrices deliberately use the TRAIN widths so both splits share one layout.
test_user_ids = test_data["user_id"].to_numpy()
test_movie_ids = test_data["movie_id"].to_numpy()

user_test = _one_hot(test_user_ids, num_user_train)
movie_test = _one_hot(test_movie_ids, num_movie_train)
other_ratings_test = _ratings_rows(normalized_ratings_test, test_user_ids, num_movie_train)
time_test = (test_data["timestamp"]-TEST_TIME_MIN) / TEST_TIME_RANGE
last_movie_test = _previous_movie_one_hot(test_user_ids, test_movie_ids, num_movie_train)

# Timestamps become a single column so they can be concatenated with the matrices.
time_train = time_train.values.reshape(-1, 1)
time_test = time_test.values.reshape(-1, 1)

train_features = np.concatenate([user_train, movie_train, other_ratings_train, time_train, last_movie_train], axis = 1)
test_features = np.concatenate([user_test, movie_test, other_ratings_test, time_test, last_movie_test], axis = 1)

In [None]:
# Persist the assembled feature matrices as .npy files (relative paths, so they are
# written to the current working directory) so later cells can reload them.
np.save("train_features.npy", train_features)
np.save("test_features.npy", test_features)

In [None]:
# Inspect the timestamp range of each split: min of train/test, then max of train/test.
print(train_data["timestamp"].min())
print(test_data["timestamp"].min())
print(train_data["timestamp"].max())
print(test_data["timestamp"].max())
# max - min per split (train, then test), typed in from the values printed above;
# these are the offsets/divisors used when the timestamp feature is scaled.
print(893286638 - 874724727)
print(893277702 - 874724710)

874724727
874724710
893286638
893277702
18561911
18552992


In [None]:
# Sanity check: both feature matrices should have the same number of columns.
print(train_features.shape)
print(test_features.shape)
# Largest movie id that occurs in the test split.
print(num_movie_test)

(80000, 5990)
(20000, 5990)
(80000,)
1591


In [None]:
# Largest user id and largest movie id in the training split (used as one-hot widths).
print(num_user_train, num_movie_train)

943 1682


In [None]:
import numpy as np

# Reload the feature matrices saved earlier instead of rebuilding them.
train_features = np.load("train_features.npy")
test_features = np.load("test_features.npy")
# Regression targets: the rating column of each split (pandas Series).
y_train = train_data["rating"]
y_test = test_data["rating"]

In [None]:
# Largest rating in the test targets (upper end of the target range).
y_test.max()

5

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.init as init

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FM(nn.Module):
  def __init__(self, train = train_features, y_train=y_train, y_test = y_test, num_user =943, num_movie = 1682, test=test_features, k=30, learning_rate=1e-3, epochs=100, device=device, init_std=1.0):
    '''
    Factorization Machine (FM), degree = 2.

    A model that is as generally applicable as an SVM but designed to also work on
    sparse data. The prediction for one feature vector x is

      y_hat = w0 + <w, x> + sum_{i<j} <V_i, V_j> x_i x_j

    Args:
      train: training feature matrix, one row per sample ((80000, 5990) in this notebook).
      y_train: training targets, one per training row.
      y_test: test targets, one per test row.
      num_user: number of users (stored only).
      num_movie: number of movies (stored only).
      test: test feature matrix with the same number of columns as `train`.
      k: dimension of the factor vectors.
      learning_rate: SGD learning rate used by `fit`.
      epochs: number of full-batch epochs run by `fit`.
      device: device that holds the data and the parameters.
      init_std: standard deviation of the normal initialization of w0, w and V
        (1.0 reproduces the previous standard-normal initialization).

    self.n: # of features
    self.k: dimension
    '''
    super(FM, self).__init__()

    self.train_features = self._as_float_tensor(train, device)   # (80000, 5990)
    self.test_features = self._as_float_tensor(test, device)     # (20000, 5990)
    self.y_train = self._as_float_tensor(y_train, device)        # (80000,)
    self.y_test = self._as_float_tensor(y_test, device)          # (20000,)

    self.num_user = num_user
    # Previously hard-coded to 1682, which silently ignored the argument.
    self.num_movie = num_movie

    self.n = self.train_features.shape[1]
    self.num_features_train = self.train_features.shape[0]
    self.num_features_test = self.test_features.shape[0]

    # dimension
    self.k = k

    # parameters w0, w_i, v_{i,f}; created on `device` so they live next to the data
    # even if the caller forgets `.to(device)`.
    self.w0 = nn.Parameter(init.normal_(torch.empty(1, device=device), std=init_std))  # bias (1,)
    self.w = nn.Parameter(init.normal_(torch.empty(self.n, device=device), std=init_std))  # (n,)
    self.V = nn.Parameter(init.normal_(torch.empty(self.n, self.k, device=device), std=init_std))  # (n, k)

    self.lr = learning_rate
    self.epoch = epochs
    self.criterion = nn.MSELoss()

  @staticmethod
  def _as_float_tensor(data, device):
    '''Convert an array, pandas object or tensor to a float32 tensor on `device`.'''
    if not torch.is_tensor(data):
      # Go through NumPy so pandas Series/DataFrames are converted in one step.
      data = np.asarray(data)
    return torch.as_tensor(data, dtype=torch.float32, device=device)

  def forward(self, x):
    '''
    Predict ratings for one sample (n,) or a batch (batch, n).

    Returns a tensor of shape (1,) for a single sample and (batch,) for a batch.
    '''
    # Linear-time form of the pairwise term:
    #   0.5 * sum_f [ (sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2 ]
    # The square is taken per factor BEFORE summing over factors, and the second
    # term uses x^2 and V^2 (not the square of x @ V).
    xv = torch.matmul(x, self.V)                     # (..., k)
    x2v2 = torch.matmul(x ** 2, self.V ** 2)         # (..., k)
    interactions = 0.5 * torch.sum(xv ** 2 - x2v2, dim=-1)
    y_hat = self.w0 + torch.matmul(x, self.w) + interactions
    return y_hat

  def loss(self, train = True):
    '''
    Sum of squared errors over the whole training set (train=True) or test set.

    The full feature matrix goes through `forward` in one call instead of one
    Python-level call per row.
    '''
    if train:
      features, targets = self.train_features, self.y_train
    else:
      features, targets = self.test_features, self.y_test

    y_hat = self.forward(features)
    return torch.sum((y_hat - targets.reshape(-1)) ** 2)

  def fit(self):
    '''
    Train with full-batch SGD for `self.epoch` epochs.

    Returns:
      (train_loss_list, test_loss_list): per-epoch train and test RMSE as Python floats.
    '''
    train_loss_list = []
    test_loss_list = []
    self.optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)

    for epoch in range(self.epoch):
      self.train()
      # NOTE(review): the optimized objective is the SUM of squared errors, so the
      # gradient grows with the number of rows; the learning rate may need to be
      # reduced accordingly - confirm.
      train_loss = self.loss()
      self.optimizer.zero_grad()
      train_loss.backward()
      self.optimizer.step()

      # Store plain floats: keeping the tensors would keep every epoch's autograd
      # graph alive and cannot be plotted directly.
      rmse_loss = torch.sqrt(train_loss.detach() / self.num_features_train).item()
      train_loss_list.append(rmse_loss)

      with torch.no_grad():
        test_loss = self.loss(train = False)
        rmse_test_loss = torch.sqrt(test_loss / self.num_features_test).item()
      test_loss_list.append(rmse_test_loss)

      print(f'Epoch [{epoch}/{self.epoch}], train rmse: {rmse_loss}, test_rmse: {rmse_test_loss}')
    return train_loss_list, test_loss_list


In [None]:
import matplotlib.pyplot as plt

model = FM(epochs = 10).to(device)

# Train the model
train_loss_list, test_loss_list = model.fit()

# fit() returns Python lists, which have no .cpu(); convert every entry to a float
# (works for plain numbers and for one-element tensors on any device) and plot those.
plt.plot([float(value) for value in train_loss_list], label="train RMSE")
plt.plot([float(value) for value in test_loss_list], label="test RMSE")
plt.xlabel("epoch")
plt.ylabel("RMSE")
plt.legend()
plt.show()

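Feeding the raw timestamp in directly made that feature's range far larger than the other inputs.

=> So the timestamp is now normalized (min-max scaled),

=> but initializing the parameters from torch.empty still let very large values in, because torch.empty returns uninitialized memory (switched to randn).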

 Open question: the model is not sequential, so is the timestamp feature needed at all?

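 +) Why a DataLoader matters:
 I assumed that training on one sample at a time was the way to save memory, but it is more efficient to go through a DataLoader and put only one batch in memory at a time.

 Written the way it is above, the code puts the entire training set in memory at once, which is very inefficient.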



 ## GPU issue - TODO: retrain later!

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Load the saved feature matrices as float32 tensors on `device`.
train_features = torch.tensor(np.load("train_features.npy"), dtype=torch.float32, device=device)
test_features = torch.tensor(np.load("test_features.npy"), dtype=torch.float32, device=device)
# Convert the rating columns to NumPy arrays first, so the result does not depend on
# how torch.tensor walks a pandas Series (index labels, per-element conversion).
y_train = torch.tensor(train_data["rating"].to_numpy(), dtype=torch.float32, device=device)
y_test = torch.tensor(test_data["rating"].to_numpy(), dtype=torch.float32, device=device)

batch_size=64
# Pair every feature row with its rating; only the training loader is shuffled.
train_dataset = TensorDataset(train_features, y_train)
test_dataset = TensorDataset(test_features, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)




In [None]:
import numpy as np
import torch.nn.init as init

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FM(nn.Module):
  def __init__(self, train = train_features, y_train=y_train, y_test = y_test, num_user =943, num_movie = 1682, test=test_features, k=30, learning_rate=1e-3, epochs=100, constrain = False, device=device, init_std=1.0):
    '''
    Factorization Machine (FM), degree = 2, trained with mini-batches.

    A model that is as generally applicable as an SVM but designed to also work on
    sparse data. The prediction for one feature vector x is

      y_hat = w0 + <w, x> + sum_{i<j} <V_i, V_j> x_i x_j

    Args:
      train: training feature matrix, one row per sample ((80000, 5990) in this notebook).
      y_train: training targets, one per training row.
      y_test: test targets, one per test row.
      num_user: number of users (stored only).
      num_movie: number of movies (stored only).
      test: test feature matrix with the same number of columns as `train`.
      k: dimension of the factor vectors.
      learning_rate: SGD learning rate used by `fit`.
      epochs: number of epochs run by `fit`.
      constrain: accepted for compatibility; currently unused.
      device: device that holds the data and the parameters.
      init_std: standard deviation of the normal initialization of w0, w and V
        (1.0 reproduces the previous standard-normal initialization).

    self.n: # of features
    self.k: dimension
    '''
    super(FM, self).__init__()

    self.train_features = self._as_float_tensor(train, device)   # (80000, 5990)
    self.test_features = self._as_float_tensor(test, device)     # (20000, 5990)
    self.y_train = self._as_float_tensor(y_train, device)        # (80000,)
    self.y_test = self._as_float_tensor(y_test, device)          # (20000,)

    self.num_user = num_user
    # Previously hard-coded to 1682, which silently ignored the argument.
    self.num_movie = num_movie

    self.n = self.train_features.shape[1]
    self.num_features_train = self.train_features.shape[0]
    self.num_features_test = self.test_features.shape[0]

    # dimension
    self.k = k

    # parameters w0, w_i, v_{i,f}; created on `device` so they live next to the data
    # even if the caller forgets `.to(device)`.
    self.w0 = nn.Parameter(init.normal_(torch.empty(1, device=device), std=init_std))  # bias (1,)
    self.w = nn.Parameter(init.normal_(torch.empty(self.n, device=device), std=init_std))  # (n,)
    self.V = nn.Parameter(init.normal_(torch.empty(self.n, self.k, device=device), std=init_std))  # (n, k)

    self.lr = learning_rate
    self.epoch = epochs
    self.criterion = nn.MSELoss()

  @staticmethod
  def _as_float_tensor(data, device):
    '''Convert an array, pandas object or tensor to a float32 tensor on `device`.'''
    if not torch.is_tensor(data):
      # Go through NumPy so pandas Series/DataFrames are converted in one step.
      data = np.asarray(data)
    # as_tensor reuses an existing tensor that already has the right dtype/device
    # instead of copying it again.
    return torch.as_tensor(data, dtype=torch.float32, device=device)

  def forward(self, x):
    '''
    Predict ratings for one sample (n,) or a batch (batch_size, n).

    Returns a tensor of shape (1,) for a single sample and (batch_size,) for a batch.
    '''
    # Linear-time form of the pairwise term:
    #   0.5 * sum_f [ (sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2 ]
    # The square is taken per factor BEFORE summing over factors, the second term
    # uses x^2 and V^2 (not the square of x @ V), and the sum runs over the factor
    # axis only so every row of a batch gets its own prediction.
    xv = torch.matmul(x, self.V)                     # (..., k)
    x2v2 = torch.matmul(x ** 2, self.V ** 2)         # (..., k)
    interactions = 0.5 * torch.sum(xv ** 2 - x2v2, dim=-1)
    y_hat = self.w0 + torch.matmul(x, self.w) + interactions

    return y_hat

  def loss(self, train = True, x =None, y= None):
    '''
    Sum of squared errors.

    Args:
      train: if True, score the given batch (x, y); otherwise score the stored test set.
      x: batch of feature rows (required when train is True).
      y: targets of that batch (required when train is True).

    Raises:
      ValueError: if train is True and x or y is missing.
    '''
    if train:
      if x is None or y is None:
        raise ValueError("loss(train=True) needs both x and y")
      features, targets = x, y
    else:
      features, targets = self.test_features, self.y_test

    # One vectorized forward pass per batch instead of one call per row.
    y_hat = self.forward(features)
    return torch.sum((y_hat - targets.reshape(-1)) ** 2)


  def fit(self, loader=None, batch_size=64):
    '''
    Train with mini-batch SGD for `self.epoch` epochs.

    Args:
      loader: iterable of (features, targets) batches. When omitted, a shuffled
        DataLoader over the model's own training tensors is built, so the method
        no longer depends on a notebook-global `train_loader`.
      batch_size: batch size of the loader built when `loader` is omitted.

    Returns:
      (train_loss_list, test_loss_list): per-epoch train and test RMSE as Python floats.
    '''
    from torch.utils.data import DataLoader, TensorDataset

    if loader is None:
      loader = DataLoader(TensorDataset(self.train_features, self.y_train), batch_size=batch_size, shuffle=True)

    train_loss_list = []
    test_loss_list = []
    self.optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
    param_device = self.w.device

    for epoch in range(self.epoch):
      self.train()
      squared_error_sum = 0.0
      num_seen = 0
      for batch_x, batch_y in loader:
        batch_x = batch_x.to(param_device)
        batch_y = batch_y.to(param_device)
        train_loss = self.loss( x=batch_x, y=batch_y)
        self.optimizer.zero_grad()
        train_loss.backward()
        self.optimizer.step()

        # Accumulate as floats so no autograd graph is kept alive.
        squared_error_sum += train_loss.item()
        num_seen += len(batch_y)

      # Epoch RMSE = sqrt(total squared error / samples seen). The old code divided
      # ONE batch's error by the size of the whole dataset, under-reporting it.
      rmse_loss = (squared_error_sum / max(num_seen, 1)) ** 0.5
      train_loss_list.append(rmse_loss)

      with torch.no_grad():
        test_loss = self.loss(train = False)
        rmse_test_loss = torch.sqrt(test_loss / self.num_features_test).item()
      test_loss_list.append(rmse_test_loss)

      if epoch % 20 == 0:
        print(f'Epoch [{epoch}/{self.epoch}], train rmse: {rmse_loss}, test_rmse: {rmse_test_loss}')
    return train_loss_list, test_loss_list


In [None]:
import matplotlib.pyplot as plt

# FM takes the feature/target tensors (its defaults), not DataLoaders: passing
# train_loader/test_loader positionally bound them to `train` and `y_train`.
model = FM().to(device)

# Train the model
train_loss_list, test_loss_list = model.fit()

# fit() returns Python lists, which have no .cpu(); convert every entry to a float
# (works for plain numbers and for one-element tensors on any device) and plot those.
plt.plot([float(value) for value in train_loss_list], label="train RMSE")
plt.plot([float(value) for value in test_loss_list], label="test RMSE")
plt.xlabel("logging step")
plt.ylabel("RMSE")
plt.legend()
plt.show()

  y_hat = self.w0 + torch.matmul(x, self.w.T) + interactions
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [0/100], train rmse: 0.06672803312540054, test_rmse: 2.456717014312744


KeyboardInterrupt: 