#AutoEncoders

##Downloading the dataset

###ML-100K

In [34]:
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!unzip ml-100k.zip
!ls

--2023-10-16 08:23:49--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2023-10-16 08:23:49 (11.2 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

Archive:  ml-100k.zip
replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: ml-100k  ml-100k.zip  ml-100k.zip.1  ml-1m  ml-1m.zip  sample_data


###ML-1M

In [35]:
!wget "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
!unzip ml-1m.zip
!ls

--2023-10-16 08:24:44--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.1’


2023-10-16 08:24:44 (13.4 MB/s) - ‘ml-1m.zip.1’ saved [5917549/5917549]

Archive:  ml-1m.zip
replace ml-1m/movies.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: ml-100k  ml-100k.zip  ml-100k.zip.1  ml-1m  ml-1m.zip  ml-1m.zip.1  sample_data


##Importing the libraries

In [54]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Importing the dataset


In [37]:
# We won't be using this dataset.
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

## Preparing the training set and the test set


In [55]:
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
training_set = np.array(training_set, dtype = 'int')
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
test_set = np.array(test_set, dtype = 'int')

## Getting the number of users and movies


In [56]:
nb_users = int(max(max(training_set[:, 0], ), max(test_set[:, 0])))
nb_movies = int(max(max(training_set[:, 1], ), max(test_set[:, 1])))

In [57]:
nb_users, nb_movies

(943, 1682)

## Converting the data into an array with users in lines and movies in columns


In [58]:
def convert(data):
  new_data = []
  # 1 user : n movie ratings 로 구조 변경
  for id_users in range(1, nb_users + 1):
    id_movies = data[:, 1] [data[:, 0] == id_users]
    id_ratings = data[:, 2] [data[:, 0] == id_users]
    # 평가하지 않은 영화 = 0
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data

In [59]:
training_set = convert(training_set)
test_set = convert(test_set)

## Converting the data into Torch tensors


In [60]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

## Creating the architecture of the Neural Network


In [61]:
# import torch.nn as nn 상속
class SAE(nn.Module):

  def __init__(self, ):
    super(SAE,self).__init__()

    # 층 생성
    self.fc1 = nn.Linear(nb_movies, 32) # 입력 노드 수, 첫번째 은닉층 노드 수(인코딩)
    self.fc2 = nn.Linear(32, 16)        # 첫번째 은닉층 노드 수, 두번째 은닉층 노드 수(인코딩)
    self.fc3 = nn.Linear(16, 32)        # 두번째 은닉층 노드 수, 세번째 은닉층 노드 수(디코딩)
    self.fc4 = nn.Linear(32, nb_movies) # 두번째 은닉층 노드 수, 출력 노드 수(디코딩)

    # 활성화 함수
    self.activation = nn.Sigmoid()

  # 순전파
  def forward(self, x): # x : 입력벡터
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)   # 출력층에는 활성화함수X
    return x


In [62]:
sae = SAE()

In [63]:
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

## Training the SAE (Stacked Auto Encoder)


In [64]:
# epoch 수 지정
nb_epoch = 200

for epoch in range(1, nb_epoch+1):
  # 오차 초기화
  train_loss = 0
  # 적어도 한 편 이상의 영화를 평가한 유저의 수
  s = 0.

  for id_user in range(nb_users):
    # from torch.autograd import Variable
    input = Variable(training_set[id_user]).unsqueeze(0) # unsqueeze : 더미 차원 생성
    # 수정 입력 벡터
    target = input.clone()

    # 메모리 최적화를 위해 적어도 한 편 이상의 영화를 평가한 유저만 사용
    if torch.sum(target.data > 0) > 0 :
      output = sae.forward(input)
      target.require_grad = False # 미분 계산x
      output[target == 0] = 0
      # 오차 계산
      loss = criterion(output, target)
      # 오차의 평균( + 1e-10 : 분모가 0인 것을 피하기 위함)
      mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
      loss.backward()
      train_loss += np.sqrt(loss.data * mean_corrector)
      s += 1.

      # optimizer 생성
      optimizer.step()

  print('epoch : '+str(epoch)+'  loss : '+str(train_loss/s))

epoch : 1  loss : tensor(1.6008)
epoch : 2  loss : tensor(1.0753)
epoch : 3  loss : tensor(1.0507)
epoch : 4  loss : tensor(1.0422)
epoch : 5  loss : tensor(1.0384)
epoch : 6  loss : tensor(1.0360)
epoch : 7  loss : tensor(1.0349)
epoch : 8  loss : tensor(1.0334)
epoch : 9  loss : tensor(1.0335)
epoch : 10  loss : tensor(1.0324)
epoch : 11  loss : tensor(1.0322)
epoch : 12  loss : tensor(1.0314)
epoch : 13  loss : tensor(1.0315)
epoch : 14  loss : tensor(1.0284)
epoch : 15  loss : tensor(1.0273)
epoch : 16  loss : tensor(1.0226)
epoch : 17  loss : tensor(1.0238)
epoch : 18  loss : tensor(1.0180)
epoch : 19  loss : tensor(1.0211)
epoch : 20  loss : tensor(1.0149)
epoch : 21  loss : tensor(1.0165)
epoch : 22  loss : tensor(1.0120)
epoch : 23  loss : tensor(1.0121)
epoch : 24  loss : tensor(1.0088)
epoch : 25  loss : tensor(1.0108)
epoch : 26  loss : tensor(1.0058)
epoch : 27  loss : tensor(1.0073)
epoch : 28  loss : tensor(1.0027)
epoch : 29  loss : tensor(1.0031)
epoch : 30  loss : tens

In [65]:
# 20->10->20 | epoch : 200  loss : tensor(0.9065)
# 32->16->32 | epoch : 200  loss : tensor(0.9046)

## Testing the SAE


In [68]:
test_loss = 0
s = 0.

for id_user in range(nb_users):
  input = Variable(training_set[id_user]).unsqueeze(0)
  target = Variable(test_set[id_user])

  if torch.sum(target.data > 0) > 0 :
    output = sae(input)[0]
    target.require_grad = False
    output[target == 0] = 0
    loss = criterion(output, target)
    mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
    test_loss += np.sqrt(loss.data * mean_corrector)
    s += 1.


print('loss : '+str(test_loss/s))

loss : tensor(0.9502)


In [67]:
# 20->10->20 | loss : tensor(0.9577)
# 32->16->32 | loss : tensor(0.9502)