In [1]:
import pandas as pd 
import os 
import scipy.sparse as sp
import numpy as np
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(os.path.join('../ml-100k/','u.data'), sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']
user2idx = {j:i for i,j in enumerate(df.user_id.unique())}
item2idx = {j:i for i,j in enumerate(df.item_id.unique())}

df['user_id'] = df['user_id'].map(user2idx)
df['item_id'] = df['item_id'].map(item2idx)

def preprocess_rating(x):
    if x == 0:
        return -1 
    else:
        return (x-1) / 4

df['rating'] = df['rating'].apply(preprocess_rating)

In [3]:
num_items = df.item_id.nunique()
num_users = df.user_id.nunique()

In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_user, train_item, train_rating = train_df.user_id.values, train_df.item_id.values, train_df.rating.values
test_user, test_item, test_rating = test_df.user_id.values, test_df.item_id.values, test_df.rating.values


In [5]:
class AutoRecDataset(Dataset):
    def __init__(self, user_list, item_list, rating_list, num_user, num_item, is_item=True):
        super(AutoRecDataset, self).__init__()
        self.is_item = is_item
        self.user_list = user_list
        self.item_list = item_list
        self.rating_list = rating_list
        self.num_user = num_user
        self.num_item = num_item
        
        self.make_mat()
        
    def make_mat(self):
        if self.is_item==True:
            self.matrix = torch.zeros(self.num_item, self.num_user)
            for user, item, rating in zip(self.user_list, self.item_list, self.rating_list):
                self.matrix[item, user] = rating
        else:
            self.matrix = torch.zeros(self.num_user, self.num_item)
            for user, item, rating in zip(self.user_list, self.item_list, self.rating_list):
                self.matrix[user, item] = rating
    
    def __len__(self):
        if self.is_item==True:
            return self.num_item
        else:
            return self.num_user
    
    def __getitem__(self, idx):
        return self.matrix[idx]
        
        

In [6]:
train_dataset = AutoRecDataset(train_user, train_item, train_rating, num_users, num_items, True)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataset = AutoRecDataset(test_user, test_item, test_rating, num_users, num_items, True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
class AutoRec(nn.Module):
    def __init__(self, num_users, num_items, hidden_dim, is_item = True):
        super(AutoRec, self).__init__()
        if is_item == True:
            self.hidden_dim = [num_users] + hidden_dim
        else:
            self.hidden_dim = [num_items] + hidden_dim
        self.encoder = nn.ModuleList([nn.Linear(self.hidden_dim[idx], self.hidden_dim[idx+1])
                                        for idx in range(len(self.hidden_dim)-1)])
        self.decoder = nn.ModuleList([nn.Linear(self.hidden_dim[idx], self.hidden_dim[idx-1])
                                        for idx in range(len(self.hidden_dim)-1, 0, -1)])
        self.init_weights()
        
    def init_weights(self):
        for layer in self.encoder:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
        for layer in self.decoder:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
            
    def forward(self, x):
        for layer in self.encoder:
            x = F.relu(layer(x))
        for idx in range(len(self.decoder)):
            if idx == len(self.decoder) -1:
                x = torch.sigmoid(self.decoder[idx](x))
            else:
                x = F.relu(self.decoder[idx](x))
        return x

In [10]:
model = AutoRec(num_users, num_items, [64], True)

In [11]:
lr = 0.001 
reg = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
def RMSELoss(x, xhat):
    return torch.sqrt(torch.mean((x-xhat)**2))

criterion = RMSELoss

In [12]:
model = model.to(device)

In [13]:
def train(model, train_loader, test_loader, criterion, optimizer, device):
    summary = pd.DataFrame(columns=['epoch', 'train_loss', 'test_loss'])
    
    for epoch in range(100):
        model.train()
        
        for batch in train_loader:
            batch = batch.to(device)
            mask = batch >= 0 
            neg = batch == -1 
            batch[neg] = 0.5 
            
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output[mask], batch[mask])
            
            loss.backward()
            optimizer.step()
            
            train_loss = loss.item()
            
        train_loss = train_loss / len(train_loader)
        
        with torch.no_grad():
            model.eval()
            
            for batch in test_loader:
                batch = batch.to(device)
                mask = batch >= 0 
                
                output = model(batch)
                loss = criterion(output[mask], batch[mask])
                
                test_loss = loss.item()
                
            test_loss = test_loss / len(test_loader)
            
        summary = pd.concat([summary, pd.DataFrame([[epoch, train_loss, test_loss]], columns=['epoch', 'train_loss', 'test_loss'])])
        
    return summary

In [14]:
train(model, train_loader, test_loader, criterion, optimizer, device)

Unnamed: 0,epoch,train_loss,test_loss
0,0,0.070065,0.071147
0,1,0.069343,0.070849
0,2,0.068138,0.070471
0,3,0.065774,0.069925
0,4,0.061977,0.069121
...,...,...,...
0,95,0.025813,0.023275
0,96,0.023809,0.023321
0,97,0.027281,0.023006
0,98,0.025595,0.022588
