In [59]:
import os 
import numpy as np
import pandas as pd 

import torch 
import torch.nn as nn 
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [47]:
# Directory that holds the ml-100k data files read by this notebook.
dpath = '../ml-100k/'

def load_csv(dpath):
    """Read the tab-separated ``u.data`` ratings file located under ``dpath``.

    Parameters
    ----------
    dpath : str
        Directory containing ``u.data``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns named
        ``user_id``, ``item_id``, ``rating`` and ``timestamp``.
    """
    ratings_path = os.path.join(dpath, 'u.data')
    # The file has no header row, so the column names are attached afterwards.
    ratings = pd.read_csv(ratings_path, sep='\t', header=None)
    ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
    return ratings

df = load_csv(dpath)

# Re-index raw ids to contiguous 0-based integers, numbered in the order in
# which each id first appears in the ratings table.
user2idx = {raw_id: pos for pos, raw_id in enumerate(df['user_id'].unique())}
item2idx = {raw_id: pos for pos, raw_id in enumerate(df['item_id'].unique())}

df['user_id'] = df['user_id'].map(user2idx)
df['item_id'] = df['item_id'].map(item2idx)

# Disabled experiment, kept for reference: rescale every rating x to
# (x - 1) / (k - 1) with k = 5 through df['rating'].apply(...).

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,3,881250949
1,1,1,3,891717742
2,2,2,1,878887116
3,3,3,2,880606923
4,4,4,1,886397596


In [48]:
# Movie metadata: '|'-separated, no header row, latin-1 encoded.
movies_df = pd.read_csv(
    os.path.join(dpath, 'u.item'),
    sep='|',
    header=None,
    encoding='latin-1',
)
movies_df.columns = (
    ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
    # One indicator column per genre.
    + ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)

# User metadata: same separator and encoding, also without a header row.
users_df = pd.read_csv(
    os.path.join(dpath, 'u.user'),
    sep='|',
    encoding='latin-1',
    header=None,
)
users_df.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

In [49]:
# Show the user table as loaded (raw 1-based user_id, numeric age).
users_df

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [50]:
# Show the movie table as loaded (metadata columns followed by the genre flags).
movies_df

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
def age_map(x):
    """Return the decade bucket label for an age, clamped to '10'..'60'.

    ``x`` is converted with ``int`` first. Ages below 20 map to ``'10'``,
    ages of 60 and above map to ``'60'``, and everything in between maps
    to the decade it falls in (``'20'``, ``'30'``, ``'40'`` or ``'50'``).
    """
    decade = int(x) // 10
    # Clamp: everything under 20 lands in bucket 1, everything from 60 up in bucket 6.
    decade = min(max(decade, 1), 6)
    return str(decade * 10)

# Replace each age with its decade-bucket label (a string) from age_map.
users_df['age'] = users_df['age'].map(age_map)

In [52]:
# Count missing values per column of the movie table.
movies_df.isna().sum()

movie_id                 0
movie_title              0
release_date             1
video_release_date    1682
IMDb_URL                 3
unknown                  0
Action                   0
Adventure                0
Animation                0
Children                 0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64

In [53]:
# NOTE(review): this cell maps the raw ids through item2idx / user2idx, so it
# is meant to run exactly once after the loading cells; running it a second
# time would push already-mapped ids through the mapping again.

# Keep only movie_id and the genre flags, then switch to the 0-based item index.
movies_df = (
    movies_df
    .drop(columns=['movie_title', 'release_date', 'video_release_date', 'IMDb_URL'])
    .assign(movie_id=lambda frame: frame['movie_id'].map(item2idx))
)
users_df['user_id'] = users_df['user_id'].map(user2idx)

# Align the ratings key with the movie table before joining.
df = df.rename(columns={'item_id': 'movie_id'})

# Left joins keep every rating row and attach the movie and user attributes.
df = df.merge(movies_df, how='left', on='movie_id')
df = df.merge(users_df, how='left', on='user_id')
df

Unnamed: 0,user_id,movie_id,rating,timestamp,unknown,Action,Adventure,Animation,Children,Comedy,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,zip_code
0,0,0,3,881250949,0,0,0,0,0,1,...,0,0,0,0,0,0,40,M,writer,55105
1,1,1,3,891717742,0,0,0,0,0,0,...,1,0,0,1,0,0,30,F,executive,00000
2,2,2,1,878887116,0,0,0,0,1,1,...,0,0,0,0,0,0,20,M,writer,40206
3,3,3,2,880606923,0,0,0,0,0,0,...,0,1,0,0,1,1,20,M,technician,80525
4,4,4,1,886397596,0,0,0,0,0,0,...,0,0,0,0,0,0,40,M,educator,55113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,875,173,3,880175444,0,0,0,0,0,1,...,0,0,0,0,0,0,10,M,student,83702
99996,708,247,5,879795543,0,0,0,0,0,1,...,0,0,1,0,0,0,30,F,administrator,44265
99997,37,1004,1,874795795,0,0,0,0,0,0,...,0,0,0,1,0,0,20,M,student,95064
99998,58,443,2,882399156,0,0,0,0,1,1,...,0,0,0,0,0,0,40,M,educator,29206


In [54]:
# Drop the columns that are not used as model features.
df = df.drop(columns=['timestamp', 'zip_code'])

# Mark the id and user-attribute columns as categorical so that get_dummies
# one-hot encodes them; the genre flags and rating stay numeric.
df = df.astype({
    'user_id': 'category',
    'movie_id': 'category',
    'age': 'category',
    'gender': 'category',
    'occupation': 'category',
})

# Binarise the rating with integer division instead of a Python-level loop.
# For the 1-5 values shown in df.head() this gives 1 for ratings 4 and 5 and
# 0 otherwise, exactly like the previous int(i / 4).
df['rating'] = df['rating'] // 4

# Pin the indicator dtype: without it, newer pandas releases emit bool columns,
# which turn the mixed bool/int feature matrix into an object array that
# torch cannot convert further down.
fm_df = pd.get_dummies(df, dtype=np.uint8)
fm_df

Unnamed: 0,rating,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99996,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99998,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
class Factorization(nn.Module):
    """Pairwise-interaction term of a factorization machine.

    ``factor`` is a learnable ``(input_dim, output_dim)`` matrix holding one
    ``output_dim``-sized latent vector per input feature.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        latent = torch.randn(input_dim, output_dim)
        self.factor = nn.Parameter(latent)

    def forward(self, x):
        """Return one interaction score per row of ``x``.

        Computes ``0.5 * sum_f((x V)_f ** 2 - (x ** 2  V ** 2)_f)`` with
        ``V = self.factor``; the sum runs over dim 1, so a 2-D ``x`` of
        shape ``(batch, input_dim)`` yields a tensor of shape ``(batch,)``.
        """
        projected_then_squared = (x @ self.factor) ** 2
        squared_then_projected = (x ** 2) @ (self.factor ** 2)
        return 0.5 * (projected_then_squared - squared_then_projected).sum(dim=1)

In [119]:
class FM(nn.Module):
    """Factorization machine: a linear term plus a pairwise-interaction term.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    output_dim : int
        Size of the latent vector learned for each feature.
    """

    def __init__(self, input_dim, output_dim):
        super(FM, self).__init__()
        self.linear = nn.Linear(input_dim, 1, bias=True)
        self.factorization = Factorization(input_dim, output_dim)

        # Start both the linear weights and the latent factors close to zero.
        nn.init.normal_(self.linear.weight, std=0.01)
        nn.init.normal_(self.factorization.factor, std=0.01)

    def forward(self, x):
        """Return one score per row of ``x`` (shape ``(batch,)`` for 2-D input)."""
        # Squeeze only the trailing size-1 output dimension. A bare squeeze()
        # would also collapse a batch dimension of size 1 into a 0-d tensor and
        # rely on broadcasting to restore the shape.
        linear_term = self.linear(x).squeeze(-1)
        return linear_term + self.factorization(x)

In [120]:

def train(epochs, model, dataloader, criterion, optimizer, device):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    After every epoch the mean batch loss is printed. When all epochs have
    finished, the per-epoch losses are written to ``summary.csv`` in the
    current working directory.

    Parameters
    ----------
    epochs : int
        Number of passes over ``dataloader``.
    model : torch.nn.Module
        Model to optimise; it is put into training mode each epoch.
    dataloader : iterable of (x, y) batches
        Must support ``len()`` (used to average the loss).
    criterion : callable
        Loss function called as ``criterion(pred, y)``.
    optimizer : torch.optim.Optimizer
        Optimiser bound to the model's parameters.
    device : torch.device
        Device every batch is moved to before the forward pass.

    Returns
    -------
    pandas.DataFrame
        Columns ``Epoch`` and ``Loss``, one row per completed epoch
        (the same table that is saved to ``summary.csv``).
    """
    # Collect plain records and build the DataFrame once at the end, instead
    # of concatenating a new frame onto the summary every epoch.
    history = []

    for epoch in range(epochs):

        print(f'Epoch {epoch}')

        model.train()
        train_loss = 0.0
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Average over the number of batches, not the number of samples.
        train_loss /= len(dataloader)

        print(f'Epoch {epoch} | Loss: {train_loss}')

        history.append((epoch, train_loss))

    summary = pd.DataFrame(history, columns=['Epoch', 'Loss'])
    summary.to_csv('summary.csv', index=False)
    return summary
        


In [81]:
# Hold out 20% of the rows; features are every column except the 'rating' target.
train_X, test_X, train_y, test_y = train_test_split(
    fm_df.drop(columns='rating'),
    fm_df['rating'],
    test_size=0.2,
    random_state=42,
)

In [102]:
# Hyper-parameters and run configuration.
input_dim = fm_df.shape[1] - 1  # every column of fm_df except the 'rating' target
factor_dim = 10                 # size of each latent factor vector
batch_size = 64
lr = 0.0005
epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert through an explicit float32 array. A feature frame that mixes bool
# and integer columns would otherwise become an object array, which torch
# refuses to convert. The resulting tensors are float32, as before.
train_dataset_fm = TensorDataset(
    torch.tensor(np.asarray(train_X, dtype=np.float32)),
    torch.tensor(np.asarray(train_y, dtype=np.float32)),
)
test_dataset_fm = TensorDataset(
    torch.tensor(np.asarray(test_X, dtype=np.float32)),
    torch.tensor(np.asarray(test_y, dtype=np.float32)),
)

In [103]:
# Shuffle only the training batches. The held-out loader keeps a fixed order
# so that evaluation and batch inspection are repeatable between runs.
train_dataloader_fm = DataLoader(train_dataset_fm, batch_size=batch_size, shuffle=True)
test_dataloader_fm = DataLoader(test_dataset_fm, batch_size=batch_size, shuffle=False)

In [121]:
# Seed torch's global RNG so that the random parameter initialisation (and the
# subsequent DataLoader shuffling) is repeatable; 42 matches the random_state
# used for the train/test split.
torch.manual_seed(42)

model = FM(input_dim, factor_dim).to(device)
# NOTE(review): the targets were binarised to 0/1 earlier; MSE is kept as in
# the original setup - confirm whether a classification loss is intended.
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [122]:
# Fit the model on the training batches.
train(
    epochs=epochs,
    model=model,
    dataloader=train_dataloader_fm,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
)

Epoch 0
Epoch 0 | Loss: 0.22026978272199632
Epoch 1


KeyboardInterrupt: 

In [116]:
# Pull one held-out batch, move both tensors to the model's device and
# inspect the shape of the model output.
x, y = (tensor.to(device) for tensor in next(iter(test_dataloader_fm)))
model(x).shape

torch.Size([64, 64])

In [117]:
# Shape of the target batch, for comparison with the model output shape.
y.shape

torch.Size([64])