In [80]:
import sys
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn import metrics
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/gdrive/')
file_path = "/content/gdrive/MyDrive/data/movie_len/"

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# FM implementation using pytorch

In [9]:
users = pd.read_table(file_path +'users.dat', sep = '::', header=None, engine='python', encoding='utf-8').rename(columns={0: 'user_id', 1: 'gender', 2: 'age', 3: 'occupation', 4: 'zipcode'})
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [22]:
ratings = pd.read_table(file_path +'ratings.dat', sep='::', header=None, engine='python', encoding='utf-8').drop(columns=[3]).rename(columns={0: 'user_id', 1: 'movie_id', 2: 'ratings'})
ratings.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [18]:
movies = pd.read_table(file_path +'movies.dat', sep='::', header=None, engine='python', encoding='latin-1').rename(columns={0: 'movie_id', 1: 'title', 2: 'genres'})
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [66]:
df = pd.merge(users, ratings, on=['user_id'])
df.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,movie_id,ratings
0,1,F,1,10,48067,1193,5
1,1,F,1,10,48067,661,3
2,1,F,1,10,48067,914,3
3,1,F,1,10,48067,3408,4
4,1,F,1,10,48067,2355,5


In [67]:
# perform label encoding
features_to_encoding = ['user_id', 'gender', 'occupation', 'movie_id', 'age']
le = LabelEncoder()
for feature in features_to_encoding:
  df[feature] = le.fit_transform(df[feature])

In [68]:
df = df.drop(columns=['zipcode'])
features = ['user_id', 'movie_id', 'age', 'gender', 'occupation']
labels = df['ratings']
df = df[features]
df.head()

Unnamed: 0,user_id,movie_id,age,gender,occupation
0,0,1104,0,0,10
1,0,639,0,0,10
2,0,853,0,0,10
3,0,3177,0,0,10
4,0,2162,0,0,10


### The numbers in the matrix represent the feature value index, since I will use torch.embedding to get vector representation of each feature

In [69]:
feature_offset = {}
next_offset = 0
for feature in features:
  feature_offset[feature] = next_offset
  next_offset += len(df[feature].unique())
  
feature_offset

{'user_id': 0,
 'movie_id': 6040,
 'age': 9746,
 'gender': 9753,
 'occupation': 9755}

In [70]:
for feature in features:
  df[feature] = df[feature] + feature_offset[feature]
df.head()

Unnamed: 0,user_id,movie_id,age,gender,occupation
0,0,7144,9746,9753,9765
1,0,6679,9746,9753,9765
2,0,6893,9746,9753,9765
3,0,9217,9746,9753,9765
4,0,8202,9746,9753,9765


In [71]:
train_x, test_x, train_y, test_y = train_test_split(df[features], labels, test_size=0.2, random_state=42)
train_x = torch.tensor(train_x.values)
train_y = torch.tensor(train_y.values).float()

In [72]:
class MovieDataset(Dataset):
  def __init__(self, train_set, target):
    self.train_set = train_set
    self.target = target

  def __len__(self):
    return len(self.target)

  def __getitem__(self, idx):
    x = self.train_set[idx,:]
    y = self.target[idx]
    return x, y

train_set = MovieDataset(train_x, train_y)
train_dataloader = DataLoader(train_set, batch_size = 1024 , shuffle=True)

# Model

In [73]:
# copied from fastai: 
def trunc_normal_(x, mean=0., std=1.):
    "Truncated normal initialization."
    return x.normal_().fmod_(2).mul_(std).add_(mean)

In [78]:
class FMModel(nn.Module):
  def __init__(self, n, k):
    super().__init__()
    self.w0 = nn.Parameter(torch.zeros(1))
    self.bias = nn.Embedding(n, 1)
    self.embeddings = nn.Embedding(n, k)

    # improve learning speed, see https://arxiv.org/abs/1711.09160
    with torch.no_grad(): trunc_normal_(self.embeddings.weight, std=0.01)
    with torch.no_grad(): trunc_normal_(self.bias.weight, std=0.01)

  def forward(self, X):
    xw = self.embeddings(X)
    pow_of_sum = xw.sum(dim=1).pow(2)
    sum_of_pow = xw.pow(2).sum(dim=1)
    pairwise = (pow_of_sum-sum_of_pow).sum(1)*0.5
    bias = self.bias(X).squeeze().sum(1)
    return torch.sigmoid(self.w0 + bias + pairwise)

In [90]:
def fit(train_loader, model, optimizer, criterion):
  train_loss = 0
  for i, data in enumerate(train_loader, 0):
    inputs, target = data
    optimizer.zero_grad()
    outputs = model(inputs.to(device))
    loss = criterion(outputs, target.to(device))
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    if i % 1000 == 0:    # print every 1000 mini-batches
      print('loss: {}'.format(train_loss))
      train_loss = 0

In [91]:
model = FMModel(train_x.max() + 1, 120).to(device)
wd=1e-5
lr=0.001
epochs=10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[7], gamma=0.1)
criterion = nn.MSELoss().to(device)
for epoch in tqdm(range(epochs)):
  fit(train_dataloader, model, optimizer, criterion)
  scheduler.step()

  0%|          | 0/10 [00:00<?, ?it/s]

loss: 10.746367454528809


 10%|█         | 1/10 [00:11<01:39, 11.10s/it]

loss: 7.925200939178467


 20%|██        | 2/10 [00:22<01:28, 11.09s/it]

loss: 7.8492326736450195


 30%|███       | 3/10 [00:32<01:14, 10.68s/it]

loss: 8.365583419799805


 40%|████      | 4/10 [00:43<01:04, 10.68s/it]

loss: 7.880077362060547


 50%|█████     | 5/10 [00:54<00:54, 10.85s/it]

loss: 7.787215232849121


 60%|██████    | 6/10 [01:05<00:43, 10.94s/it]

loss: 7.935617446899414


 70%|███████   | 7/10 [01:17<00:33, 11.29s/it]

loss: 7.942439079284668


 80%|████████  | 8/10 [01:28<00:22, 11.14s/it]

loss: 7.744191646575928


 90%|█████████ | 9/10 [01:38<00:10, 10.81s/it]

loss: 7.977591514587402


100%|██████████| 10/10 [01:49<00:00, 10.93s/it]


In [99]:
prediction = model(torch.tensor(test_x.values).to(device))

In [103]:
prediction.shape

torch.Size([200042])