In [1]:
# Put the parent of the current working directory on sys.path (once) so that
# modules living next to the notebook's folder can be imported.
import os, sys

dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if dir1 not in sys.path:
    sys.path.append(dir1)

In [2]:
import torch
import numpy as np

# Fetch the pretrained BERT tokenizer and model through torch.hub, then write
# each of them to a local directory with `save_pretrained`.
hub_repo = 'huggingface/pytorch-transformers'
checkpoint = 'bert-base-uncased'

tokenizer = torch.hub.load(hub_repo, 'tokenizer', checkpoint)  # Download vocabulary and cache.
tokenizer.save_pretrained('../transformers/BERT/tokenizer/')

model = torch.hub.load(hub_repo, 'model', checkpoint)  # Download model and configuration and cache.
model.save_pretrained('../transformers/BERT/model/')

Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main
  from .autonotebook import tqdm as notebook_tqdm
Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main


In [3]:
# Load the tokenizer and model again, this time from the local directories
# passed as the checkpoint argument.
hub_repo = 'huggingface/pytorch-transformers'
tokenizer = torch.hub.load(hub_repo, 'tokenizer', '../transformers/BERT/tokenizer/')
model = torch.hub.load(hub_repo, 'model', '../transformers/BERT/model/')

Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main
Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main


In [4]:
# Display the hidden size stored in the loaded model's configuration.
model.config.hidden_size

768

In [5]:
# Turn off gradient tracking on every parameter of the loaded model.
for weight in model.parameters():
    weight.requires_grad = False

In [6]:
from transformers import BatchEncoding

# Tokenize two sample sentences in a single call, padded/truncated to 16
# tokens, and show the type of the object the tokenizer returns.
text1 = "I really like the movie Titanic."
text2 = "I really like the book the Da Vinci Code."
sample_batch = [text1, text2]
encoded_inputs = tokenizer(
    sample_batch,
    return_tensors='pt',
    max_length=16,
    padding='max_length',
    truncation=True,
)
type(encoded_inputs)

transformers.tokenization_utils_base.BatchEncoding

In [7]:
# Tokenize each sentence separately with identical settings, then rebuild a
# batch by concatenating the per-sentence tensors along dim 0 and wrapping the
# result in a BatchEncoding.
tokenizer_options = dict(return_tensors='pt', max_length=16, padding='max_length', truncation=True)
encoded_input1 = tokenizer(text1, **tokenizer_options)
encoded_input2 = tokenizer(text2, **tokenizer_options)

encoded_inputs_list = [encoded_input1, encoded_input2]

input_ids = torch.cat([enc['input_ids'] for enc in encoded_inputs_list], dim=0)
attention_mask = torch.cat([enc['attention_mask'] for enc in encoded_inputs_list], dim=0)
token_type_ids = torch.cat([enc['token_type_ids'] for enc in encoded_inputs_list], dim=0)

encoded_inputs = BatchEncoding({
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'token_type_ids': token_type_ids,
})
# The shape of the concatenated ids is the cell's displayed value.
encoded_inputs.input_ids.shape

torch.Size([2, 16])

In [8]:
# Use the first CUDA device when one is available, otherwise the CPU, then
# move the model and the encoded batch there and run one forward pass.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

model = model.to(device)
encoded_inputs = encoded_inputs.to(device)
output = model(**encoded_inputs)
# The pooled output is the cell's displayed value.
output.pooler_output


Using cuda:0 device


tensor([[-0.9141, -0.4043, -0.8202,  ..., -0.7174, -0.6313,  0.9061],
        [-0.9036, -0.3730, -0.6548,  ..., -0.5355, -0.6265,  0.9074]],
       device='cuda:0')

In [9]:
# Draw three random blocks with 256 rows each (768, 8 and 32 columns) and
# join them column-wise into one 256 x 808 tensor.
n_rows = 256
v1 = torch.randn(n_rows, 1, 768).squeeze(1)
v2 = torch.randn(n_rows, 8)
v3 = torch.randn(n_rows, 32)

v = torch.cat([v1, v2, v3], dim=-1)
v

tensor([[ 1.2100e+00,  1.7972e+00,  8.3262e-01,  ...,  1.3088e+00,
          8.3779e-01, -1.3769e+00],
        [ 7.6178e-02,  5.9780e-01, -4.7973e-02,  ...,  2.5567e-01,
         -8.1189e-01,  7.5764e-02],
        [ 2.5063e-02,  7.6688e-01, -6.1524e-01,  ..., -9.7178e-01,
         -1.1545e-01, -1.4916e-01],
        ...,
        [-2.0185e+00, -1.0795e+00, -3.1225e-01,  ...,  1.3692e-03,
         -6.8215e-01, -1.3954e-01],
        [-1.0053e+00, -7.6392e-01,  3.1799e-01,  ..., -7.2641e-03,
         -8.7118e-02, -3.6104e-01],
        [-8.4138e-01, -8.0648e-01,  8.8313e-02,  ..., -1.9819e+00,
         -7.0850e-01, -6.3064e-03]])

In [10]:
import os
import time
import random
import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from metrics.metrics import metrics
from architectures.BertMF.bert_mf import BertMF
from loaders.create_dataloader import CreateDataloader
from tqdm import tqdm
import pickle
os.environ["TOKENIZERS_PARALLELISM"] = "true"

def tokenize(text, tokenizer, max_length):
    """
    Run `tokenizer` on `text`, padding/truncating to `max_length`, and keep
    only three fields of its output.

    Returns a plain dict with the keys 'input_ids', 'attention_mask' and
    'token_type_ids', each taken unchanged from the tokenizer's result.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    wanted_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    return {key: encoded[key] for key in wanted_keys}


def create_text_df(tokenizer, max_length, texts_path='../pickles/course_texts.pkl'):
    """
    Build a dataframe of tokenized course texts from a pickled dictionary.

    Args:
        tokenizer: tokenizer forwarded to `tokenize`.
        max_length: padding/truncation length forwarded to `tokenize`.
        texts_path: path of the pickle file to read. Defaults to the location
            this notebook has always read from.

    Returns:
        A dataframe with an integer `course_id` column (the part of each
        dictionary key after its last underscore) and a `tokenization` column
        holding the result of `tokenize` for that entry's text.
    """
    # NOTE: unpickling can execute arbitrary code; only read files from a
    # trusted source.
    # The context manager closes the file as soon as loading is finished.
    with open(texts_path, 'rb') as handle:
        course_texts = pickle.load(handle)

    text_df = pd.DataFrame.from_dict(course_texts, orient='index').reset_index()
    text_df.columns = ['course_id', 'text']
    # Keys look like '<prefix>_<number>'; keep the trailing number as an int.
    text_df['course_id'] = text_df['course_id'].apply(lambda x: x.split('_')[-1]).astype(int)
    text_df['tokenization'] = text_df['text'].astype(str).apply(
        lambda x: tokenize(x, tokenizer, max_length)
    )
    return text_df.drop(columns=['text'])

def _reindex(ratings):
    """
    Replace raw user/item identifiers with their mapped ids and binarize ratings.

    Reads the `user2id` and `item2id` lookup tables pickled under MAIN_PATH,
    substitutes the `user_id` and `item_id` columns of `ratings` with the
    looked-up values, and turns `rating` into 1.0 where it is greater than
    zero and 0.0 otherwise.

    The dataframe is modified in place and also returned. An identifier that
    is missing from its lookup table raises KeyError.
    """
    # NOTE: unpickling can execute arbitrary code; only read files from a
    # trusted source.
    # Context managers make sure both files are closed after loading.
    with open(MAIN_PATH + 'user2id.pkl', 'rb') as handle:
        user2id = pickle.load(handle)
    with open(MAIN_PATH + 'item2id.pkl', 'rb') as handle:
        item2id = pickle.load(handle)

    ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
    ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
    ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
    return ratings

# Dataset / model identifiers and the file-system locations derived from them.
DATASET_NAME = 'MOOCCubeX'
MODEL_NAME = 'BertMF'
TRAIN_DATASET_FILE = 'train.feather'
TEST_DATASET_FILE = 'test.feather'

MAIN_PATH = '../data/{}/'.format(DATASET_NAME)
TRAIN_DATA_PATH = f'{MAIN_PATH}{TRAIN_DATASET_FILE}'
TEST_DATA_PATH = f'{MAIN_PATH}{TEST_DATASET_FILE}'
MODEL_PATH = '../models/{}/'.format(DATASET_NAME)
MODEL = '-'.join((DATASET_NAME, MODEL_NAME))

def seed_everything(seed):
    """
    Seed the random number generators used in this notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), exports PYTHONHASHSEED, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can vary
    # between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False



# set device and parameters
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

# seed for Reproducibility
seed_everything(51)

tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    '../transformers/BERT/tokenizer/',
)

# load data
print(TRAIN_DATA_PATH)
train_rating_data = pd.read_feather(TRAIN_DATA_PATH)
test_rating_data = pd.read_feather(TEST_DATA_PATH)
print(train_rating_data.head())
print(train_rating_data.dtypes)

print('Begin Tokenization')
text_df = create_text_df(tokenizer=tokenizer, max_length=16)
print(text_df.head())
print(text_df.dtypes)

# Attach each course's tokenization to its rating rows; rows whose course has
# no entry in text_df are kept by the left join.
train_rating_data = train_rating_data.merge(text_df, how='left', on='course_id')
test_rating_data = test_rating_data.merge(text_df, how='left', on='course_id')

default_tokenization = tokenize('Description: ', tokenizer, 16)

print(default_tokenization)


def _default_if_float(entry):
    """Return the default tokenization for entries whose type is float, else the entry itself."""
    return default_tokenization if type(entry) is float else entry


# Entries of type float are presumably the NaN placeholders left by the left
# merge; they are swapped for the default tokenization.
train_rating_data['tokenization'] = train_rating_data['tokenization'].apply(_default_if_float)
test_rating_data['tokenization'] = test_rating_data['tokenization'].apply(_default_if_float)

Using cuda:0 device


Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main


../data/MOOCCubeX/train.feather
     id  course_id  rating
0  U_24     597214       1
1  U_24     605512       1
2  U_24     597211       1
3  U_24     597314       1
4  U_24     597208       1
id           object
course_id     int64
rating        int64
dtype: object
Begin Tokenization
   course_id                                       tokenization
0     584313  {'input_ids': [[tensor(101), tensor(8476), ten...
1     584329  {'input_ids': [[tensor(101), tensor(8476), ten...
2     584381  {'input_ids': [[tensor(101), tensor(8476), ten...
3     597208  {'input_ids': [[tensor(101), tensor(8476), ten...
4     597225  {'input_ids': [[tensor(101), tensor(6412), ten...
course_id        int64
tokenization    object
dtype: object
{'input_ids': tensor([[ 101, 6412, 1024,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
# Swap entries whose type is float (presumably NaN left by the left merge)
# for the default tokenization, then display the frame.
train_rating_data['tokenization'] = train_rating_data['tokenization'].apply(
    lambda entry: default_tokenization if type(entry) is float else entry
)
train_rating_data

Unnamed: 0,id,course_id,rating,tokenization
0,U_24,597214,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
1,U_24,605512,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
2,U_24,597211,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
3,U_24,597314,1,"{'input_ids': [[tensor(101), tensor(8476), ten..."
4,U_24,597208,1,"{'input_ids': [[tensor(101), tensor(8476), ten..."
...,...,...,...,...
6683569,U_34712108,782490,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
6683570,U_34712115,883345,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
6683571,U_34712115,770738,1,"{'input_ids': [[tensor(101), tensor(6412), ten..."
6683572,U_34712115,676937,1,"{'input_ids': [[tensor(101), tensor(8476), ten..."


In [12]:
print(train_rating_data.head())

# Give both splits the column names the rest of the pipeline uses.
column_renames = {'id': 'user_id', 'course_id': 'item_id'}
train_rating_data = train_rating_data.rename(columns=column_renames)
test_rating_data = test_rating_data.rename(columns=column_renames)

ratings = pd.concat([train_rating_data, test_rating_data], ignore_index=True)

# Build the per-item tokenization table only when neither cache file exists.
# NOTE(review): if exactly one of the two files is present, `tokenizations`
# stays None - confirm that this is intended.
# NOTE(review): the literals 100 and 4 presumably mirror num_ng_test / num_ng
# passed to Args further down - verify they stay in sync.
cache_files = (
    f'{MAIN_PATH}/test_tokenizations_{100}.pkl',
    f'{MAIN_PATH}/train_tokenizations_{4}.pkl',
)
if any(os.path.exists(cache_file) for cache_file in cache_files):
    tokenizations = None
else:
    tokenizations = (
        ratings[['item_id', 'tokenization']]
        .drop_duplicates(subset=['item_id'])
        .set_index('item_id')
    )

# set the num_users, items
num_users = ratings['user_id'].nunique() + 1
num_items = ratings['item_id'].nunique() + 1

print(num_users, num_items)

# Map raw ids to their lookup-table values (done after the counts above, which
# are taken on the raw ids).
train_rating_data = _reindex(train_rating_data)
test_rating_data = _reindex(test_rating_data)


# construct the train and test datasets
class Args:
    """
    Attribute bag that stores every constructor argument under the same name.

    An instance is passed to CreateDataloader and BertMF in this notebook.
    """

    def __init__(self, num_ng, num_ng_test, batch_size, seed, factor_num, layers, dropout, lr, token_size, bert_path):
        self.num_ng, self.num_ng_test = num_ng, num_ng_test
        self.batch_size, self.seed = batch_size, seed
        self.factor_num, self.layers = factor_num, layers
        self.dropout, self.lr = dropout, lr
        self.token_size, self.bert_path = token_size, bert_path
# Settings for this run, spelled out by name; the last expression echoes the
# configured BERT path as the cell output.
args = Args(
    num_ng=4,
    num_ng_test=100,
    batch_size=256,
    seed=51,
    factor_num=32,
    layers=[64, 32, 16, 8],
    dropout=0.1,
    lr=0.001,
    token_size=768,
    bert_path='../transformers/BERT/model/',
)

args.bert_path

     id  course_id  rating                                       tokenization
0  U_24     597214       1  {'input_ids': [[tensor(101), tensor(6412), ten...
1  U_24     605512       1  {'input_ids': [[tensor(101), tensor(6412), ten...
2  U_24     597211       1  {'input_ids': [[tensor(101), tensor(6412), ten...
3  U_24     597314       1  {'input_ids': [[tensor(101), tensor(8476), ten...
4  U_24     597208       1  {'input_ids': [[tensor(101), tensor(8476), ten...
694530 4701


'../transformers/BERT/model/'

In [19]:


# Instantiate CreateDataloader and ask it for the training loader.
# NOTE(review): the name `data` shadows the `torch.utils.data as data` import
# made earlier in the notebook.
data = CreateDataloader(args, train_rating_data, test_rating_data, MAIN_PATH, True, tokenizations)
print('Create Train Data Loader')
train_loader = data.get_train_instance()

# set model and loss, optimizer
# (alternative kept from the original: model = torch.load('{}{}.pth'.format(MODEL_PATH, MODEL)))
model = BertMF(args, num_users, num_items).to(device)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Keep a single iterator over the training loader; its length is the cell's
# displayed value.
it = iter(train_loader)
len(it)

(6683574, 4) (1983584, 4) (8667158, 4)
Create Train Data Loader


Using cache found in /home/rctejon/.cache/torch/hub/huggingface_pytorch-transformers_main


130539

In [22]:

# Time individual optimisation steps on the batches remaining in `it`.
# Only the transfer/forward/backward/step part is measured; fetching the next
# batch happens outside the timed window.
start_time = time.time()

t = next(it, None)
while t is not None:
    start_time = time.time()

    # Move every element of the batch to the target device.
    user, item, label, tokenization = (part.to(device) for part in t)

    optimizer.zero_grad()
    prediction = model(user, item, tokenization)
    loss = loss_function(prediction, label)
    loss.backward()
    optimizer.step()

    end_time = time.time()
    elapsed = end_time - start_time
    t = next(it, None)
    print(f"Time: {elapsed} seconds")

Time: 0.007116079330444336 seconds
Time: 0.1030416488647461 seconds
Time: 0.10625696182250977 seconds
Time: 0.11337614059448242 seconds
Time: 0.11348271369934082 seconds
Time: 0.11427831649780273 seconds
Time: 0.1149754524230957 seconds
Time: 0.12044954299926758 seconds
Time: 0.11875534057617188 seconds
Time: 0.11550736427307129 seconds
Time: 0.11506247520446777 seconds
Time: 0.11113834381103516 seconds
Time: 0.11017680168151855 seconds
Time: 0.11149883270263672 seconds
Time: 0.10703921318054199 seconds
Time: 0.11527228355407715 seconds
Time: 0.11738753318786621 seconds
Time: 0.11019325256347656 seconds
Time: 0.1103053092956543 seconds
Time: 0.11354207992553711 seconds
Time: 0.1093144416809082 seconds
Time: 0.11489200592041016 seconds
Time: 0.11279964447021484 seconds
Time: 0.11248922348022461 seconds
Time: 0.12108135223388672 seconds
Time: 0.1065976619720459 seconds
Time: 0.11140894889831543 seconds
Time: 0.10970830917358398 seconds
Time: 0.11317229270935059 seconds
Time: 0.1052350997

KeyboardInterrupt: 

In [18]:

# train, evaluation
best_hr = 0
for epoch in range(1, 1+1):
    model.train() # Enable dropout (if have).
    start_time = time.time()

    for user, item, label, tokenization in tqdm(train_loader):
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)
        tokenization = tokenization.to(device)

        optimizer.zero_grad()
        prediction = model(user, item, tokenization)
        loss = loss_function(prediction, label)
        loss.backward()
        optimizer.step()
        # NOTE(review): this pause adds 0.03 s to every batch - confirm it is
        # still wanted before running a full epoch.
        time.sleep(0.03)

    # `loss` is the loss of the last batch only, not an epoch average.
    print('Epoch: {}, Loss: {:.4f}'.format(epoch, loss.item()))
    print('epoch time: {:.4f}s'.format(time.time()-start_time))

    model.eval()

# makedirs also creates missing parent directories and, with exist_ok=True,
# does not fail when the directory is already there (os.mkdir would raise if
# the parent folder did not exist).
os.makedirs(MODEL_PATH, exist_ok=True)
torch.save(model,
    '{}{}.pth'.format(MODEL_PATH, MODEL))

print('Train done')

  0%|          | 0/130539 [00:00<?, ?it/s]

  0%|          | 163/130539 [00:27<6:08:51,  5.89it/s]


KeyboardInterrupt: 