# Using BERT and FAISS for similarity search in descriptions as a recommendation engine

First we fine-tune a BERT model to get better embeddings. Then we compute an embedding for every description, and finally we measure the similarity of those embeddings using FAISS (Facebook AI Similarity Search).
This is a simple recommendation engine to help beginners start using BERT (one of the most powerful tools in NLP at the moment) for some tasks.

Credits to:
https://wandb.ai/cayush/bert-finetuning/reports/Sentence-Classification-With-Huggingface-BERT-and-W-B--Vmlldzo4MDMwNA


In [None]:
#!pip install transformers #if you are using google colab 

# 1.0 Fine-Tuning

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



# 1.1. Read Data and import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os 
import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig # model
from transformers import BertTokenizer # tokenizer
from keras.preprocessing.sequence import pad_sequences # add padding
from sklearn.model_selection import train_test_split # split dataset for train and test
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler # create data batches
from transformers import get_linear_schedule_with_warmup # schedule for training BERT (updating weights etc)
import time
import datetime
import random

# Switch into the dataset folder and load the Netflix catalogue.
# The CSV is read through a relative path, so the working directory matters.
os.chdir('../input/netflix-shows')
df = pd.read_csv('netflix_titles.csv')

# Training hyper-parameters.
batch_size, epochs = 32, 6
# Optimizer settings (step size and numerical-stability term).
learning_rate, epsilon = 2e-5, 1e-8

The fine-tuning task is very simple: based on the description, BERT must decide whether a movie is for adults only (1) or not (0).

In [None]:
# Flag adult-only titles: 1 when the rating is 'TV-MA' or 'R', 0 otherwise.
# The flag is inserted as the third column of the frame.
adult_ratings = ['TV-MA', 'R']
df.insert(2, "for_adult", df.rating.isin(adult_ratings).astype('int64'))
df.head()


In [None]:
# Take the description texts out of the frame (the column is removed from df)
# and use the adult flag as the classification label.
descriptions = df.pop('description').values
labels = df['for_adult'].values


# 1.2. Load model and tokenizer

In [None]:
# load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# load model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states. We will need embeddings later.
)

# Choose the device BEFORE moving the model: calling model.cuda() unconditionally
# raises on a machine without a GPU and makes the CPU fallback unreachable.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Move the model to the selected device (GPU when available, CPU otherwise).
# Assigning the result keeps a notebook from echoing the whole model repr.
model = model.to(device)

# 1.3. Format inputs to match BERT expectations.

In [None]:
# Format the descriptions the way BERT expects them.
#  1. Tokenize every description, adding the '[CLS]' and '[SEP]' special tokens.
input_ids = [
    tokenizer.encode(description, add_special_tokens=True)
    for description in descriptions
]

#  2. Every input must have the same length, so shorter inputs are padded at
#     the end with token id 0 up to the length of the longest description.
MAX_LEN = max(len(ids) for ids in input_ids)

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

#  3. Attention masks: 1 for a real token (id > 0), 0 for padding (id == 0).
#     Computed in one vectorized step on the padded array.
attention_masks = (input_ids > 0).astype("int64")

#  4. Split inputs, masks and labels in ONE call so the three arrays are
#     guaranteed to stay row-aligned (no reliance on repeating random_state).
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=44, test_size=0.1)

# Convert inputs and outputs into PyTorch tensors (the model is a PyTorch module).
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Create the DataLoader for our training set (batches drawn in random order).
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set (batches drawn in fixed order).
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# 1.4. Define Optimizer, scheduler and acc function.

In [None]:
# Optimizer over all model weights, using the learning rate and epsilon set above.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Total number of optimizer steps = batches per epoch * number of epochs.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * epochs

# Learning-rate schedule with no warm-up steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals its label.

    `preds` is a 2-D array of per-class scores (one row per example);
    `labels` is an array of class ids, flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    matches = predicted == expected
    return np.sum(matches) / len(expected)

def format_time(elapsed):
    """Format a duration given in seconds as an h:mm:ss string.

    The duration is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)



# 1.6. Training of BERT

In [None]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Seed Python, NumPy and PyTorch (CPU and all GPUs) to make the run reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch, kept so it can be plotted afterwards.
loss_values = []

# For each epoch: one full training pass, then one validation pass.
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch's training pass.
    t0 = time.time()

    # Running sum of the per-batch losses for this epoch.
    total_loss = 0

    # Put the model into training mode. `train` only changes the *mode*
    # (e.g. dropout is active); it does not perform any training itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches (skipping step 0).
        if step % 40 == 0 and not step == 0:
            # Elapsed wall-clock time, formatted as h:mm:ss by format_time.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack the batch and copy each tensor to the selected device.
        # `batch` holds three tensors, in the order the TensorDataset was built:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch accumulates
        # gradients across backward passes unless they are reset.
        model.zero_grad()

        # Forward pass. Because `labels` are supplied, the model output
        # includes the loss as its first element.
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # The output is indexed positionally; element 0 is the loss here.
        loss = outputs[0]

        # `loss` is a single-value tensor; `.item()` extracts the Python number
        # so the epoch average can be computed later.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the parameter update from the computed gradients.
        optimizer.step()

        # Advance the learning-rate schedule (called once per batch).
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure accuracy on the validation set.

    print("")
    print("Running Validation...")

    # Restart the timer for the validation pass.
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables.
    # NOTE(review): eval_loss and nb_eval_examples are initialised but never
    # updated or read in this loop.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate the whole validation set once.
    for batch in validation_dataloader:

        # Copy every tensor of the batch to the selected device.
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; skipping them saves memory
        # and time.
        with torch.no_grad():

            # Forward pass without labels, so no loss is computed and the
            # first element of the output is the logits.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The "logits" are the raw class scores, prior to any softmax.
        logits = outputs[0]

        # Move logits and labels to CPU as NumPy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accuracy for this validation batch.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        # Accumulate the per-batch accuracies.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Reported accuracy is the mean of the per-batch accuracies, so every
    # batch carries the same weight regardless of its size.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

# 1.7. Loss training PLOT

In [None]:
import matplotlib.pyplot as plt
#% matplotlib inline
import seaborn as sns

# Seaborn theme: dark grid with fonts scaled up by 1.5.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Average training loss per epoch, drawn as a blue line with round markers.
plt.plot(loss_values, 'b-o')
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

# 2.0 Creating the recommendation engine

# 2.1. Install and import libraries

In [None]:
# FAISS (Facebook AI Similarity Search) provides the k-nearest-neighbour search
# used below to compare description embeddings.
# NOTE(review): both `faiss` and `faiss-gpu` are installed here; presumably only
# one of the two is needed -- confirm which one before running outside Kaggle/Colab.
!pip install faiss # k-nn  to calculate semantic similarity (FACEBOOK AI SIMILARITY SEARCH)
!pip install faiss-gpu
# Reload the catalogue: the 'description' column was dropped from df during
# fine-tuning. The relative path depends on the earlier os.chdir call.
df = pd.read_csv('netflix_titles.csv')
import faiss # sentence similarity
import plotly.express as px #plots

# 2.2. Define function to get embeddings

In [None]:
def time_elapsed(sec):
  """Format a duration in seconds as 'h:mm:ss.ss'.

  Minutes wrap at 60, so 3700 seconds is rendered as '1:01:40.00'
  (previously the minutes field held the total minutes, giving '1:61:40.00').

  sec: non-negative duration in seconds (int or float).
  Returns the formatted string.
  """
  total_minutes, s = divmod(sec, 60)       # s keeps the fractional seconds
  h, m = divmod(int(total_minutes), 60)    # split whole minutes into hours + minutes
  return "{}:{:>02}:{:>05.2f}".format(h,m,s)

def description_embedding(tokenizer,model,description,max_len=128):
  """Return the last-layer [CLS] embedding of one description as a NumPy vector.

  tokenizer:   tokenizer providing `encode(text, add_special_tokens=True)`.
  model:       sequence-classification model that was loaded with
               `output_hidden_states=True`; without it the output carries no
               hidden states and the lookup below raises IndexError.
  description: the text to embed.
  max_len:     number of token ids the input is padded / truncated to
               (default 128, the value previously hard-coded).

  The input tensors are moved to the module-level `device`.
  """
  # 1. Tokenize, adding the '[CLS]' and '[SEP]' special tokens.
  token_ids = tokenizer.encode(
                        description,
                        add_special_tokens = True,
                   )
  # 2. Pad with id 0 (or truncate) at the end to exactly max_len ids.
  #    pad_sequences expects a list of sequences, hence the wrap / [0].
  padded = pad_sequences([token_ids], maxlen=max_len, dtype="long",
                         truncating="post", padding="post")[0]
  # 3. Build a batch of one on the target device; the attention mask is 1 for
  #    real tokens (id > 0) and 0 for padding.
  input_ids = torch.tensor(padded).unsqueeze(0).to(device)
  attention_masks = (input_ids > 0).long()
  # Evaluation mode: dropout must be off when extracting embeddings.
  model.eval()
  with torch.no_grad():
    outputs = model(input_ids,
                    token_type_ids=None,
                    attention_mask=attention_masks)
  # Index the output positionally instead of tuple-unpacking it: this works
  # both for a plain tuple and for a dict-like ModelOutput (where unpacking
  # would yield key names rather than tensors). No labels are passed, so
  # element 0 is the logits and element 1 the per-layer hidden states.
  hidden_states = outputs[1]
  # Last layer, first (only) batch item, first token ([CLS]).
  emb_vec = hidden_states[-1][0][0]
  # Move to cpu
  return emb_vec.detach().cpu().numpy()

In [None]:
# Embed every description one by one, reporting progress along the way.
descriptions = df.description.values
n_descriptions = len(descriptions)
start_time = time.time()
embeddings = []
for i, desc in enumerate(descriptions):
  embedding = description_embedding(tokenizer,model,desc)
  embeddings.append(embedding)
  # Report every 1000th item (except the very first) and after the last one.
  at_checkpoint = i > 0 and i % 1000 == 0
  at_last_item = i == n_descriptions - 1
  if at_checkpoint or at_last_item:
    elapsed_time = time_elapsed(time.time()-start_time)
    print(f'Progress: {round(i/len(descriptions)*100,2)}%')
    print(f'Time elapsed: {elapsed_time}')
# Stack the per-description vectors into one 2-D array (one row per description).
embeddings = np.array(embeddings)


# 2.3. Initialize FAISS

In [None]:
# k-NN to calculate similarity
# Build a flat L2-distance FAISS index sized to the embedding dimension,
# clone it to the GPU, then add all description embeddings to it.
cpu_index = faiss.IndexFlatL2(embeddings.shape[1]) #embeddings.shape[1] is number of features in feature vector
co = faiss.GpuMultipleClonerOptions()
# NOTE(review): sharding is requested but only one GPU is used (ngpu=1 below),
# so this option presumably has no effect -- confirm.
co.shard = True
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co,ngpu=1)
gpu_index.add(embeddings) # adding dataset


# 2.4. Function to get recommendations

In [None]:
# Map every title to its row in df. Titles that occur more than once keep only
# their first row, so indices[title] is always a single value.
# (drop_duplicates() on this Series compared the row numbers, which are unique,
# and therefore never removed a duplicated title.)
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recomendations(title, k=10):
  """Return the k titles whose descriptions are closest to `title`'s.

  title: a title present in df['title']; an unknown title raises KeyError.
  k:     number of recommendations to return (default 10, as before).

  Returns (movie_titles, likehood): a list of titles ordered from nearest to
  farthest, and an array of matching scores scaled so the best one equals 1.
  """
  idx = indices[title]
  # reshape(1, -1) adapts to the embedding size instead of hard-coding 768.
  query = embeddings[idx].reshape(1, -1)
  # Ask for one extra neighbour because the query itself is part of the index.
  distances, movies = gpu_index.search(query, k=k + 1)
  distances, movies = distances[0], movies[0]

  # FAISS pads the result with -1 when fewer than k+1 neighbours exist.
  found = movies >= 0
  distances, movies = distances[found], movies[found]

  # Remove the query by row number rather than assuming it is the first hit:
  # a title with an identical description can tie with it at distance 0.
  keep = movies != idx
  if keep.sum() > k:
    # The query was not among the hits; drop the farthest one to return k.
    keep[-1] = False

  movie_titles = [df['title'].iloc[m] for m in movies[keep]]
  rec_distances = distances[keep]
  if rec_distances.size == 0:
    return movie_titles, rec_distances

  # Turn distances into scores in [0.1, 1], then rescale so the top score is 1.
  max_distance = distances.max()
  if max_distance > 0:
    likehood = 1 - rec_distances / max_distance * 0.9
  else:
    # Every hit is identical to the query; avoid dividing by zero.
    likehood = np.ones_like(rec_distances)
  likehood = likehood / likehood.max()
  return movie_titles,likehood

# 3.0. Choose your movie HERE!

In [None]:
# The title must match a value of df['title'] exactly: get_recomendations looks
# it up in the title -> row `indices` Series.
movie_you_watched = 'Transformers Prime'
# recomendations: list of recommended titles; likehood: their matching scores.
recomendations, likehood = get_recomendations(movie_you_watched)

# 3.1. Plot with recommendations

In [None]:
# Build the treemap data: exactly one catalogue row per recommended title, in
# the same order as `likehood`.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); head(1) keeps a
# single row when a title occurs more than once, so the frame and the score
# array always have the same length.
matching_rows = [df[df.title == t].head(1) for t in recomendations]
if matching_rows:
  df_temp = pd.concat(matching_rows)
else:
  # No recommendations: keep an empty frame with the catalogue's columns.
  df_temp = pd.DataFrame(columns = df.columns)
# Every tile hangs under the watched movie as its parent.
temp_tittle = [movie_you_watched] * len(df_temp)

plot_title = 'Recomendations after watching ' + movie_you_watched
fig = px.treemap(
    data_frame = df_temp,
    names = df_temp['title'],
    values = likehood,
    parents = temp_tittle,
    hover_name = df_temp['title'],
    hover_data=['director','release_year','rating','country','cast'],
    title = plot_title
)
fig.show()