In [11]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

# The tokenizer (vocabulary) and the encoder are loaded from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# output_hidden_states=True makes the model also return the hidden states of
# every layer; get_semantics() reads them from the model output.
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

def get_semantics(text, max_tokens=512):
    """Return one BERT sentence embedding for ``text``.

    The text is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    module-level ``tokenizer`` and run through the module-level ``model``.
    The sentence embedding is the mean, over all tokens, of the vectors in the
    second-to-last hidden layer.

    Args:
        text: The string to embed.
        max_tokens: Upper bound on the number of tokens (special tokens
            included) passed to the model. Longer inputs are cut and
            re-terminated with ``[SEP]``. The default of 512 matches the
            maximum sequence length of bert-base-uncased.

    Returns:
        A 1-D ``torch.Tensor`` with one value per hidden unit
        (768 for bert-base-uncased).
    """
    # Add the special tokens and split the sentence into word pieces.
    tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")

    # Guard the model's sequence limit; keep the closing [SEP] in place.
    if len(tokenized_text) > max_tokens:
        tokenized_text = tokenized_text[:max_tokens - 1] + ["[SEP]"]

    # Map the token strings to their vocabulary indices and add a batch axis.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Every position is a real token (no padding), so attend to all of them.
    # Passed by keyword: the second positional argument of BertModel.forward
    # is the attention mask, not the segment ids.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        # With output_hidden_states=True the third output holds the hidden
        # states of every layer, each shaped [batch, tokens, hidden].
        hidden_states = outputs[2]

    # Token vectors of the second-to-last layer for the single batch item.
    token_vecs = hidden_states[-2][0]

    # Average over the token axis to get a single sentence vector.
    return torch.mean(token_vecs, dim=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Load the synopsis dataset from the working directory.
SYNOPSIS_CSV = 'anime_with_synopsis.csv'
df = pd.read_csv(SYNOPSIS_CSV)

In [9]:
# Normalise the synopsis text: missing values become empty strings, then every
# character that is not a letter, digit, space, period, comma or apostrophe
# is stripped.
df['sypnopsis'] = (
    df['sypnopsis']
    .fillna('')
    .str.replace("[^a-zA-Z0-9 .,']", '', regex=True)
)

In [23]:
# Embed every synopsis with BERT and stream the vectors to 'semantics.csv',
# one row per record. The Bert model's output has 768 columns.
EMBEDDING_DIM = 768

# First row to process. Keep 0 for a full, reproducible run (the header is
# written with row 0). To resume an interrupted run, set this to the number
# of data rows already present in 'semantics.csv'.
START_ROW = 0

for i, x in df.iterrows():

    if i < START_ROW:
        continue

    synopsis = x['sypnopsis']

    if pd.isna(synopsis) or synopsis == '':
        # Nothing to embed: use an all-zero vector as a placeholder.
        semantic = np.zeros(EMBEDDING_DIM, dtype=np.float32)
    else:
        # BERT's limit is in tokens, not characters; slicing to 512
        # characters is a cheap proxy that keeps the input short.
        semantic = np.asarray(get_semantics(synopsis[:512]))

    # Reuse the vector chosen above. Calling get_semantics() again here would
    # throw away the zero vector for empty synopses and double the inference
    # cost for every row.
    semantic = pd.DataFrame(semantic.reshape(1, -1))

    if i == 0:
        # First row: create/overwrite the file and write the header.
        semantic.to_csv('semantics.csv', index=False)
    else:
        semantic.to_csv('semantics.csv', index=False, mode='a', header=None)

    print('{}/{}'.format(i+1, df.shape[0]), flush=True, end='\r')

16214/16214

In [24]:
# Read the embeddings back from disk; the first line of the file is the header.
semantics = pd.read_csv('semantics.csv', header=0)

In [26]:
# Show both shapes side by side to compare the row counts.
(semantics.shape, df.shape)

((16214, 768), (16214, 5))