In [76]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [77]:
# Locations of the saved artefacts, relative to the notebook's working directory.
MODEL_DIR = 'model/fine_tuned_BERT'
TOKENIZER_DIR = "tokenizer"

# Load the sequence-classification model and the tokenizer from those directories.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

In [100]:
# helper functions

def preProcessInput(titles, summaries, genres):
    """Normalise raw movie metadata into one record per movie.

    Args:
        titles: list of title strings, ``[title_1, title_2, ...]``.
        summaries: list of summary strings, ``[summary_1, summary_2, ...]``.
        genres: list of genre lists, ``[[genres_1], [genres_2], ...]`` where
            each inner list holds genre-name strings. An empty (or ``None``)
            entry means no genres were supplied for that movie.

    Returns:
        A list with one dict per movie, each with the keys ``'title'``,
        ``'summary'`` and ``'genres'``. Runs of whitespace in the title and
        summary are collapsed to single spaces; the genres are joined into one
        ``'|'``-separated string, or ``'NonGiven'`` when none were supplied.

    Raises:
        ValueError: if ``titles``, ``summaries`` and ``genres`` differ in length.
    """
    # Fail loudly on misaligned inputs instead of silently dropping trailing
    # entries or hitting an IndexError half-way through the loop.
    if not (len(titles) == len(summaries) == len(genres)):
        raise ValueError(
            'titles, summaries and genres must have the same length, got '
            f'{len(titles)}, {len(summaries)} and {len(genres)}'
        )

    inputs = []

    for title, summary, movie_genres in zip(titles, summaries, genres):
        # normalise spacing in the title and the summary
        title_i = ' '.join(title.split())
        summary_i = ' '.join(summary.split())

        if not movie_genres:
            # covers [], () and None alike
            genres_i = 'NonGiven'
        else:
            # convert the list of genres to one string separated by '|'
            genres_i = '|'.join(movie_genres)

        inputs.append({'title': title_i, 'summary': summary_i, 'genres': genres_i})

    return inputs


def tokenizeInputs(inputs):
    """Tokenize preprocessed movie records with the notebook-level ``tokenizer``.

    Args:
        inputs: iterable of dicts with the keys ``'title'``, ``'summary'`` and
            ``'genres'`` (the records built by ``preProcessInput``).

    Returns:
        A list with one tokenizer result per movie, in input order. Each result
        additionally carries the original ``'title'``, ``'summary'`` and
        ``'genres'`` strings under those keys.
    """

    def _encode(record):
        # First segment: title and summary glued together; second segment: genres.
        # NOTE(review): '<SEP>' is a plain string literal, not the tokenizer's
        # own separator token -- the sample output in this notebook shows it
        # encoded as several ordinary ids. Presumably the fine-tuned model was
        # trained on this exact format; confirm before changing it.
        first_segment = record['title'] + '<SEP>' + record['summary']
        encoded = tokenizer(
            first_segment,
            record['genres'],
            padding='max_length',
            truncation=True,
        )
        # Keep the human-readable fields next to the encoded ones.
        for field in ('title', 'summary', 'genres'):
            encoded[field] = record[field]
        return encoded

    return [_encode(record) for record in inputs]
    
    

In [79]:
# Two toy movies; the irregular spacing exercises the whitespace normalisation.
movies = preProcessInput(
    titles=["this is the    first title", "here is another title"],
    summaries=["the first movie   is about icecream", "the second movie is about forests"],
    genres=[["Action", "Fiction"], ["Romance", "Thriller"]],
)

In [80]:
# Show the preprocessed records: spacing collapsed, genres joined with '|'.
movies

[{'title': 'this is the first title',
  'summary': 'the first movie is about icecream',
  'genres': 'Action|Fiction'},
 {'title': 'here is another title',
  'summary': 'the second movie is about forests',
  'genres': 'Romance|Thriller'}]

In [103]:
# Inspect the encoding of the second movie (index 1). The tokenizer is called
# with padding='max_length', so input_ids is followed by a long run of 0s.
tokenizeInputs(movies)[1]

{'input_ids': [101, 1303, 1110, 1330, 1641, 133, 12342, 2101, 135, 1103, 1248, 2523, 1110, 1164, 5775, 102, 13589, 197, 157, 24657, 1200, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,