In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

import torch

import numpy as np

In [2]:
# Locations of the fine-tuned BERT checkpoint and its tokenizer.
model_path = 'model/fine_tuned_BERT'
tokenizer_path = 'tokenizer'

# The standalone helper cells further down (tokenizeInputs, model(**tokenized_input),
# modelPredictions(model, ...)) read these module-level objects, so they have to be
# created here for the notebook to survive a kernel restart + "Run All".
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [9]:
# helper functions

def preProcessInput(titles, summaries, genres):
    """Normalise raw movie fields into one record per movie.

    Args:
        titles: sequence of title strings, ``[title_1, title_2, ...]``.
        summaries: sequence of summary strings, indexed in step with ``titles``.
        genres: sequence of genre collections, indexed in step with ``titles``,
            e.g. ``[["Action", "Fiction"], []]``. An empty collection or
            ``None`` means no genres were given for that movie.

    Returns:
        A list of dicts with the keys ``'title'``, ``'summary'`` and
        ``'genres'``. Runs of whitespace in the title and summary are collapsed
        to single spaces; genres are joined with ``'|'``, or replaced by the
        placeholder ``'NonGiven'`` when there are none.
    """
    inputs = []

    for i, raw_title in enumerate(titles):
        # split() + join collapses any run of whitespace into a single space
        title_i = ' '.join(raw_title.split())
        summary_i = ' '.join(summaries[i].split())

        movie_genres = genres[i]
        # Treat None and any empty collection (list, tuple, ...) as "no genres";
        # comparing against [] alone let None/() slip through to join().
        if movie_genres is None or len(movie_genres) == 0:
            genres_i = 'NonGiven'
        else:
            genres_i = '|'.join(movie_genres)

        inputs.append({'title': title_i, 'summary': summary_i, 'genres': genres_i})

    return inputs


def tokenizeInputs(inputs):
    """Tokenize pre-processed movie records as two-segment inputs.

    ``inputs`` is a list of dicts with the keys ``'title'``, ``'summary'`` and
    ``'genres'``. The first segment of each pair is the title and summary
    glued together with the literal text ``'<SEP>'``; the second segment is
    the genres string. The module-level ``tokenizer`` is called with padding
    to ``'max_length'``, truncation enabled and ``return_tensors="pt"``, and
    its result is returned unchanged.
    """
    text_segments = ['<SEP>'.join((record['title'], record['summary'])) for record in inputs]
    genre_segments = [record['genres'] for record in inputs]

    return tokenizer(
        text_segments,
        genre_segments,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )


def modelPredictions(model, tokenized_input):
    """Run ``model`` on a tokenized batch and pick the top-scoring class per row.

    Args:
        model: callable that accepts the tokenized fields as keyword arguments
            and returns an object exposing a ``logits`` tensor.
        tokenized_input: mapping that is unpacked into ``model`` as keyword
            arguments.

    Returns:
        A tensor holding, for every row of the logits, the index of the
        largest value along the last dimension.
    """
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_output = model(**tokenized_input)

    # The logits are a torch tensor, so use torch's argmax directly instead of
    # relying on NumPy's duck-typed dispatch onto a non-NumPy object.
    return torch.argmax(model_output.logits, dim=-1)
            
def predMovieRating(predictions):
    """Pair every predicted class index with its rating label.

    A prediction equal to 0 is labelled "bad", one equal to 1 is labelled
    "average", and anything else is labelled "good". Returns a list of
    ``(prediction, label)`` tuples in the order of ``predictions``.
    """
    def label_of(pred):
        # Equality checks (rather than a dict lookup) keep this usable for
        # values that compare equal to ints without hashing like them.
        if pred == 0:
            return "bad"
        if pred == 1:
            return "average"
        return "good"

    return [(pred, label_of(pred)) for pred in predictions]
            

class MovieClassifier:
    """Predict movie ratings with a fine-tuned BERT sequence classifier.

    Titles, summaries and genres are the inputs; every movie is assigned one
    of the labels listed in ``ratings``.
    """

    # Rating label for each class index produced by the model.
    ratings = ['bad', 'average', 'good']

    def __init__(self, model_path, tokenizer_path):
        """Load the model and the tokenizer.

        Args:
            model_path: passed to ``AutoModelForSequenceClassification.from_pretrained``.
            tokenizer_path: passed to ``AutoTokenizer.from_pretrained``.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def __preProcessInput(self, titles, summaries, genres):
        """Normalise raw movie fields into one record per movie.

        Args:
            titles: sequence of title strings, ``[title_1, title_2, ...]``.
            summaries: sequence of summary strings, indexed in step with ``titles``.
            genres: sequence of genre collections, indexed in step with
                ``titles``. An empty collection or ``None`` means no genres.

        Returns:
            A list of dicts with the keys ``'title'``, ``'summary'`` and
            ``'genres'``; whitespace runs are collapsed to single spaces and
            genres are joined with ``'|'`` (``'NonGiven'`` when there are none).
        """
        inputs = []

        for i, raw_title in enumerate(titles):
            # split() + join collapses any run of whitespace into a single space
            title_i = ' '.join(raw_title.split())
            summary_i = ' '.join(summaries[i].split())

            movie_genres = genres[i]
            # None and every empty collection count as "no genres"; comparing
            # against [] alone let None/() slip through to join().
            if movie_genres is None or len(movie_genres) == 0:
                genres_i = 'NonGiven'
            else:
                genres_i = '|'.join(movie_genres)

            inputs.append({'title': title_i, 'summary': summary_i, 'genres': genres_i})

        return inputs

    def __tokenizeInputs(self, inputs):
        """Tokenize records as (title + '<SEP>' + summary, genres) pairs.

        Calls ``self.tokenizer`` with padding to ``'max_length'``, truncation
        enabled and ``return_tensors="pt"``, and returns its result.
        """
        title_mod = [movie['title'] + '<SEP>' + movie['summary'] for movie in inputs]
        genres_list = [movie['genres'] for movie in inputs]

        return self.tokenizer(
            title_mod,
            genres_list,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __modelPredictions(self, model, tokenized_input):
        """Run ``model`` on the tokenized batch and return the argmax class per row.

        ``model`` is the callable to evaluate (``predict`` passes
        ``self.model``); it receives ``tokenized_input`` unpacked as keyword
        arguments and must return an object exposing ``logits``.
        """
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            model_output = model(**tokenized_input)

        # Logits are a torch tensor, so use torch's argmax rather than NumPy's.
        return torch.argmax(model_output.logits, dim=-1)

    def __predMovieRating(self, predictions):
        """Pair every predicted class index with its label from ``ratings``."""
        return [(pred, self.ratings[pred]) for pred in predictions]

    def predict(self, title, summary, genre):
        """Predict a rating for each movie.

        Args:
            title: sequence of title strings.
            summary: sequence of summary strings, indexed in step with ``title``.
            genre: sequence of genre collections, indexed in step with ``title``.

        Returns:
            A list of ``(class_index, label)`` tuples, one per movie, where
            ``class_index`` is an element of the argmax tensor and ``label``
            is the matching entry of ``ratings``.
        """
        movies = self.__preProcessInput(title, summary, genre)
        tokenized_movies = self.__tokenizeInputs(movies)
        predictions = self.__modelPredictions(self.model, tokenized_movies)
        return self.__predMovieRating(predictions)
        
        
        
    
# cls = MovieClassifier(blah, blarg)

# pred = cls.predict

# MovieClassifier().predict(cls, title, ...)

In [16]:
# Build the classifier from the configured paths, then score two sample movies
# (the second one is given without any genres).
cls = MovieClassifier(model_path, tokenizer_path)
pred = cls.predict(
    ['this is the title', 'second movie'],
    ["this movie is about icecream", 'second summary'],
    [['Romance'], []],
)
pred

[(tensor(1), 'average'), (tensor(1), 'average')]

In [19]:
# Keep only the label (second element) of every (class index, label) pair.
[rating[1] for rating in pred]

['average', 'average']

In [39]:
# Label predicted for the first movie.
pred[0][1]

'average'

In [5]:
# Try the standalone pre-processing helper on two sample movies; the first
# title and summary contain irregular spacing on purpose.
movies = preProcessInput(
    ["this is the    first title", "here is another title"],
    ["the first movie   is about icecream", "the second movie is about forests"],
    [["Action", "Fiction"], ["Romance", "Thriller"]],
)

In [6]:
# Show the pre-processed records.
movies

[{'title': 'this is the first title',
  'summary': 'the first movie is about icecream',
  'genres': 'Action|Fiction'},
 {'title': 'here is another title',
  'summary': 'the second movie is about forests',
  'genres': 'Romance|Thriller'}]

In [7]:
# Genres string of every record, as it will be passed to the tokenizer.
[m['genres'] for m in movies]

['Action|Fiction', 'Romance|Thriller']

In [14]:
# Tokenize the sample records with the standalone helper and show the result.
tokenized_input = tokenizeInputs(movies)
tokenized_input

{'input_ids': tensor([[ 101, 1142, 1110,  ...,    0,    0,    0],
        [ 101, 1303, 1110,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [22]:
# Check what type a single row of input_ids has.
type(tokenized_input.input_ids[0])

torch.Tensor

In [30]:
# Raw forward pass with the module-level `model`, outside torch.no_grad(),
# to look at the full output object.
model(**tokenized_input)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.9529,  5.2467, -2.4924],
        [-2.3565,  3.7838, -1.5596]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [31]:
# with torch.no_grad():
    
#     a = model(**tokenized_input)
#     a
    
# logits = a.logits
# logits

# predictions = np.argmax(logits, axis = -1)
# predictions

In [34]:
# for i in range(len(predictions)):
#     print(predictions[i] == 1)

In [35]:
# Class index with the highest logit for every sample movie.
preds = modelPredictions(model, tokenized_input)
preds

tensor([1, 1])

In [82]:
# Map the predicted class indices to their rating labels. The helper defined in
# this notebook is predMovieRating; no modelMovieRating exists.
predMovieRating(preds)

[(tensor(1), 'average'), (tensor(1), 'average')]