In [40]:
import regex as re
from langdetect import detect
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import torch
import random
from transformers import DistilBertModel, DistilBertTokenizer, BertModel, BertTokenizer
import joblib

# Seed every RNG the notebook touches (NumPy, PyTorch, stdlib) from one value
# so repeated runs are reproducible.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

# English stop-word list consumed by clean_text(). The list lives in an NLTK
# data package that is downloaded separately from the library, so on a fresh
# environment the lookup raises LookupError; fetch the corpus once and retry
# instead of breaking "Restart & Run All".
try:
    stop = stopwords.words('english')
except LookupError:
    import nltk

    nltk.download('stopwords', quiet=True)
    stop = stopwords.words('english')

In [54]:
def clean_text(text, stop_words=None):
    """Normalise a raw comment before it is tokenised.

    Steps, in order:
      1. delete every character that is not an ASCII letter, a digit,
         whitespace or one of ``. , ! ?``;
      2. lower-case the result;
      3. split on whitespace and drop every token found in ``stop_words``;
      4. re-join the remaining tokens with single spaces.

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents a
            missing value) are treated as an empty comment; any other
            non-string value is converted with ``str()``.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, the module-level ``stop`` list is used.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    # Missing values would otherwise crash re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); a list would be scanned once per token.
    stop_words = frozenset(stop_words)

    # NOTE: no language check is performed; non-English text is cleaned like
    # any other input.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text).lower()

    # Tokens keep any attached punctuation ("movie." stays "movie."), so a
    # stop word followed directly by punctuation is NOT removed.
    # NOTE(review): the default NLTK list appears to contain negations such as
    # "not"/"no" -- dropping them can invert the sentiment of a comment.
    # Confirm, and if the list is changed, retrain the downstream classifier
    # so training and inference preprocessing stay identical.
    return ' '.join(word for word in cleaned_text.split() if word not in stop_words)

In [55]:
def get_embeddings(sample_comments, model, tokenizer, max_length=None):
    """Encode comments into one fixed-size vector each.

    Every comment is cleaned with ``clean_text``, tokenised (with special
    tokens), right-padded to the longest sequence in the batch and passed
    through ``model`` in a single forward pass without gradient tracking.
    The hidden state at sequence position 0 of the model's first output is
    returned as the embedding of each comment.

    Args:
        sample_comments: Iterable of raw comment strings.
        model: Callable encoder invoked as
            ``model(input_ids, attention_mask=...)``; element ``[0]`` of its
            result is indexed as ``[batch, position, feature]``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)`` and, optionally,
            ``pad_token_id``.
        max_length: Optional cap on the number of tokens per comment. ``None``
            leaves the limit to the tokenizer's own configured maximum.

    Returns:
        A NumPy array with one row per comment. For an empty input the array
        has zero rows (and ``model.config.hidden_size`` columns when that
        attribute exists, otherwise zero columns).
    """
    cleaned = [clean_text(comment) for comment in sample_comments]

    # An empty batch cannot be padded or run through the encoder.
    if not cleaned:
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Truncate so an over-long comment cannot exceed the encoder's position
    # limit and crash the forward pass.
    tokenized = [
        tokenizer.encode(
            comment,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
        )
        for comment in cleaned
    ]

    # Pad with the tokenizer's own pad id (falling back to 0 when it does not
    # declare one) and derive the mask from the true lengths, so a real token
    # whose id equals the pad id is never masked out by mistake.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    max_len = max(len(ids) for ids in tokenized)
    batch_size = len(tokenized)
    input_ids = torch.full((batch_size, max_len), pad_id, dtype=torch.long)
    attention_mask = torch.zeros((batch_size, max_len), dtype=torch.long)  # 0 means ignore
    for row, ids in enumerate(tokenized):
        input_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        attention_mask[row, :len(ids)] = 1

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # First output, position 0 of every sequence -> one vector per comment.
    return outputs[0][:, 0, :].cpu().numpy()

In [56]:
def pipeline(comments, model_path, encoder_name='distilbert-base-uncased'):
    """Predict a sentiment label for each comment.

    The comments are embedded with a pretrained DistilBERT encoder (via
    ``get_embeddings``), the embeddings are scored by the classifier stored
    at ``model_path``, and the numeric predictions are mapped to labels
    (-1 -> 'Negative', 0 -> 'Neutral', 1 -> 'Positive').

    Args:
        comments: A comment string or an iterable of comment strings. A bare
            string is treated as a single comment rather than being iterated
            character by character.
        model_path: Path to a joblib-serialised classifier exposing
            ``predict(embeddings)``.
        encoder_name: Name or path of the pretrained checkpoint used for both
            the tokenizer and the encoder.

    Returns:
        A DataFrame with the columns ``comment`` and ``sentiment``, one row
        per comment. Empty input yields an empty DataFrame with the same
        columns without loading any model.

    Raises:
        KeyError: If the classifier predicts a value other than -1, 0 or 1.
    """
    if isinstance(comments, str):
        comments = [comments]
    elif not hasattr(comments, '__len__'):
        # One-shot iterators would be exhausted by the embedding step and
        # leave nothing for the result frame.
        comments = list(comments)

    if len(comments) == 0:
        return pd.DataFrame(columns=['comment', 'sentiment'])

    # NOTE: tokenizer and encoder are re-loaded on every call.
    tokenizer = DistilBertTokenizer.from_pretrained(encoder_name)
    encoder = DistilBertModel.from_pretrained(encoder_name)
    embeddings = get_embeddings(comments, encoder, tokenizer)

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code -- only pass paths to classifier files from a trusted source.
    classifier = joblib.load(model_path)
    predicted = classifier.predict(embeddings)

    sentiment_map = {-1: 'Negative', 1: 'Positive', 0: 'Neutral'}
    sentiments = [sentiment_map[label] for label in predicted]
    return pd.DataFrame({'comment': comments, 'sentiment': sentiments})

In [57]:
# Demo inputs for the pipeline call below. Only the last entry is active; the
# other entries are commented out and can be re-enabled to score a larger batch.
sample_comments = [
    # "I didn't think they'd be able to top the first one but goddammit they pulled it off.  This is immediately my new favorite Spider-Man movie",
    # "Who knew movies about the multiverse were so fascinated with bagels?",
    # "I absolutely loved it! Only thing is, I don’t know if it was just how they had the sound at my theater or if it was the sound mixing in the movie, but there were a lot of moments where the soundtrack was booming and the dialogue got lost and I couldn’t hear what they were saying. Like even Gwen’s opening monologue I could barely hear. I almost wished I had closed captioning. Was it like this for anyone else or was it just the speakers in my theater? My husband and I might try to watch in a different theater",
    # "That might have been the best first half of an animated movie I’ve ever seen",
    # "Cool twist at the end when Miles Morales met his evil alt-universe twin Miles NoMorales",
    "I did not like the movie."
]

In [58]:
# Score the sample comments with the pickled classifier; the returned
# DataFrame is displayed as the cell output.
# NOTE(review): the recorded output labels "I did not like the movie." as
# Positive. Possibly clean_text() removes the negation as a stop word before
# the text is embedded -- TODO confirm.
pipeline(sample_comments, model_path='sentiment_analysis_model.pkl')

Unnamed: 0,comment,sentiment
0,I did not like the movie.,Positive
