# Review Sentiment Prediction
In this notebook, we will attempt to predict whether a review has a positive or a negative sentiment.

In [None]:
import pandas as pd

from bs4 import BeautifulSoup

import nltk, re, torch

from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline

from transformers import BertTokenizer, BertForSequenceClassification, AdamW

from zipfile import ZipFile

In [None]:
# Unpack the labeled training set from the Kaggle input archive into ./input.
with ZipFile("/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip", mode="r") as archive:
    archive.extractall(path="input")

In [None]:
# Load the tab-separated labeled reviews and preview the first rows.
df = pd.read_csv("./input/labeledTrainData.tsv", sep='\t')
df.head()

In [None]:
# Class balance check: number of reviews per sentiment label.
df.sentiment.value_counts()

The dataset is perfectly balanced, so there is no need for data augmentation or oversampling.

## Vader Baseline
A first attempt will be to use unsupervised sentiment classification from Vader as provided by nltk, to provide a baseline for our metrics.

In [None]:
# A single shared VADER analyzer; building it once avoids reloading the lexicon per review.
analyzer = SentimentIntensityAnalyzer()


def predict_sentiment(review, threshold=0.0):
    """Classify a review as positive (1) or negative (0) using VADER.

    The review is split into sentences, each sentence gets a VADER
    ``compound`` score, and the review is labeled positive when the mean of
    those scores is strictly greater than ``threshold``.

    Parameters
    ----------
    review : str
        Raw review text.
    threshold : float, default 0.0
        Decision boundary on the mean compound score. The default reproduces
        the plain "mean > 0" rule; a positive value makes the classifier more
        conservative about predicting the positive class.

    Returns
    -------
    int
        1 for a positive prediction, 0 otherwise.
    """
    sentences = sent_tokenize(review)
    if not sentences:
        # Nothing to score. The mean of an empty Series is NaN, which never
        # compares greater than the threshold, so make that outcome explicit.
        return 0
    scores = [analyzer.polarity_scores(sentence)['compound'] for sentence in sentences]
    return 1 if pd.Series(scores).mean() > threshold else 0
    
# Score every review with the VADER baseline and compare against the gold labels.
predictions = df['review'].map(predict_sentiment)
vader_report = classification_report(y_true=df['sentiment'], y_pred=predictions)
print(vader_report)

Vader seems to perform pretty well! It seems to struggle a bit with positive reviews. This might be because the dataset is labeled positive on a rating >= 7/10, so there are many non-negative reviews classified as negative. We might be able to account for this by adjusting the classification threshold from 0 to a positive value, e.g. 0.3.

## Count Vectorization
Another approach is to use count vectorization for the reviews, with manual preprocessing and tokenization. We'll try Random Forest and Multinomial Naive Bayes classifiers.

In [None]:
# Matches runs of NON-word characters (anything outside [A-Za-z0-9_]); used to
# collapse punctuation and whitespace into single spaces.
alphanum_re = re.compile(r"\W+")
# English Snowball stemmer shared by every tokenize() call.
stemmer = nltk.stem.SnowballStemmer("english")
# NLTK's English stop-word list, as a set for fast membership checks.
stop_words = set(stopwords.words("english"))

def preprocess(review):
    """Strip HTML markup and punctuation from a raw review.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The visible text of the review, with every run of non-word
        characters replaced by a single space.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns on every call), so results
    # could differ between environments.
    text = BeautifulSoup(review, "html.parser").get_text()
    text = alphanum_re.sub(" ", text)
    return text

def tokenize(review):
    """Split a preprocessed review into stemmed, stop-word-free tokens.

    Parameters
    ----------
    review : str
        Review text, already cleaned by ``preprocess``.

    Returns
    -------
    list of str
        Stemmed tokens with English stop words removed.
    """
    tokens = nltk.tokenize.word_tokenize(review)
    # The stop-word list is lowercase, and CountVectorizer skips its own
    # lowercasing step when a custom preprocessor is supplied. Lowercase each
    # token before the membership test so capitalised stop words ("The",
    # "And", ...) are filtered too.
    lowered = (token.lower() for token in tokens)
    return [stemmer.stem(token) for token in lowered if token not in stop_words]

def evaluate(model):
    """Cross-validate ``model`` on the full review dataset.

    The estimator is fitted and scored with scikit-learn's default
    cross-validation splitting, using the raw ``review`` column as input and
    the ``sentiment`` column as target.

    Returns the dictionary produced by ``cross_validate``, containing the
    per-fold macro precision, macro recall and accuracy scores.
    """
    metrics = ['precision_macro', 'recall_macro', 'accuracy']
    return cross_validate(model, df['review'], df['sentiment'], scoring=metrics)

We will strip the HTML markup from the review text and replace any runs of non-word characters with spaces, and then tokenize the reviews, remove stop words and stem the remaining tokens before passing them to our models.

In [None]:
# Bag-of-words features (terms seen in at least 10 reviews) fed to a Random Forest.
vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenize, min_df=10)
rf_model = Pipeline([
    ('bow', vectorizer),
    # Random Forest training is stochastic; seed it so the cross-validation
    # scores are reproducible across runs (same seed as the train/test split).
    ('cls', RandomForestClassifier(random_state=1))
])
results = evaluate(rf_model)
# Average each timing/score column over the cross-validation folds.
pd.DataFrame(results).mean(axis=0)

In [None]:
# Same bag-of-words features, this time fed to Multinomial Naive Bayes.
vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenize, min_df=10)
nb_model = Pipeline([
    ('bow', vectorizer),
    ('cls', MultinomialNB())
])
# Evaluate the Naive Bayes pipeline defined in this cell (previously this
# re-evaluated rf_model, so the Naive Bayes scores were never computed).
results = evaluate(nb_model)
# Average each timing/score column over the cross-validation folds.
pd.DataFrame(results).mean(axis=0)

We can already see an improvement over the unsupervised classification. Random Forest seems to perform slightly better.

## BERT Classification
Now we'll try fine-tuning BERT for our classification task.

In [None]:
class ReviewDataSet(torch.utils.data.Dataset):
    """Torch dataset of BERT-tokenized reviews paired with sentiment labels.

    Every review is cleaned with ``preprocess`` and tokenized once, up front,
    so that ``__getitem__`` is a cheap lookup.

    Parameters
    ----------
    X : pandas.Series
        Raw review texts.
    y : pandas.Series
        Labels aligned positionally with ``X``.
    pretrained_name : str, default 'bert-base-uncased'
        Name of the pretrained tokenizer to load. It must match the
        checkpoint of the model being fine-tuned: token ids produced by a
        different vocabulary (e.g. the cased one) would index the wrong
        embeddings.
    max_length : int, default 128
        Length every review is truncated or padded to.
    """

    def __init__(self, X, y, pretrained_name='bert-base-uncased', max_length=128):
        self._labels = y
        reviews = X.apply(preprocess)
        tokenizer = BertTokenizer.from_pretrained(pretrained_name)

        # One encoding per review; return_tensors='pt' is kept per item so the
        # structure returned by __getitem__ stays as callers expect.
        self.items = [tokenizer(
            review,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        ) for review in reviews]

    @property
    def classes(self):
        """Labels passed at construction time.

        Exposed as a property: previously an instance attribute with the same
        name shadowed the ``classes`` method, making the method uncallable.
        ``dataset.classes`` keeps returning the labels.
        """
        return self._labels

    @classes.setter
    def classes(self, value):
        # Keep plain attribute assignment working.
        self._labels = value

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the review at position ``idx``."""
        return self.items[idx], self._labels.iloc[idx]

In [None]:
# Hold out a test split (fixed seed for reproducibility), then wrap the
# training portion in a dataset and a shuffling mini-batch loader.
train_x, test_x, train_y, test_y = train_test_split(
    df['review'],
    df['sentiment'],
    random_state=1,
    shuffle=True,
)
dataset = ReviewDataSet(X=train_x, y=train_y)
dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=10, shuffle=True)

This is very slow on the CPU, so we'll run the computations on the GPU when one is available, which is significantly faster.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

log_interval = 10  # print the average loss every `log_interval` batches
epochs = 3

# Pretrained BERT with a freshly initialised 2-way classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the target device before building the optimizer.
# .to(device) is a no-op on CPU, so no `if cuda` branch is needed.
model.to(device)
model.train()

# NOTE: the model computes its own loss when `labels` are passed to it, so
# this criterion is not needed by the fine-tuning loop; it is kept defined in
# case other cells rely on it.
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn.to(device)

# Fine-tuning a pretrained transformer needs a small learning rate (on the
# order of 1e-5); the previous lr=0.01 is large enough to wreck the pretrained
# weights within a few steps. AdamW applies decoupled weight decay, the usual
# choice for BERT fine-tuning.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Fine-tuning loop: one optimizer step per mini-batch, logging the mean loss
# of the last `log_interval` batches.
for epoch in range(epochs):
    running_loss = 0.0
    
    for i, batch in enumerate(dataloader):
        inputs, labels = batch
        
        optimizer.zero_grad()
        
        # Each review is tokenized on its own with return_tensors='pt', so the
        # collated tensors are expected to carry an extra singleton axis
        # (batch, 1, seq_len). Drop it from BOTH tensors so the attention mask
        # has the same (batch, seq_len) layout as the input ids; squeeze(1) is
        # a no-op if that axis is not of size 1.
        mask = inputs['attention_mask'].squeeze(1).to(device)
        input_id = inputs['input_ids'].squeeze(1).to(device)
        labels = labels.to(device)
        
        outputs = model(input_id,attention_mask=mask, labels=labels)
        
        # When labels are supplied, the first output is the classification loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % log_interval == log_interval - 1:
            print(f"Epoch {epoch+1} Batch {i+1} avg. loss: {running_loss/log_interval}")
            running_loss = 0.0