In [None]:
# General Stuff:
import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime
import gc
from collections import defaultdict, Counter


# Transformers:
if 'transformers' not in sys.modules:
  !pip install transformers
import transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, BertModel
from transformers import DistilBertTokenizer, DistilBertModel

from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig, DistilBertModel

# Dimensionality Reduction:
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn import random_projection

# Neural Networks Stuff
import torch
from torch import nn, optim
from torch.utils import data

# Statistics Stuff
from sklearn.model_selection import train_test_split, cross_val_score

# Visualization Stuff
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 240)
    
# List every file that ships with the attached Kaggle input datasets.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Use the first CUDA device when one is present, otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if has_gpu else "cpu")
if has_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Faster Realtime Sentiment Analysis with Random Projections

<a id="toc"></a>
## Table of Content
1. [Introduction](#introduction)
1. [Data Preparation](#preparation)
1. [Embeddings](#embeddings)
1. [Classification](#classification)
1. [Affective Space](#affective)
1. [Dimensionality Reduction](#reduction)
1. [Conclusions](#conclusions)

<a id="introduction"></a>

## Introduction

In this notebook I will try to improve the time and size complexity of sentiment analysis, in order to make it more suitable for online and real-time usage.

My approach is to reduce the dimensions of the word embedding using random projections, and to choose a reduction that maintains the "affective distances".

By "affective distances" I mean my heuristic for deciding whether a reduced embedding maintains the following property: following the Plutchik emotion circumplex, the distances between all the words in the circumplex should remain relatively the same.

The notebook is available on [Kaggle](https://www.kaggle.com/odedgolden/sentiment-analysis-with-dimensionality-reduction/)

[Table of content](#toc)



<table><td><img src="http://www.feelguide.com/wp-content/uploads/2011/06/Plutchik.jpg" width="600"></td></table>

<a id="preparation"></a>
## Data Preparation

[Table of content](#toc)


> “Creative minds are uneven, and the best of fabrics have their dull spots.” - H.P. Lovecraft

In [None]:
root = "/kaggle/input/sentiment-analysis-on-movie-reviews/"
train = pd.read_csv(root + 'train.tsv.zip', sep="\t")
test = pd.read_csv(root + 'test.tsv.zip', sep="\t")
print(train.shape,test.shape)
train.head(10)

### Initiate BERT Model

The BERT model was proposed in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It’s a bidirectional transformer pre-trained using a combination of masked language modeling objective and next sentence prediction on a large corpus comprising the Toronto Book Corpus and Wikipedia.

It will be very helpful to use a pretrained BERT model (here its distilled variant, DistilBERT), in order to get the tokenizer and the word embeddings.

In [None]:
PRE_TRAINED_MODEL_NAME = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

bert_embeddings = bert_model.get_input_embeddings()

In [None]:
sample_sentence = "Creative minds are uneven, and the best of fabrics have their dull spots."
tokens = tokenizer.tokenize(sample_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
encoded = tokenizer.encode(sample_sentence)
print(tokens)
print(token_ids)
print(encoded)

<a id="embeddings"></a>

## Embeddings

[Table of content](#toc)


> "Words have no power to impress the mind without the exquisite horror of their reality." - Edgar Allan Poe.

We will use the BERT pre-trained embeddings which map each word id to its (1,768) vector.

In [None]:
def word_to_index(word):
    """Look up the vocabulary id of ``word`` with the module-level ``tokenizer``.

    The word is handed to ``tokenizer.convert_tokens_to_ids`` unchanged (no
    word-piece splitting is done here) and the result of that call is returned.

    NOTE(review): words that are not a single vocabulary entry presumably come
    back as the tokenizer's unknown-token id -- confirm before relying on it.
    """
    return tokenizer.convert_tokens_to_ids(word)

print(word_to_index('Like'))

def index_to_vec(word_id):
    """Return the input embedding of ``word_id`` as a NumPy array.

    The id is wrapped in a one-element long tensor, looked up in the
    module-level ``bert_embeddings`` layer, detached from the autograd graph
    and converted to NumPy.
    # presumably the result is (1, embedding_dim) for a scalar id -- TODO confirm
    """
    ids = torch.tensor([word_id], dtype=torch.long)
    embedded = bert_embeddings(ids)
    return embedded.detach().numpy()
vec = index_to_vec(word_to_index('Like'))
print(vec.shape)

<a id="classification"></a>

## Classification

I will test three classification models.

1. BERT assisted model.
2. Simple linear model using only BERT embeddings.
3. Simple linear model using BERT embeddings and dimensionality reduction.

[Table of content](#toc)



### Torch Classes

First let's define the Datasets that will hold the train samples and test samples.

They will take the raw reviews, tokenize them and return the encoded vector. (not yet the embedding vectors)

In [None]:
class TrainDataSet(data.Dataset):
    """Map-style dataset that pairs each raw review with its sentiment label.

    Tokenization happens lazily, one review at a time, inside ``__getitem__``.

    Args:
        reviews: indexable collection of review texts.
        labels: indexable collection of labels, aligned with ``reviews``.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every encoded review is padded / truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews, self.labels = reviews, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode review ``item`` and return it with its label as a dict."""
        text = str(self.reviews[item])

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer output is flattened into 1-D tensors so that the
        # DataLoader can stack the items into a batch.
        sample = {'review_text': text}
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(self.labels[item], dtype=torch.long)
        return sample

Note that for the test dataset we would like to have no labels.

In [None]:
class TestDataSet(data.Dataset):
    """Map-style dataset for unlabeled reviews (inference / submission).

    Mirrors ``TrainDataSet`` but carries a per-review identifier instead of a
    label.

    Args:
        ids: indexable collection of review identifiers, aligned with ``reviews``.
        reviews: indexable collection of review texts.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every encoded review is padded / truncated to.
    """

    def __init__(self, ids, reviews, tokenizer, max_len):
        self.ids = ids
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode review ``item`` and return it with its identifier as a dict."""
        review = str(self.reviews[item])
        # The original bound this value to a different name than the one used
        # in the returned dict, so every item access raised NameError.
        review_id = str(self.ids[item])

        encoding = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return {
            'review_id': review_id,
            'review_text': review,
            # Flattened into 1-D tensors so the DataLoader can stack a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [None]:
def create_train_data_loader(df, tokenizer, max_len, batch_size):
    """Build a shuffled ``DataLoader`` over the labeled reviews in ``df``.

    Args:
        df: DataFrame with a ``Phrase`` (text) and a ``Sentiment`` (label) column.
        tokenizer: tokenizer forwarded to ``TrainDataSet``.
        max_len: padded / truncated token length of every review.
        batch_size: number of reviews per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``TrainDataSet`` batches.
    """
    # No trailing commas here: they used to wrap each array in a 1-tuple,
    # which made the size report below always print 1.
    reviews = df['Phrase'].to_numpy()
    labels = df['Sentiment'].to_numpy()
    print(f'Reviews size: {len(reviews)}')
    dataset = TrainDataSet(reviews=reviews, labels=labels, tokenizer=tokenizer, max_len=max_len)
    print(f'Dataset size: {len(dataset)}')
    return data.DataLoader(dataset, batch_size=batch_size, num_workers=4, shuffle=True)

def create_test_data_loader(df, tokenizer, max_len, batch_size):
    """Build an unshuffled ``DataLoader`` over the unlabeled reviews in ``df``.

    Args:
        df: DataFrame with a ``Phrase`` (text) column and an identifier column.
        tokenizer: tokenizer forwarded to ``TestDataSet``.
        max_len: padded / truncated token length of every review.
        batch_size: number of reviews per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``TestDataSet`` batches in
        the row order of ``df``.
    """
    # No trailing commas here: they used to wrap each array in a 1-tuple,
    # which made the size report below always print 1.
    reviews = df['Phrase'].to_numpy()
    # 'id' is still preferred whenever it exists, so callers that provide it
    # see no change.
    # NOTE(review): the competition's test.tsv appears to name its key column
    # 'PhraseId' rather than 'id' -- confirm against the loaded frame.
    id_column = 'id' if 'id' in df.columns else 'PhraseId'
    ids = df[id_column].to_numpy()
    print(f'Reviews size: {len(reviews)}')
    dataset = TestDataSet(ids=ids, reviews=reviews, tokenizer=tokenizer, max_len=max_len)
    print(f'Dataset size: {len(dataset)}')
    return data.DataLoader(dataset, batch_size=batch_size, num_workers=4)

Now let's split the train DataFrame into train set and validation set.

In [None]:
# Fixed seed so the train/validation partition is the same on every run.
train_set, val_set = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
BATCH_SIZE = 4
MAX_LEN = 160

train_data_loader = create_train_data_loader(train_set, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)
val_data_loader = create_train_data_loader(val_set, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)

In [None]:
sample = next(iter(train_data_loader))
print(sample['input_ids'].shape)

### Models:

<a id="models"></a>
1. [BertReviewClassifier](#BertReviewClassifier)
1. [LinearReviewClassifier](#LinearReviewClassifier)

<a id="BertReviewClassifier"></a>
#### Bert Review Classifier

In [None]:
class BertReviewClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear 5-way sentiment head."""

    def __init__(self):
        super().__init__()
        self.num_labels = 5

        # Kept as an attribute only; forward() returns raw logits.
        self.softmax = nn.Softmax(dim=1)
        self.bert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.classifier = nn.Linear(self.bert.config.dim, self.num_labels)
        self.dropout = nn.Dropout(0.3)

        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for a batch of encoded reviews.

        The first element of the encoder output is sliced at sequence
        position 0, passed through dropout and then through the linear head.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoder_out[0][:, 0, :]
        return self.classifier(self.dropout(first_token_state))

<a id="LinearReviewClassifier"></a>
#### Linear Review Classifier

The linear model will get an optional parameter - the reduction matrix, that will reduce the embedding vectors from their origin size (768 in our case) to a smaller size (32 in our case).

In [None]:
class LinearReviewClassifier(nn.Module):
    """Linear sentiment head on top of DistilBERT's context-free input embeddings.

    The token embeddings of a review are averaged over its real (non-padding)
    tokens, optionally projected to a lower dimension with a fixed
    random-projection matrix, and fed through dropout and a linear layer.

    Args:
        reduction_transformer: optional fitted object exposing ``components_``
            of shape (n_components, embedding_dim), e.g. a fitted
            ``GaussianRandomProjection``. ``None`` disables the reduction.
    """

    def __init__(self, reduction_transformer=None):
        super(LinearReviewClassifier, self).__init__()
        self.num_labels = 5

        self.softmax = nn.Softmax(dim=1)
        self.bert_embeddings = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME).get_input_embeddings()
        self.reduction_transformer = reduction_transformer

        if reduction_transformer is None:
            reduction_matrix = None
            in_features = self.bert_embeddings.embedding_dim
        else:
            # Transposed so that (batch, embedding_dim) @ matrix -> (batch, n_components).
            reduction_matrix = torch.from_numpy(np.transpose(reduction_transformer.components_)).float()
            in_features = reduction_matrix.shape[1]

        # Registered as a buffer so it follows the module through .to(device)
        # instead of being pinned to a global device at construction time.
        # persistent=False keeps it out of state_dict(), so checkpoints keep
        # the same keys as before.
        self.register_buffer('reduction_matrix', reduction_matrix, persistent=False)

        self.classifier = nn.Linear(in_features, self.num_labels)
        self.dropout = nn.Dropout(0.3)

        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for a batch of encoded reviews.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: (batch, seq_len) mask, 1 for real tokens and 0 for
                padding; ``None`` averages over every position.

        Returns:
            (batch, num_labels) logits.
        """
        token_vectors = self.bert_embeddings(input_ids)

        # Pool over the whole sequence. These embeddings carry no context, and
        # the datasets in this notebook tokenize with add_special_tokens=True,
        # so position 0 holds the same leading special token for every review;
        # classifying from that position alone ignores the review text.
        if attention_mask is None:
            pooled_output = token_vectors.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(token_vectors.dtype)
            # clamp guards against a fully masked row (division by zero).
            token_count = weights.sum(dim=1).clamp(min=1.0)
            pooled_output = (token_vectors * weights).sum(dim=1) / token_count

        # The projection is linear, so applying it after pooling is equivalent
        # to projecting every token first, and needs one small matmul.
        if self.reduction_matrix is not None:
            pooled_output = torch.matmul(pooled_output, self.reduction_matrix)

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

Sanity test:

In [None]:
a = torch.LongTensor(1).random_(0, 10)
a = a.to(device)

In [None]:
model = LinearReviewClassifier().to(device)
input_ids = sample['input_ids'].to(device)
attention_mask = sample['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)
prob, pred = torch.max(model(input_ids=input_ids, attention_mask=attention_mask),dim=1)
print(prob)
print(pred)
print(sample['targets'])

### Training

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-d double tensor and
        mean_loss is the NumPy mean of the per-batch losses (NaN when the
        loader yields no batches).
    """
    model = model.train()

    losses = []
    # Tensor accumulator, so the final .double() also works when the loader is empty.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # No per-batch `del outputs` / gc.collect(): the name is rebound on the
        # next iteration, and a full garbage collection on every step only
        # adds overhead.

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, n_examples):
    """Score ``model`` on ``data_loader`` without updating any weights.

    The model is switched to eval mode and every batch is run under
    ``torch.no_grad()``.

    Returns:
        ``(accuracy, mean_loss)``: the number of correct predictions divided
        by ``n_examples`` (a double tensor) and the NumPy mean of the
        per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['targets'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_losses.append(loss_fn(logits, gold).item())

            _, predicted = torch.max(logits, dim=1)
            n_correct += torch.sum(predicted == gold)

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def predict_sentiment(model, data_loader, submission_df):
    """Run ``model`` over ``data_loader`` in eval mode and print each batch's raw outputs.

    ``submission_df`` is accepted but neither read nor modified, and nothing
    is returned.
    """
    model = model.eval()
    with torch.no_grad():
        for batch in data_loader:
            batch_outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            print(batch_outputs)

First we will train the linear model without the reduction:

In [None]:
def run_training(model, epoches, output_file):
    """Train ``model`` on the module-level loaders and save its final weights.

    Uses the module-level ``train_data_loader`` / ``val_data_loader`` and the
    sizes of ``train_set`` / ``val_set`` for the accuracy denominators.

    Args:
        model: module to train; it is moved to the module-level ``device``.
        epoches: number of passes over the training loader.
        output_file: path the final ``state_dict`` is written to.

    Returns:
        ``(history, best_accuracy, running_time)`` where ``history`` maps
        ``train_acc`` / ``train_loss`` / ``val_acc`` / ``val_loss`` to
        per-epoch lists, ``best_accuracy`` is the highest validation accuracy
        seen, and ``running_time`` is a ``timedelta``.
    """
    # NOTE(review): recent transformers releases appear to deprecate their own
    # AdamW in favour of torch.optim.AdamW (which has no correct_bias switch)
    # -- confirm against the installed version.
    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * epoches

    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss().to(device)
    model = model.to(device)

    history = defaultdict(list)
    best_accuracy = 0
    start_time = datetime.now()

    for epoch in range(epoches):
        print(f'Epoch {epoch + 1}/{epoches} started at {datetime.now()}')
        print('-'*10)

        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            scheduler,
            len(train_set)
        )
        print(f'Train loss: {train_loss}, accuracy: {train_acc}')

        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            len(val_set)
        )
        print(f'Validation loss: {val_loss}, accuracy: {val_acc}')

        # 'train_acc' used to receive the training *loss*, so the accuracy
        # plots built from this history were wrong.
        history['train_acc'].append(train_acc.cpu().detach().numpy())
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc.cpu().detach().numpy())
        history['val_loss'].append(val_loss)

        # Previously never updated, so the function always reported 0.
        best_accuracy = max(best_accuracy, val_acc.item())

    running_time = datetime.now() - start_time
    torch.save(model.state_dict(), output_file)
    return history, best_accuracy, running_time

In [None]:
filename = '/kaggle/working/linear_model.pt'
EPOCHES = 5
history, best_accuracy, running_time = run_training(model, EPOCHES, filename)
print(f'Best accuracy: {best_accuracy}, took: {running_time}')
print(history)

In [None]:

# a = np.array(history['val_acc']).reshape((len(history['val_acc']),1))
# print(a.shape)
# b = np.array(history['train_acc']).reshape((len(history['train_acc']),1))
# print(b.shape)
# c = np.array([i in range(EPOCHES)]).reshape(EPOCHES,1))
# print(c.shape)

# data = pd.DataFrame(np.concatenate((a,b, c),axis=1), columns = ["Validation Loss","Train Loss","Epochs"])
# # data.head()
# ax = sns.lineplot(x="Epochs", y="Loss", data=data)

In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

<a id="affective"></a>
## Affective Space
1. [Table of content](#toc)

In [None]:
plutchick_words = [["ecstasy", "joy", "serenity"],["admiration", "trust", "acceptance"],
                   ["terror", "fear", "apprehension"],["amazement", "surprise", "distraction"],
                   ["grief", "sadness", "pensiveness"],["loathing", "disgust", "boredom"],
                   ["rage", "anger", "annoyance"], ["vigilance", "anticipation", "interest"]]

circumplex_emotions = ["ecstasy", "joy", "serenity","admiration", "trust", "acceptance",
                   "terror", "fear", "apprehension","amazement", "surprise", "distraction",
                   "grief", "sadness", "pensiveness","loathing", "disgust", "boredom",
                   "rage", "anger", "annoyance", "vigilance", "anticipation", "interest"]

In [None]:
def get_distances_df(circumplex_emotions, index_to_vec):
    """Build the pairwise distance table between the given emotion words.

    Args:
        circumplex_emotions: list of words; used for both the row and the
            column labels of the result.
        index_to_vec: callable mapping a token id to its vector (NumPy array).

    Returns:
        A square DataFrame whose cell (a, b) is ``np.linalg.norm`` of the
        difference between the vectors of words a and b.
    """
    # Embed every word once; the original re-embedded both words for each of
    # the n*n pairs.
    vectors = [index_to_vec(word_to_index(word)) for word in circumplex_emotions]

    l_emotions = len(vectors)
    distance_matrix = np.zeros((l_emotions, l_emotions))
    for i in range(l_emotions):
        # The matrix is symmetric with a zero diagonal, so only the upper
        # triangle is computed and then mirrored.
        for j in range(i + 1, l_emotions):
            distance = np.linalg.norm(vectors[i] - vectors[j])
            distance_matrix[i][j] = distance
            distance_matrix[j][i] = distance

    return pd.DataFrame(distance_matrix, index=circumplex_emotions, columns=circumplex_emotions)

In [None]:
baseline_df = get_distances_df(circumplex_emotions, index_to_vec)


fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(ax=ax, data=baseline_df, annot=True)

<a id="reduction"></a>

## Dimensionality Reduction

[Table of content](#toc)



In [None]:
emotions_vectors = np.array([index_to_vec(word_to_index((word))) for word in circumplex_emotions]).squeeze()
emotions_vectors.shape

In [None]:
# Seeded so the random projection matrix, and every result built on it, is
# the same on each run.
transformer = random_projection.GaussianRandomProjection(n_components=32, random_state=42)
transformer.fit(emotions_vectors)
emotions_vectors_reduced = transformer.transform(emotions_vectors)

In [None]:
transformer.components_.shape

In [None]:
def index_to_vec_transform(word_id):
    """Embed ``word_id`` with ``index_to_vec`` and project the result with the
    module-level ``transformer`` (the fitted random projection)."""
    return transformer.transform(index_to_vec(word_id))
index_to_vec_transform(word_to_index("Like"))

In [None]:
reduced_df = get_distances_df(circumplex_emotions, index_to_vec_transform)


fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(ax=ax, data=reduced_df, annot=True)

Now that we have a fitted transformer, we can train a new linear model, this time with reduction

In [None]:
# del model
# del bert_model
# del bert_embeddings

gc.collect()

model = LinearReviewClassifier(reduction_transformer=transformer).to(device)
filename = '/kaggle/working/linear_reduced_model.pt'
EPOCHES = 5

history, best_accuracy, running_time = run_training(model, EPOCHES, filename)
print(f'Best accuracy: {best_accuracy}, took: {running_time}')
print(history)

In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

<a id="conclusions"></a>

## Conclusions

[Table of content](#toc)

Unfortunately, I didn't succeed in showing any improvement in accuracy or speed with this fairly simple setup.
However, here are some thoughts and ideas for possible improvements:
1. When writing custom layers, one should be very careful to rely on the deep learning library (torch in this case) rather than on numpy functions for anything that takes part in training.
2. A major disadvantage of my method is the lack of fine tuning of the reduction matrix while training the complete network. This may be addressed by allowing some extent of training for the matrix as well.
3. I didn't really optimize the reduction matrix for the different emotions, but merely for the actual words (some of them even missing from the corpus); one may want to use a more elaborate method that optimizes for the conceptual emotions. For example, one may use entire paragraphs that were validated in psychological experiments and proved to portray the intended emotions.
4. One should certainly try to use a larger, perhaps more complex network, train it longer, and on more diverse and rich data.
5. While this implementation is very naive and simple, the same idea can be used by a pure deep artificial neural network, where instead of fitting a reduction matrix and use it as is, one may add an affective target function that will represent the affective distances between the emotions.
