# [Kaggle competition](https://www.kaggle.com/c/amazon-reviews-sentiment-2020-hse/leaderboard)

Accuracy ~0.708

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
# Let displayed frames show up to 400 characters per cell so long text columns stay readable.
pd.set_option('display.max_colwidth', 400)

In [None]:
# ! unzip amazon-reviews-sentiment-2020-hse.zip

Archive:  amazon-reviews-sentiment-2020-hse.zip
  inflating: eng_train_data.csv      
  inflating: fr_text.csv             
  inflating: sample_submission.csv   


In [None]:
import time
import numpy as np
import pandas as pd
import random

import torch
from keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertPreTrainedModel, BertModel
from transformers import get_linear_schedule_with_warmup

import os

from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler


# Length (in token ids) that every encoded sentence is truncated / padded to.
MAX_LEN = 128
# Number of examples per DataLoader batch.
batch_size = 32


def create_attention_masks(encoded_sentences):
    """Build one attention mask per encoded sentence.

    Each mask has the same length as its sentence and contains 1 where the
    token id is greater than 0 and 0 everywhere else.

    Args:
        encoded_sentences: iterable of sequences of token ids.

    Returns:
        A list of lists of ints (0 or 1), one inner list per sentence.
    """
    return [
        [1 if token_id > 0 else 0 for token_id in sentence]
        for sentence in encoded_sentences
    ]


def preprocessing(df, is_train=True):
    """Tokenize ``df.sentence`` into fixed-length rows of BERT token ids.

    Every sentence is encoded with the ``bert-base-multilingual-cased``
    tokenizer (special tokens added, truncated to ``MAX_LEN`` tokens) and then
    right-padded with 0 up to ``MAX_LEN``.

    Args:
        df: DataFrame with a ``sentence`` column and, when ``is_train`` is
            true, a ``label`` column.
        is_train: when true, labels are read from ``df.label`` and returned
            together with the encoded sentences.

    Returns:
        ``(encoded_sentences, labels)`` when ``is_train`` is true, otherwise
        only ``encoded_sentences``. ``encoded_sentences`` is the array returned
        by ``pad_sequences`` (one row of ``MAX_LEN`` ids per sentence) and
        ``labels`` is a NumPy array built from ``df.label``.
    """
    # The checkpoint is a *cased* model, so the text must not be lower-cased:
    # with do_lower_case=True the tokenizer lower-cases the input (and, by
    # default, strips accents along with it), which no longer matches the
    # vocabulary the model was pretrained with.
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

    encoded_sentences = [
        tokenizer.encode(
            sent,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN
        )
        for sent in df.sentence.values
    ]

    # Pad (and, defensively, truncate) at the end so every row has MAX_LEN ids;
    # 0 is the padding value that create_attention_masks treats as "no token".
    encoded_sentences = pad_sequences(encoded_sentences, maxlen=MAX_LEN, dtype="long",
                                      value=0, truncating="post", padding="post")
    if is_train:
        labels = np.array(df.label.values)
        return encoded_sentences, labels
    return encoded_sentences

def accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D NumPy array of scores; the predicted class of a row is the
            index of its largest value along axis 1.
        labels: NumPy array of true class indices (flattened before comparing).

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = predicted == expected
    return np.sum(hits) / len(expected)


def run_train(model, train_dataloader, validation_dataloader, device, epochs, optimizer, scheduler=None):
    """Train ``model`` for ``epochs`` epochs and print validation accuracy after each one.

    Args:
        model: callable as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``. Element 0 of its output must be
            the loss when ``labels`` is given and the logits otherwise.
        train_dataloader: yields batches whose first three elements are
            ``(input_ids, attention_mask, labels)`` tensors.
        validation_dataloader: yields ``(input_ids, attention_mask, labels)``
            batches used only for the accuracy report.
        device: torch device the batches are moved to.
        epochs: number of passes over ``train_dataloader``.
        optimizer: optimizer already bound to ``model``'s parameters.
        scheduler: optional learning-rate scheduler; when given, it is stepped
            once after every optimizer step. Defaults to ``None`` (constant
            learning rate), which keeps existing six-argument calls working.

    Returns:
        ``(losses, model)`` where ``losses`` holds the mean training loss of
        each epoch and ``model`` is the same (now trained) object.
    """
    losses = []
    for e in range(epochs):
        print(f'======== Epoch {e + 1} / {epochs} ========')
        start_train_time = time.time()
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):

            if step % 10 == 0:
                elapsed = time.time() - start_train_time
                print(f'{step}/{len(train_dataloader)} --> Time elapsed {elapsed}')

            input_data = batch[0].to(device)
            input_masks = batch[1].to(device)
            input_labels = batch[2].to(device)

            model.zero_grad()

            out = model(input_data,
                        token_type_ids=None,
                        attention_mask=input_masks,
                        labels=input_labels)

            loss = out[0]
            total_loss = total_loss + loss.item()

            loss.backward()

            # In-place variant; the un-suffixed clip_grad_norm is deprecated.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            if scheduler is not None:
                # A per-step schedule only takes effect if it is advanced here.
                scheduler.step()

        epoch_loss = total_loss / len(train_dataloader)
        losses.append(epoch_loss)
        print(f"Training took {time.time() - start_train_time}")

        start_validation_time = time.time()
        model.eval()
        # Count correct predictions over all samples instead of averaging
        # per-batch accuracies, which over-weights a smaller final batch.
        correct, seen = 0, 0
        with torch.no_grad():
            for batch in validation_dataloader:
                eval_data, eval_masks, eval_labels = (t.to(device) for t in batch)
                out = model(eval_data,
                            token_type_ids=None,
                            attention_mask=eval_masks)
                predictions = out[0].argmax(dim=1)
                correct += (predictions == eval_labels.reshape(-1)).sum().item()
                seen += eval_labels.numel()

        # An empty validation loader yields NaN rather than a division error.
        val_accuracy = correct / seen if seen else float('nan')
        print(f"Accuracy: {val_accuracy}, Time elapsed: {time.time() - start_validation_time}")
    return losses, model

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training data: rename the two columns to the names preprocessing() reads.
df = pd.read_csv("eng_train_data.csv")
df.columns = ['sentence', 'label']

# Test data to predict on: an id column plus the text column.
df_test = pd.read_csv("fr_text.csv")
df_test.columns = ['id', 'sentence']

In [None]:
# Hold out 10% of the rows for validation. The global RNG seeds are only set in
# a later cell, so the split is pinned with an explicit random_state (same value
# as seed_val) to make it reproducible across kernel restarts.
val = df.sample(int(len(df) * 0.1), random_state=18)
# Everything that was not sampled into the validation set is used for training.
tr = df[~df.index.isin(val.index)]

In [None]:
# Seed every RNG in use first, so that everything random from here on
# (training-batch shuffling, later weight initialisation) is reproducible.
seed_val = 18

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

train_encoded_sentences, train_labels = preprocessing(tr)
train_attention_masks = create_attention_masks(train_encoded_sentences)

# NOTE: the test_* names below hold the *validation* split (`val`), not the
# competition test set.
test_encoded_sentences, test_labels = preprocessing(val)
test_attention_masks = create_attention_masks(test_encoded_sentences)

train_inputs = torch.tensor(train_encoded_sentences)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_attention_masks)

validation_inputs = torch.tensor(test_encoded_sentences)
validation_labels = torch.tensor(test_labels)
validation_masks = torch.tensor(test_attention_masks)

train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Training batches must be drawn in random order; iterating the rows in file
# order (SequentialSampler) feeds the optimizer correlated batches.
train_sampler = torch.utils.data.RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation order does not affect the metric, so it stays sequential.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# Pretrained multilingual BERT with a 3-way classification head on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the device selected earlier. An unconditional model.cuda()
# raises on CPU-only machines even though `device` already falls back to CPU.
model.to(device)

optimizer = AdamW(model.parameters(),
                    lr=3e-5,
                    eps=1e-8,
                    weight_decay=0.01
                    )

epochs = 1
# One scheduler step per training batch: linear decay over all batches, no warm-up.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [None]:
# Recreate the linear schedule (no warm-up) for the current optimizer and epoch count.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# NOTE(review): `scheduler` is not handed to run_train in this call, so nothing
# steps it here and the learning rate stays at its initial value - confirm
# whether that is intended.
losses, model = run_train(
    model,
    train_dataloader,
    validation_dataloader,
    device,
    epochs,
    optimizer,
)

0/844 --> Time elapsed 0.0022945404052734375




10/844 --> Time elapsed 6.925151348114014
20/844 --> Time elapsed 13.86934781074524
30/844 --> Time elapsed 20.934029817581177
40/844 --> Time elapsed 28.11949610710144
50/844 --> Time elapsed 35.356322288513184
60/844 --> Time elapsed 42.650262117385864
70/844 --> Time elapsed 49.84650111198425
80/844 --> Time elapsed 56.97730875015259
90/844 --> Time elapsed 64.02476119995117
100/844 --> Time elapsed 71.02489924430847
110/844 --> Time elapsed 78.0154767036438
120/844 --> Time elapsed 84.98816752433777
130/844 --> Time elapsed 91.99870920181274
140/844 --> Time elapsed 99.00054264068604
150/844 --> Time elapsed 106.06283140182495
160/844 --> Time elapsed 113.1650800704956
170/844 --> Time elapsed 120.28612470626831
180/844 --> Time elapsed 127.39550185203552
190/844 --> Time elapsed 134.48905634880066
200/844 --> Time elapsed 141.5477590560913
210/844 --> Time elapsed 148.60513877868652
220/844 --> Time elapsed 155.6456425189972
230/844 --> Time elapsed 162.70149731636047
240/844 --> 

In [None]:
def run_evaluation(df_test, model, device):
    """Run ``model`` over every sentence of ``df_test`` and collect the logits.

    The sentences are encoded with ``preprocessing(df_test, False)``, masked
    with ``create_attention_masks`` and fed to the model in order, in batches
    of ``batch_size``, with gradients disabled.

    Args:
        df_test: DataFrame passed to ``preprocessing`` (labels are not read).
        model: model called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)``; element 0 of its output is used as logits.
        device: torch device the batches are moved to.

    Returns:
        A list with one NumPy array of logits per input row, in input order.
    """
    encoded = preprocessing(df_test, False)
    masks = create_attention_masks(encoded)

    dataset = TensorDataset(torch.tensor(encoded), torch.tensor(masks))
    # Sequential order keeps the outputs aligned with the rows of df_test.
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    model.eval()
    res = []
    for batch in loader:
        eval_data, eval_masks = (tensor.to(device) for tensor in batch)
        with torch.no_grad():
            out = model(eval_data,
                        token_type_ids=None,
                        attention_mask=eval_masks)
        # Extending with a 2-D array appends its rows one by one.
        res.extend(out[0].detach().cpu().numpy())

    return res


# Per-row logits for the test set (despite the name, these are not class ids yet).
labels_fr = run_evaluation(df_test, model, device)

In [None]:
# Predicted class of each row = index of its largest logit.
labels_fr_concat = list(map(np.argmax, labels_fr))

In [None]:
# Attach the predicted class to every test row and write the submission file.
df_test['class'] = labels_fr_concat
df_test[['id', 'class']].to_csv('lr_solution_bert.csv', index=False)
# Preview as the last expression of the cell; in the middle of the cell the
# result of .head() was silently discarded and nothing was displayed.
df_test.head()