In [4]:
import torch
import pandas as pd
import random
import numpy as np
import time
import datetime

In [5]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer, XLMRobertaTokenizer
from transformers import BertForSequenceClassification, XLMRobertaForSequenceClassification, AdamW, BertConfig

In [2]:
from helper_func import *
from model import *

## Training
### Generate tokenizer & model

In [6]:
# Seed before building the model: the classification head is not part of the
# pretrained checkpoint (see the "not initialized from the model checkpoint"
# warning in this cell's output), so it is freshly initialized and would
# otherwise differ on every kernel restart.
# Assumes fix_random_seed() seeds the RNG used for weight init - confirm in helper_func.
fix_random_seed()

# Tokenizer: used to pad and tokenize the input text.
tokenizer_bert = init_tokenizer(BertTokenizer, 'bert-base-uncased')
# Sequence-classification model built from the same pretrained checkpoint name.
model_bert = init_model(BertForSequenceClassification, "bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### Read training data

In [17]:
# Path to the training CSV, relative to the notebook's working directory.
filename_train = "./data/sentiment-train.csv"
# read_sentiment_file returns two values, unpacked here as the texts and labels
# (presumably parallel sequences, one entry per example - confirm in helper_func).
total_text, total_labels = read_sentiment_file(filename_train)

### Generate train and validation datasets

In [18]:
# Ratio handed to generate_train_val_dataset; presumably the share of examples
# kept for training, with the remainder used for validation - confirm in helper_func.
split_ratio = 0.9

# Build one dataset from the texts, labels and tokenizer, then split it in two.
full_dataset = generate_dataset(total_text, total_labels, tokenizer_bert)
train_dataset, val_dataset = generate_train_val_dataset(full_dataset, split_ratio)



### Generate train and validation dataloaders

In [19]:
# The sampler *class* (not an instance) is passed to generate_dataloader.
# Training uses RandomSampler; validation uses SequentialSampler so its
# iteration order stays fixed.
train_dataloader = generate_dataloader(train_dataset, sampler=RandomSampler)
valid_dataloader = generate_dataloader(val_dataset, sampler=SequentialSampler)

### Train

In [10]:
# Select the compute device; the cell output below reports which GPU was chosen.
# `device` is later passed to train_model and evaluate_model.
device = choose_device()

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 Ti


In [7]:
# Seed immediately before training; keep this call first so the training run
# starts from a known RNG state (assuming fix_random_seed seeds the RNGs that
# training relies on - confirm in helper_func).
fix_random_seed()
# Train model_bert on `device`, passing the training batches and the validation batches.
train_model(model_bert, device, train_dataloader, valid_dataloader)

### Test
- English

In [8]:
# English test set: read texts/labels, build a dataset with the same tokenizer
# used for training, and wrap it in a dataloader.
filename_test_en = "./data/sentiment-test.csv"
test_text_en, test_labels_en = read_sentiment_file(filename_test_en)
test_dataset_en = generate_dataset(test_text_en, test_labels_en, tokenizer_bert)
# SequentialSampler keeps the test examples in a fixed order.
test_dataloader_en = generate_dataloader(test_dataset_en, sampler=SequentialSampler)



In [16]:
# Evaluate model_bert on the English test batches. This is the cell's last
# expression, so any value evaluate_model returns is shown as the cell output.
evaluate_model(model_bert, device, test_dataloader_en)

- Spanish (evaluated with the same `bert-base-uncased` model and tokenizer as the English test)

In [13]:
# Spanish test set: same pipeline as the English test set, different input file.
filename_test_sp = "./data/spanish-sentiment-test.csv"
test_text_sp, test_labels_sp = read_sentiment_file(filename_test_sp)
# NOTE(review): the Spanish text is encoded with the English 'bert-base-uncased'
# tokenizer. This looks like a deliberate cross-lingual baseline (XLM-R classes are
# imported but unused in the visible cells) - confirm this is intended.
test_dataset_sp = generate_dataset(test_text_sp, test_labels_sp, tokenizer_bert)
# SequentialSampler keeps the test examples in a fixed order.
test_dataloader_sp = generate_dataloader(test_dataset_sp, sampler=SequentialSampler)



In [17]:
# Evaluate the same model_bert on the Spanish test batches. This is the cell's
# last expression, so any value evaluate_model returns is shown as the cell output.
evaluate_model(model_bert, device, test_dataloader_sp)