# XLM-Roberta Sentiment Analysis
## Hindi-English

In [18]:
import sys
sys.version

'3.8.5 (default, Sep 18 2020, 23:02:24) \n[Clang 11.0.3 (clang-1103.0.32.62)]'

In [19]:
# import our packages...
import tensorflow as tf
import torch


# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

SystemError: GPU device not found

## Parsing data

In [96]:
import pandas as pd

# Load the datasets into a pandas dataframe.
df_train = pd.read_csv("data/hindi-english/train_14k_split.csv" )
df_val = pd.read_csv("data/hindi-english/val_3k_split.csv" )

# Report the number of sentences.
print('Number of training sentences in training set: {:,}\n'.format(df_train.shape[0]))
print('Number of training sentences in val set: {:,}\n'.format(df_val.shape[0]))

# Get the lists of sentences and their labels for train and val datasets
sentences_train = df_train.sentence.values
labels_train = df_train.label.values
sentences_val = df_val.sentence.values
labels_val = df_val.label.values

print('Train dataset: \n', df_train.head())
print('Val dataset: \n',df_val.head())

Number of training sentences in training set: 13,935

Number of training sentences in val set: 2,988

Train dataset: 
       id                                           sentence  label sentiment
0   4330  nen á vist bolest vztek smutek zmatek osam ě l...      1   neutral
1  41616  Haan yaar neha pensive pensive kab karega woh ...      1   neutral
2   6648  television media congress ke liye nhi h Ye toh...      0  negative
3   2512  All India me nrc lagu kare w Kashmir se dhara ...      2  positive
4    610  who Pagal hai kya They aren t real issues Mand...      1   neutral
Val dataset: 
       id                                           sentence  label sentiment
0  30258  modi mantrimandal may samil honay par badhai n...      2  positive
1  16648                Rashid Tu toh naamakool hai Mare h       0  negative
2  28511  U saw caste and religion in them nation saw ta...      0  negative
3  10466  sir local police station pe complaint krne par...      1   neutral
4  19266  Ve Maahi 

## Tokenizing the data

In [105]:
from transformers import XLMRobertaTokenizer

# Load the BERT tokenizer.
print('Loading XLMRobertaTokenizer ...')
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)
tokenized_train = tokenizer.tokenize(sentences_train[0])
tokenized_ids_train = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_train[0]))
tokenized_val = tokenizer.tokenize(sentences_val[0])
tokenized_ids_val = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_val[0]))

# Print the original train sentence; tokenized and IDs mapped.
print('Original Train Sentence: ', sentences_train[13249])
print('Tokenized Train Sentence: ', tokenized_train)
print('Token Train IDs: ', tokenized_ids_train)

# Print the original val sentence; tokenized and IDs mapped.
print('Original Val Sentence: ', sentences_val[0])
print('Tokenized Train Sentence: ', tokenized_val)
print('Token Train IDs: ', tokenized_ids_val)


Loading XLMRobertaTokenizer ...
Original Train Sentence:  ONE OF MY FAVORITE PHOTO BY THE WAY SHAM USE IT AS A WHATS APP WALLPAPER zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grinning face with one large and one small eye zany face grin

In [104]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_train = []
attention_masks_train = []
input_ids_val = []
attention_masks_val = []

# Loop through sentences for Train and Val datasets
for sent in sentences_train:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        padding = 'max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids_train.append(encoded_dict['input_ids'])

    # Add its attention mask (simply differentiates padding from non-padding).
    attention_masks_train.append(encoded_dict['attention_mask'])

for sent in sentences_val:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = 256,
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids_val.append(encoded_dict['input_ids'])
    attention_masks_val.append(encoded_dict['attention_mask'])
    
# Convert the lists into tensors.
input_ids_train = torch.cat(input_ids_train, dim=0)
attention_masks_train = torch.cat(attention_masks_train, dim=0)
labels_train = torch.tensor(labels_train)

input_ids_val = torch.cat(input_ids_val, dim=0)
attention_masks_val = torch.cat(attention_masks_val, dim=0)
labels_val = torch.tensor(labels_val)

print('Original train sentence \n: ', sentences_train[1])
print('Train Token IDs \n:', input_ids_train[1])
print('Train labels: \n', labels_train)
print('\n')
print('Original val sentence \n: ', sentences_train[1])
print('Val Token IDs \n:', input_ids_train[1])
print('Val labels: \n', labels_train)


RuntimeError: Sizes of tensors must match except in dimension 0. Got 256 and 289 in dimension 1 (The offending index is 13249)

## Train and Validate

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
train_dataset = TensorDataset(input_ids, attention_masks, labels)
val_dataset = TensorDataset(input_ids, attention_masks, labels)

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

## Train Classification Model

In [None]:
from transformers import XLMRobertaForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

In [None]:
from google.colab import drive
drive.mount('/content/drive')