In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import transformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see (empty string when none).
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    # Do not abort here: the PyTorch device selection that follows has an
    # explicit CPU fallback, which a hard error at this point made unreachable.
    print('WARNING: TensorFlow did not find a GPU device; continuing without it.')

import torch

# Choose the PyTorch compute device: CUDA when a GPU is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs there are and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Importing the data

In [None]:
# Load the airline-tweets CSV into a DataFrame and preview the first rows.
tweets_csv_path = "../input/twitter-airline-sentiment/Tweets.csv"
df = pd.read_csv(tweets_csv_path)
df.head()

# Cleaning the Data

In [None]:
# Show the row count, then keep only the tweet text and its sentiment label.
print(len(df["text"]))
kept_columns = ["text", "airline_sentiment"]
df = df[kept_columns]
df.head()

In [None]:
# Label order defines the integer id of each class (position in this list).
class_names = ['negative', 'neutral', 'positive']


def to_sentiment(rating):
    """Map a sentiment name to its integer id (its position in `class_names`).

    Raises:
        ValueError: if `rating` is not one of the names in `class_names`.
    """
    label_id = class_names.index(rating)
    return label_id

# Encode the textual sentiment as an integer column, then keep text + label only.
sentiment_ids = df['airline_sentiment'].apply(to_sentiment)
df['sentiment'] = sentiment_ids
model_columns = ["text", "sentiment"]
df = df[model_columns]
df.head()

In [None]:
import re

# Strip @mentions from every tweet.
# Twitter handles may contain letters, digits AND underscores; without "_" in
# the character class a handle such as "@Some_Name" would leave "_Name" behind.
mention_pattern = re.compile(r"@[A-Za-z0-9_]+")

clean_text = [mention_pattern.sub("", text) for text in df['text']]

df['text'] = clean_text
df.head()

# Shuffling the data 

In [None]:
# Shuffle all rows and renumber the index from 0.
# A fixed random_state makes the row order identical on every run, so a
# Restart & Run All feeds the same sequence of tweets to the later cells.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

# BERT-Tokenizer:

In [None]:
import torch
from transformers import BertTokenizer, TFBertModel
pre_tr_mdl='bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pre_tr_mdl)

input_ids = []
attention_masks = []

# Iterate over the raw tweet strings themselves.
# (Wrapping the column in zip() yields 1-tuples like ("some tweet",); encode_plus
# interprets a list/tuple of strings as ALREADY tokenized input, so the whole
# tweet would be looked up as a single vocabulary token instead of being split
# into word pieces.)
for sent in df['text']:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 250,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Each entry is a tensor with a leading batch dimension of 1 (return_tensors='pt').
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-tweet tensors along the batch dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Build the label tensor from the underlying NumPy array rather than the Series
# object, so the conversion does not depend on the Series index.
labels = torch.tensor(df['sentiment'].values)

# Splitting the dataset:

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle token ids, attention masks and labels so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 85% of the samples go to training; whatever remains is used for validation.
total_samples = len(dataset)
train_size = int(0.85 * total_samples)
val_size = total_samples - train_size

# Randomly assign samples to the two subsets.
split_lengths = [train_size, val_size]
train_dataset, val_dataset = random_split(dataset, split_lengths)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Creating data-loaders:

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of samples per batch, shared by training and validation.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in a fixed, sequential order.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

# Selecting BERT of our choice:

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# BertForSequenceClassification
# Pre-trained BERT with a sequence-classification head for the three sentiment classes.
classifier_options = {
    "num_labels": 3,
    "output_attentions": False,
    "output_hidden_states": False,
}
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', **classifier_options)


# Selecting the GPU for training

In [None]:
# Move the model onto the device selected earlier in the notebook
# (`device` is the CUDA device when one is available, otherwise the CPU).
model.to(device)
import numpy as np
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose highest-scoring class equals the label.

    Args:
        preds: torch tensor of per-class scores with one row per example
            (2-D, ``(batch, num_classes)``, as passed by the callers in this
            notebook). May live on any device and may require grad.
        labels: torch tensor of integer class ids; it is flattened before
            being compared with the predictions.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    # Copy the whole batch to host memory once instead of once per row,
    # then take the arg-max of every row in a single vectorized call.
    scores = preds.detach().cpu().numpy()
    predicted = scores.argmax(axis=1)
    labels_flat = labels.flatten().cpu().numpy()
    return np.sum(predicted == labels_flat) / len(labels_flat)
def flat_accuracy_v2(preds, labels):
    """Like `flat_accuracy`, but also return the labels and predictions used.

    Args:
        preds: torch tensor of per-class scores with one row per example
            (2-D, ``(batch, num_classes)``, as passed by the callers in this
            notebook). May live on any device and may require grad.
        labels: torch tensor of integer class ids; flattened before comparison.

    Returns:
        A 3-tuple ``(accuracy, labels_flat, p)`` where ``accuracy`` is the
        share of matching predictions, ``labels_flat`` is the flattened labels
        as a NumPy array and ``p`` is a list with the predicted class index of
        every row.
    """
    # One host copy for the whole batch, then a vectorized row-wise arg-max.
    scores = preds.detach().cpu().numpy()
    predicted = scores.argmax(axis=1)
    labels_flat = labels.flatten().cpu().numpy()
    # Keep returning the predictions as a plain list of NumPy integers.
    p = list(predicted)

    return np.sum(predicted == labels_flat) / len(labels_flat),labels_flat,p

# Training our BERT-Model:

In [None]:
from transformers import BertTokenizer, glue_convert_examples_to_features
import tensorflow as tf
import tensorflow_datasets as tfds

acc=[]
optim = AdamW(model.parameters(), lr=5e-5)


def _batch_accuracies(model, dataloader):
    """Evaluate `model` on every batch of `dataloader`; return the per-batch accuracies.

    Each batch is indexed as (input ids, attention mask, labels). The model is
    put in eval mode and gradient tracking is disabled for the whole pass; the
    caller is responsible for switching back to train mode afterwards.
    """
    model.eval()
    accuracies = []
    with torch.no_grad():  # no graph is needed for evaluation
        for batch in dataloader:
            batch_input_ids = batch[0].to(device)
            batch_attention_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            # outputs[1] holds the per-class scores used for the accuracy.
            accuracies.append(flat_accuracy(outputs[1], batch_labels))
    return accuracies


# Baseline: validation accuracy before any fine-tuning. The whole pass runs in
# eval mode (the model is only switched back to train mode afterwards).
test_res = _batch_accuracies(model, validation_dataloader)
model.train()

print("UNTUNED ACCURACY==>",sum(test_res)/len(test_res))

Epochs=2

for epoch in range(Epochs):
    print("Epoch:",epoch+1," of ",Epochs)
    total_batches = len(train_dataloader)
    model.train()

    train_res=[]
    for step, batch in enumerate(train_dataloader, start=1):
        print("Progress {:2.1%}".format(step / total_batches), end="\r")
        # Batch-local names, so the full `input_ids` / `labels` tensors built
        # earlier in the notebook are not overwritten by a single batch.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch otherwise
        # keeps adding every batch's gradients onto the old ones.
        optim.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs[0]
        train_res.append(flat_accuracy(outputs[1], batch_labels))
        loss.backward()
        optim.step()
    print("TRAIN ACCURACY==>",sum(train_res)/len(train_res))

    # Validation accuracy after this epoch.
    test_res = _batch_accuracies(model, validation_dataloader)
    print("TEST ACCURACY==>",sum(test_res)/len(test_res))
    model.train()

# Generating Test Results:

In [None]:
# Collect, for every validation batch: the decoded input text (dsen), the true
# labels (label_s) and the predicted class ids (pred_s).
label_s,pred_s,dsen=[],[],[]
# Start from an empty list so that accuracies left over from an earlier cell
# (or an earlier run of this cell) are not mixed into the result.
test_res=[]
model.eval()
with torch.no_grad():  # inference only; no gradients needed
    for batch in validation_dataloader:
        # Turn each row of token ids back into a readable string.
        dsen.extend(
            tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ii))
            for ii in batch[0]
        )
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # One call yields accuracy, flattened labels and predictions together.
        batch_accuracy, batch_labels, batch_preds = flat_accuracy_v2(outputs[1],labels)
        test_res.append(batch_accuracy)
        label_s.append(batch_labels)
        pred_s.append(batch_preds)

print("TEST ACCURACY==>",sum(test_res)/len(test_res))
