In [2]:
# To mount google drive onto google colab
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [1]:
import pandas as pd

In [10]:
# Load the dataset
df = pd.read_csv('FYPDATASET/balanced.csv')

In [11]:
# Dataset is already preprocessed
df.head()

Unnamed: 0,Review,Sentiment
0,best candy corn on the planet ill keep this sh...,Positive
1,cat food my cats eat it that is all i can say ...,Positive
2,onions overwhelm otherwise lowkey flavor the o...,Negative
3,yummy tasted good spicy those that dont like s...,Positive
4,good flavor the product is the same as what we...,Positive


In [12]:
value_counts = df['Sentiment'].value_counts()

print(value_counts)

Positive    82037
Negative    82037
Name: Sentiment, dtype: int64


In [13]:
df['Value'] = df['Sentiment'].apply(lambda x: 1 if 'Positive' in x else 0)

In [14]:
df = df[['Review', 'Value']]

In [15]:
df = df.head(10000)

In [16]:
df.head()


Unnamed: 0,Review,Value
0,best candy corn on the planet ill keep this sh...,1
1,cat food my cats eat it that is all i can say ...,1
2,onions overwhelm otherwise lowkey flavor the o...,0
3,yummy tasted good spicy those that dont like s...,1
4,good flavor the product is the same as what we...,1


In [18]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [19]:
score_dtype = df['Value'].dtype

print(score_dtype) 

int64


In [20]:
# Get the lists of sentences and their labels.
sentences = df["Review"]
labels = df["Value"]

In [21]:
print(len(sentences))
print(len(labels))

10000
10000


In [23]:
max_len = 0

for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  1905


In [None]:
import time
import datetime

def format_time(elapsed):
 
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [24]:
import torch

In [29]:
input_ids = []
attention_masks = []

for sent in sentences:

    encoded_dict = tokenizer.encode_plus(
                        sent,      
                        truncation=True,               
                        add_special_tokens = True, 
                        max_length = 64,           
                        pad_to_max_length = True,
                        return_attention_mask = True, 
                        return_tensors = 'pt',   
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  best candy corn on the planet ill keep this sho and sweet  i have a sweet tooth and i love candy corn  i also love caramels  this candy corn brings a whole new light on the candy corn industry and if you havent tried them yet pick up a bagbest candy ever
Token IDs: tensor([  101,  2190,  9485,  9781,  2006,  1996,  4774,  5665,  2562,  2023,
        26822,  1998,  4086,  1045,  2031,  1037,  4086, 11868,  1998,  1045,
         2293,  9485,  9781,  1045,  2036,  2293, 14418, 10199,  2015,  2023,
         9485,  9781,  7545,  1037,  2878,  2047,  2422,  2006,  1996,  9485,
         9781,  3068,  1998,  2065,  2017,  4033,  2102,  2699,  2068,  2664,
         4060,  2039,  1037,  4524, 12681,  2102,  9485,  2412,   102,     0,
            0,     0,     0,     0])




In [36]:
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)


train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('training samples')
print(train_size)
print('validation samples')
print(val_size)

training samples
9000
validation samples
1000


In [37]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


batch_size = 32

train_dataloader = DataLoader(
            train_dataset, 
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size 
        )

In [38]:
from transformers import BertForSequenceClassification, AdamW


model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,       
    output_attentions = False, 
    output_hidden_states = False,
)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [40]:

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )




In [41]:
from transformers import get_linear_schedule_with_warmup

# The BERT authors recommend between 2 and 4. 
epochs = 2

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [42]:
import numpy as np

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [45]:
import torch

# GPU setup
if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [52]:
import torch
torch.cuda.is_available()

False

In [46]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


training_stats = []

# Measure the total training time
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        


        # outputs prior to activation.
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output.loss
        logits = output.logits

  
        total_train_loss += loss.item()

        loss.backward()

       
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

   
        optimizer.step()

        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        

    print("Running Validation...")

    t0 = time.time()


    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        

        with torch.no_grad():        

            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = output.loss
            logits = output.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

     
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


KeyboardInterrupt: 

In [40]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

for sent in sentences:

    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True, 
                        max_length = 64,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'pt',    
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  labels = torch.tensor(labels)


Loading the trained model

In [71]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(output_dir)


Testing on custom data

In [84]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = "taste like good food"
inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')


In [85]:
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1)


In [86]:
print(predictions)

tensor([1])
