In [1]:
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, AdamW, BertConfig
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import TensorDataset, random_split
from sklearn.model_selection import train_test_split

In [2]:
tokenizer = BertTokenizer.from_pretrained ('bert-base-uncased', do_lower_case=True)

In [3]:
train_file = "Data/train.tsv"
trainDF = pd.read_csv (train_file, sep='\t')
# df = df[0:20]
trainDF['text']     = trainDF.text.astype (str)
trainDF['labels']   = trainDF.target.astype (np.int64)
trainDF      = trainDF[['text', 'labels']]
train_texts  = list (trainDF.text.values)
train_labels = list (trainDF.labels.values)

In [4]:
test_file = "Data/test.tsv"
testDF = pd.read_csv (test_file, sep='\t')
# df = df[0:20]
testDF['id']     = testDF.id.astype (int)
testDF['text']   = testDF.text.astype (str)
testDF['labels'] = testDF.target.astype (np.int64)
testDF      = testDF[['id', 'text', 'labels']]
test_texts  = list (testDF.text.values)
test_labels = list (testDF.labels.values)

In [5]:
sentences = trainDF.text.values
# Print the original sentence.
print(' Original: ', sentences[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids (tokenizer.tokenize (sentences[0])))

 Original:  Just happened a terrible car crash
Tokenized:  ['just', 'happened', 'a', 'terrible', 'car', 'crash']
Token IDs:  [2074, 3047, 1037, 6659, 2482, 5823]


In [6]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode (sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max (max_len, len(input_ids))

print ('Max sentence length: ', max_len)

Max sentence length:  84


# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus (
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append (encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append (encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat (input_ids, dim=0)
attention_masks = torch.cat (attention_masks, dim=0)
labels = torch.tensor (labels)

# Print sentence 0, now as a list of IDs.
print ('Original: ', sentences[0])
print ('Token IDs:', input_ids[0])

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset (input_ids, attention_masks, labels)

# Create a 70-30 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int (0.7 * len (dataset))
val_size = len (dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader (
    
    train_dataset,  # The training samples.
    sampler = RandomSampler (train_dataset), # Select batches randomly
    batch_size = batch_size # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader (
    
    val_dataset, # The validation samples.
    sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
    batch_size = batch_size # Evaluate with this batch size.
)

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split (train_texts, train_labels, test_size=.3)

train_encodings = tokenizer (train_texts, add_special_tokens = True, max_length = max_len, truncation=True, padding=True)
val_encodings   = tokenizer (val_texts,   add_special_tokens = True, max_length = max_len, truncation=True, padding=True)
test_encodings  = tokenizer (test_texts,  add_special_tokens = True, max_length = max_len, truncation=True, padding=True)

In [8]:
class Tweet_Dataset (torch.utils.data.Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = Tweet_Dataset (train_encodings, train_labels)
val_dataset   = Tweet_Dataset (val_encodings,   val_labels)
test_dataset  = Tweet_Dataset (test_encodings,  test_labels)

In [10]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained (
    
    "bert-base-uncased",          # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,               # The number of output labels--2 for binary classification.
                                  # You can increase this for multi-class tasks.   
    output_attentions = False,    # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
# model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def compute_metrics (pred):
    
    labels = pred.label_ids
    preds = pred.predictions.argmax (-1)
    precision, recall, f1, _ = precision_recall_fscore_support (labels, preds, average='binary')
    acc = accuracy_score (labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments (
    
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=360,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy='epoch',
    # evaluation_strategy='steps',
    save_steps=int (len (train_dataset)/32),
    fp16=True,
    # eval_steps=100
)

trainer = Trainer (
    
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)



In [12]:
# trainer.train ()

In [13]:
# Now train on the whole dataset

train_file = "Data/train.tsv"
trainDF = pd.read_csv (train_file, sep='\t')
# df = df[0:20]
trainDF['text']     = trainDF.text.astype (str)
trainDF['labels']   = trainDF.target.astype (np.int64)
trainDF = trainDF[['text', 'labels']]
train_texts  = list (trainDF.text.values)
train_labels = list (trainDF.labels.values)

train_encodings = tokenizer (train_texts, add_special_tokens = True, max_length = max_len, truncation=True, padding=True)
train_dataset = Tweet_Dataset (train_encodings, train_labels)

training_args = TrainingArguments (
    
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy='no',
    save_steps=int (len (train_dataset)/32),
    fp16=True,
)

trainer = Trainer (
    
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    compute_metrics=compute_metrics,
)
VERSION = 2



In [14]:
trainer.train ()
trainer.save_model ('tweet_bert-base-uncased_' + str (VERSION))

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=258.0, style=ProgressStyle(description_wi…







In [15]:
predictions, label_ids, metrics = trainer.predict (test_dataset)
metrics

HBox(children=(FloatProgress(value=0.0, description='Prediction', max=51.0, style=ProgressStyle(description_wi…




{'eval_loss': 0.38612647808990047,
 'eval_accuracy': 0.8375727857799571,
 'eval_f1': 0.800601956358164,
 'eval_precision': 0.8471337579617835,
 'eval_recall': 0.7589158345221113}

In [16]:
submitDF           = testDF[['id']]
submitDF['target'] = label_ids
submitDF['id']     = submitDF.id.astype (int)
submitDF['target'] = submitDF.target.astype (int)
submitDF.to_csv ('submission_bert_v'+str (VERSION) + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
