In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
cd /content/drive/MyDrive/"Colab Notebooks"/561_Final_Project

/content/drive/MyDrive/Colab Notebooks/561_Final_Project


In [34]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import pickle, argparse, os, sys
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import pickle
import nltk
import math
import re

import torch.functional as F
from torch import optim
import torch.nn as nn
import torch

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
!pip install transformers
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup




### Prepare data: split into three sets, replace unfrequent words with UNKA token, and remove stopwords

Manually checked the quality of the lables and decided to remove rows that have confidence less than 0.4. The removed ones accounted for 1.6 percent of the original data.

In [36]:
def read_airline_data(data_path = None):
    '''
        read csv data and filter rows with confidence level of the sentiment label lower than 0.4
        
        parms: data path str
        
        return: pandas dataframe
    '''
    #vader sentiment scorer
    analyzer = SentimentIntensityAnalyzer()
    data = pd.read_csv(data_path)
    data = data[["airline_sentiment", "airline_sentiment_confidence", "airline", "text"]]
    data = data[data["airline_sentiment_confidence"] > .4]
    data = data.reset_index(drop = True)
    data["vader_sentiment"] = data["text"].apply(lambda x: analyzer.polarity_scores(x)["compound"])
    return data

In [37]:
def split_data(data):
    '''
        Split data into three parts
        
        params: pandas dataframe
        return: three pandas dataframe and save as pickle files
    '''
    # split data into training_validation(0.8), validation(0.1), and testing sets(0.1)
    train_valid, test = train_test_split(data, test_size=0.1, random_state=42, shuffle=True, \
                                         stratify=data["airline_sentiment"])
    train, validation = train_test_split(train_valid, test_size=0.1, random_state=42, shuffle=True, \
                                         stratify=train_valid["airline_sentiment"])
    train_pkl = open("train.pkl", "wb")
    pickle.dump(train, train_pkl)
    train_pkl.close()
    
    validation_pkl = open("validation.pkl", "wb")
    pickle.dump(validation, validation_pkl)
    validation_pkl.close()
    
    test_pkl = open("test.pkl", "wb")
    pickle.dump(test, test_pkl)
    test_pkl.close()

    return

In [38]:
def prepare_data(data):
    '''
        create token list for the tweets and create a vocabulary dictionary 
        to replace unfrequent words with UNKA
        
        Also create a token_to_idx dictionary include special token <PAD>
        
        Besides, convert sentiment label into integers and create one hot vectors

        param: pandas dataframe
        return: #token_to_idx, dict
                
    '''
    
    # remove stopwords
    # tokenize tweets
    # create vocab list for checking frequency
    stop_words = set(stopwords.words('english'))
    filtered_text_list = []
    vocab_list = []
    for tweet in tqdm(data["text"]):
        tweet = re.sub("\@[A-Za-z]+", "", tweet)
        tweet = re.sub("http\S+", "", tweet)
        tweet = re.sub("[!.,@#$%^&*]", "", tweet)
        tokens = word_tokenize(tweet)
        filtered_text = [w for w in tokens if not w in stop_words] 
        filtered_text_list.append(filtered_text)
        vocab_list += filtered_text
    data["filtered_text"] = filtered_text_list
    
    # replace unfrequent words with "UNKA"
    vocab_dict = Counter(vocab_list)
    token_to_replace = {k: v for k, v in vocab_dict.items() if v < 3}
    
    for tweet in tqdm(data["filtered_text"]):
        for token_idx in range(len(tweet)):
            if tweet[token_idx] in token_to_replace:
                tweet[token_idx] = "UNKA"

    # create index for sentiment labels
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    data["label_idx"] = data["airline_sentiment"].apply(lambda x: sentiment_to_idx[x])

    # convert tweet from list into str for BERT
    data["filtered_text"] = data["filtered_text"].apply(lambda x: " ".join(x))
        
    return 

Tokenize data and prepare dataset for training

In [40]:
def prepare_dataset():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                            do_lower_case=True)
    train_file = open("train.pkl", "rb")
    train_data = pickle.load(train_file)
    train_file.close()

    valid_file = open("validation.pkl", "rb")
    valid_data = pickle.load(valid_file)
    valid_file.close()

    test_file = open("test.pkl", "rb")
    test_data = pickle.load(test_file)
    test_file.close()

    # Data cleaning before training and testing
    prepare_data(train_data)
    prepare_data(valid_data)
    prepare_data(test_data)
                                            
    encoded_data_train = tokenizer.batch_encode_plus(
        train_data.filtered_text.values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        padding=True, 
        max_length=20, 
        return_tensors='pt'
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        valid_data.filtered_text.values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        padding=True, 
        max_length=20, 
        return_tensors='pt'
    )

    encoded_data_test = tokenizer.batch_encode_plus(
        test_data.filtered_text.values, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        padding=True, 
        max_length=20, 
        return_tensors='pt'
    )

    input_ids_train = encoded_data_train['input_ids']
    input_ids_train[:, -1] = torch.tensor(train_data["vader_sentiment"].tolist())
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(train_data.label_idx.values)

    input_ids_val = encoded_data_val['input_ids']
    input_ids_val[:, -1] = torch.tensor(valid_data["vader_sentiment"].tolist())
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(valid_data.label_idx.values)

    input_ids_test = encoded_data_test['input_ids']
    input_ids_test[:, -1] = torch.tensor(test_data["vader_sentiment"].tolist())
    attention_masks_test = encoded_data_test['attention_mask']
    labels_test = torch.tensor(test_data.label_idx.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

    return dataset_train, dataset_val, dataset_test

  0%|          | 0/11666 [00:00<?, ?it/s]

  0%|          | 0/11666 [00:00<?, ?it/s]

  0%|          | 0/1297 [00:00<?, ?it/s]

  0%|          | 0/1297 [00:00<?, ?it/s]

  0%|          | 0/1441 [00:00<?, ?it/s]

  0%|          | 0/1441 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


BERT pre-trained model

In [41]:
def train(dataset_train, dataset_val):


    epochs = 5
    seed_val = 17
    batch_size = 10
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=len(sentiment_to_idx),
                                                        output_attentions=False,
                                                        output_hidden_states=False)
    
    
    dataloader_train = DataLoader(dataset_train, 
                                sampler=RandomSampler(dataset_train), 
                                batch_size=batch_size)
    dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

    optimizer = AdamW(model.parameters(),
                  lr=2e-5, 
                  eps=1e-8)    
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
                                        
    model.to(device)

    # train starts
    for epoch in tqdm(range(1, epochs+1)):
        
        model.train()
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()
            
            batch = tuple(b.to(device) for b in batch)
            
            inputs = {'input_ids':      batch[0],
                    'attention_mask': batch[1],
                    'labels':         batch[2],
                    }       
            print("iput ids:", batch[0])
            print("labels", batch[2])

            outputs = model(**inputs)
            print("outputs", outputs)
            print("outputs[0]", outputs[0])
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Performs a single optimization step (parameter update).
            optimizer.step()
            # You call scheduler.step() every batch, right after optimizer.step(), to update the learning rate.
            scheduler.step()
            
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
            
            
        torch.save(model.state_dict(), f'second_finetuned_BERT_epoch_{epoch}.model')
            
        tqdm.write(f'\nEpoch {epoch}')
        
        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')
        
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        _, predicted = torch.max(torch.tensor(predictions), 1)
        f1_score_pos, f1_score_neg, f1_score, accuracy_table = calculate_f1(predicted, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {f1_score}')
    


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [46]:
def evaluate(dataloader_val):
    
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=len(sentiment_to_idx),
                                                        output_attentions=False,
                                                        output_hidden_states=False)

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        print("During training one batch, the loss of the output is:", loss)
        print("Loss.item:", loss.item())
        logits = outputs[1]
        print("During training one batch, the logits of the output is:", logits)
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [47]:
def calculate_f1(prediction, ground_truth):
    '''
        calculate the f1 score for positive and
        negative labels and the overall f1 score
        
        params: arrays
        return: three f1 scores
    '''
    accuracy_table = torch.zeros(3,3)

    for pred, truth in zip(prediction, ground_truth):
        if (pred, truth) == (2, 2):
            accuracy_table[0][0] += 1
        elif (pred, truth) == (2, 0):
            accuracy_table[0][1] += 1
        elif (pred, truth) == (2, 1):
            accuracy_table[0][2] += 1
        elif (pred, truth) == (0, 2):
            accuracy_table[1][0] += 1
        elif (pred, truth) == (0, 0):
            accuracy_table[1][1] += 1
        elif (pred, truth) == (0, 1):
            accuracy_table[1][2] += 1
        elif (pred, truth) == (1, 2):
            accuracy_table[2][0] += 1
        elif (pred, truth) == (1, 0):
            accuracy_table[2][1] += 1
        elif (pred, truth) == (1, 1):
            accuracy_table[2][2] += 1


    presition_pos = accuracy_table[0][0] / (accuracy_table[0][0] +  accuracy_table[0][1] + accuracy_table[0][2])
    recall_pos = accuracy_table[0][0] / (accuracy_table[0][0] +  accuracy_table[1][0] + accuracy_table[2][0])
    presition_neg = accuracy_table[2][2] / (accuracy_table[2][0] +  accuracy_table[2][1] + accuracy_table[2][2])
    recall_neg = accuracy_table[2][2] / (accuracy_table[0][2] +  accuracy_table[1][2] + accuracy_table[2][2])

    f1_score_pos = (2 * presition_pos * recall_pos) / (presition_pos + recall_pos)
    f1_score_neg = (2 * presition_neg * recall_neg) / (presition_neg + recall_neg)
    f1_score = (f1_score_pos + f1_score_neg) / 2

    return f1_score_pos, f1_score_neg, f1_score, accuracy_table

Evaluate the test set

In [57]:
def test(model_path = 'second_finetuned_BERT_epoch_5.model'):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=len(sentiment_to_idx),
                                                        output_attentions=False,
                                                        output_hidden_states=False)

    model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

    batch_size = 10
    dataloader_test = DataLoader(dataset_test, 
                                    sampler=SequentialSampler(dataset_test), 
                                    batch_size=batch_size)
    val_loss, predictions, true_vals = evaluate(dataloader_test)
    _, predicted = torch.max(torch.tensor(predictions), 1)
    f1_score_pos, f1_score_neg, f1_score, accuracy_table = calculate_f1(predicted, true_vals)
    print(f1_score)
    print(f1_score_pos)
    print(f1_score_neg)

During training one batch, the loss of the output is: tensor(1.3755, device='cuda:0')
Loss.item: 1.3754916191101074
During training one batch, the logits of the output is: tensor([[ 5.0085, -2.4256, -2.6018],
        [-1.9738,  5.4768, -3.5031],
        [-2.0593,  5.5874, -3.4753],
        [-3.4115,  5.1426, -1.9078],
        [-0.6394, -1.4386,  1.9912],
        [-1.9394,  4.8763, -3.2493],
        [-1.9062, -2.4988,  4.8202],
        [ 0.9728, -0.0138, -1.2189],
        [-2.5847,  5.8612, -3.2667],
        [-1.5838,  4.9368, -3.4926]], device='cuda:0')
During training one batch, the loss of the output is: tensor(0.6349, device='cuda:0')
Loss.item: 0.6349061131477356
During training one batch, the logits of the output is: tensor([[ 4.7010, -2.7121, -2.3938],
        [-3.7098,  2.4736,  1.5269],
        [-2.7467, -0.9309,  4.1108],
        [-1.9075,  5.3352, -3.3383],
        [-2.3440, -2.5689,  5.0318],
        [-1.2589,  3.0836, -2.3523],
        [-1.7379,  5.4016, -3.7258],
        [

## Run the training and testing steps

In [None]:
data = read_airline_data("Tweets_airlines.csv")
split_data(data)
dataset_train, dataset_val, dataset_test = prepare_dataset()
train(dataset_train, dataset_val)
test(model_path = 'second_finetuned_BERT_epoch_5.model')