# Model 4 - BERT + CNN (web_text included)

CNN can also be used for sentiment analysis. Embedded text also has two dimensions: embedded dimen& text length, very similar to 2d images.

And in the context of two text sourses, I think of them as 2d images with 2 channels: one channel representing the text, and the other representing the web_text.

## 1) Train and Test Split

In [1]:
import numpy as np
import pandas as pd

In [2]:
raw = pd.read_csv('interview_case_v4.csv')

In [3]:
df = raw.copy()
df['intact_name'] = df['intact_name'].str.rstrip('.') # remove the periord from the end of string
df = df.fillna('')
df['text'] = df['intact_name'].astype(str)+'. ' +df['SIC8_DESCRIPTION'].astype(str)+'. ' +df['4_Square_Description'].astype(str)
df1 = df[['text','web_text','target_for_prediction']]
df1 = df1.rename(columns={'target_for_prediction':'label'})

In [4]:
df1

Unnamed: 0,text,web_text,label
0,218685 Ontario Inc o/a Swagat Banquet Hall. ba...,"WE'RE MAJESTIC, REGAL, STYLISH& EXPERTS IN ALL...",Restaurant
1,Restaurant Pushap Sucrerie. eating places. sna...,,Restaurant
2,Transport Galf Inc. .,,Trucking & Hauling Service
3,On The Go Courier. . specialized freight (exce...,,Trucking & Hauling Service
4,"1484726 Alberta Ltd. local trucking, without s...",,Trucking & Hauling Service
...,...,...,...
1558,Asdin Hospitality Ltd. o/a Best Western Plus F...,,Hotel Accomodation
1559,Casa Moda Fine Furnishing Inc. .,780-784-0638info@splendidfurnishings.caABOUT U...,Trucking & Hauling Service
1560,Jia De Trinh o/a Oakridge Dragon Restaurant Lt...,,Restaurant
1561,2000650 Ontario Inc. o/a Golden Bell Thai Rest...,Home Page Menu Lunch Specials Dinner Specials ...,Restaurant


In [5]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(df1, test_size=0.1, random_state=25, stratify = df1.label)
train_data, valid_data = train_test_split(training_data, test_size=0.1, random_state=25, stratify = training_data.label)

In [6]:
print(f'training data size: {train_data.shape[0]}')
print(f'validation data size: {valid_data.shape[0]}')
print(f'testing data size: {test_data.shape[0]}')

training data size: 1265
validation data size: 141
testing data size: 157


In [7]:
# save three datasets, for later torchtext use
train_data.to_csv('./data2/train.csv',index=False)
valid_data.to_csv('./data2/valid.csv',index=False)
test_data.to_csv('./data2/test.csv',index=False)

## 2) Prepare Data & Iterator

In [8]:
import torch

import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

In [11]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [12]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2] # we have to add two tokens: at the beginning and end of the text
    return tokens

In [13]:
from torchtext.legacy import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)

In [14]:
fields = [('text', TEXT), ('web_text', TEXT), ('label', LABEL)]

In [15]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'data2',
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True
)

In [16]:
print(vars(train_data[0]))

{'text': [22431, 19961, 2620, 2620, 4561, 4297, 1012, 1004, 23968, 1005, 1055, 10733, 1012, 10733, 7884, 1012], 'web_text': [], 'label': 'Restaurant'}


In [17]:
LABEL.build_vocab(train_data)
print(LABEL.vocab.stoi)

defaultdict(None, {'Restaurant': 0, 'Trucking & Hauling Service': 1, 'Hotel Accomodation': 2})


In [18]:
BATCH_SIZE = 16 # consider the samll dataset and limited computational resources, I set a small batch size

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort = False,
    batch_size = BATCH_SIZE, 
    device = device)

## 3) Build the Model

In [19]:
from transformers import BertTokenizer, BertModel

In [20]:
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

class BERTCNNSentiment(nn.Module):
    def __init__(self, bert, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 2, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.pad_idx = pad_idx
        
    def forward(self, text, web_text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            text_embedded = self.bert(text)[0]  # freeze the bert para
            
        #text_embedded = [batch size, sent len, emb dim]
        
        text_embedded_pad = F.pad(text_embedded, (0, 0, 0, max_input_length-len(text_embedded)),"constant", 0)
                
        #text_embedded_pad = [batch size, 512, emb dim]
        
        #web_text = [batch size, sent len]
        
        with torch.no_grad():
            web_text_embedded = self.bert(web_text)[0]  # freeze the bert para
        
        #web_text_embedded = [batch size, sent len, emb dim]
        
        web_text_embedded_pad = F.pad(text_embedded, (0, 0, 0, max_input_length-len(text_embedded)),"constant", 0)
        
        #web_text_embedded_pad = [batch size, 512, emb dim]
        
        embedded = torch.stack((text_embedded_pad, web_text_embedded_pad), dim=1) # stack two embedded, now channel = 2
        
        #embedded = [batch size, 2, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [22]:
N_FILTERS = 100
FILTER_SIZES = [2,3]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.1
PAD_IDX = tokenizer.convert_tokens_to_ids(pad_token)

model4 = BERTCNNSentiment(bert, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [23]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model4):,} trainable parameters')

The model has 110,251,043 trainable parameters


In [24]:
# too many parameters to train, I will freeze the bert para, due to the limited sources
for name, param in model4.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model4):,} trainable parameters')

The model has 768,803 trainable parameters


In [26]:
for name, param in model4.named_parameters():                
    if param.requires_grad:
        print(name)

convs.0.weight
convs.0.bias
convs.1.weight
convs.1.bias
fc.weight
fc.bias


## 4) Train the Model

In [27]:
import sklearn.utils.class_weight as class_weight

In [28]:
train_df = pd.read_csv('./data2/train.csv')
train_Y = train_df.label
train_Y = train_Y.apply(lambda x: 0 if x=='Restaurant' else 1 if x=='Trucking & Hauling Service' else 2) # according to the LABEL.vocab 

In [29]:
class_weights=class_weight.compute_class_weight('balanced',np.unique(train_Y),train_Y.to_numpy())
class_weights=torch.tensor(class_weights,dtype=torch.float)
class_weights



tensor([0.5541, 0.9945, 5.2708])

In [30]:
import torch.optim as optim

optimizer = optim.Adam(model4.parameters())

criterion = nn.CrossEntropyLoss(weight=class_weights) # to deal with the imbalanced dataset

model4 = model4.to(device)
criterion = criterion.to(device)

In [31]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [32]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text, batch.web_text)
        
        loss = criterion(predictions, batch.label.long())
        
        acc = categorical_accuracy(predictions, batch.label.long())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text, batch.web_text)
            
            loss = criterion(predictions, batch.label.long())
            
            acc = categorical_accuracy(predictions, batch.label.long())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [34]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model4, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model4, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model4.state_dict(), 'model4.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 24m 20s
	Train Loss: 0.629 | Train Acc: 78.36%
	 Val. Loss: 0.617 |  Val. Acc: 83.55%
Epoch: 02 | Epoch Time: 24m 4s
	Train Loss: 0.314 | Train Acc: 88.98%
	 Val. Loss: 0.459 |  Val. Acc: 83.01%
Epoch: 03 | Epoch Time: 24m 2s
	Train Loss: 0.281 | Train Acc: 89.45%
	 Val. Loss: 0.583 |  Val. Acc: 87.18%
Epoch: 04 | Epoch Time: 22m 57s
	Train Loss: 0.276 | Train Acc: 89.77%
	 Val. Loss: 0.495 |  Val. Acc: 90.12%
Epoch: 05 | Epoch Time: 23m 17s
	Train Loss: 0.186 | Train Acc: 93.59%
	 Val. Loss: 0.513 |  Val. Acc: 86.65%
