In [1]:
!pip install transformers==2.8.0

Collecting transformers==2.8.0
  Downloading transformers-2.8.0-py3-none-any.whl (563 kB)
[?25l[K     |▋                               | 10 kB 37.2 MB/s eta 0:00:01[K     |█▏                              | 20 kB 37.1 MB/s eta 0:00:01[K     |█▊                              | 30 kB 20.2 MB/s eta 0:00:01[K     |██▎                             | 40 kB 16.7 MB/s eta 0:00:01[K     |███                             | 51 kB 8.7 MB/s eta 0:00:01[K     |███▌                            | 61 kB 8.7 MB/s eta 0:00:01[K     |████                            | 71 kB 9.0 MB/s eta 0:00:01[K     |████▋                           | 81 kB 9.1 MB/s eta 0:00:01[K     |█████▎                          | 92 kB 7.6 MB/s eta 0:00:01[K     |█████▉                          | 102 kB 8.3 MB/s eta 0:00:01[K     |██████▍                         | 112 kB 8.3 MB/s eta 0:00:01[K     |███████                         | 122 kB 8.3 MB/s eta 0:00:01[K     |███████▋                        | 133 kB 8.3 MB

In [2]:
import transformers
from transformers import AutoModel,AutoTokenizer,AdamW,get_linear_schedule_with_warmup
import torch
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pickle
import warnings 
warnings.filterwarnings('ignore')

In [3]:
class DataPreprocessing():
    
    def __init__(self,trainpath,testpath,picklepath,test_size=0.25,random_state=42):
        self.train=trainpath
        self.test=testpath
        self.testsize=test_size
        self.random_state=random_state
        self.path=picklepath
        self.get_train_test_valid()
    
        
    def read_dataframe(self,path):
        dataframe = pd.read_csv(path,header=None)
        dataframe.columns = ['Intent','Utterances']
        return dataframe
    
    def save_file(self,model,filename):
        filepath=os.path.join(self.path,filename)
        with open(filepath,'wb') as f:
            pickle.dump(model,f)
    
    def get_train_test_valid(self):
        self.traindf = self.read_dataframe(self.train)
        testdf = self.read_dataframe(self.test)
        self.unique = self.traindf['Intent'].unique()
        self.label2id = {intent:counter for counter,intent in enumerate(self.unique)}
        self.idx2label = {j:i for i,j in self.label2id.items()}
        self.traindf['classes'] = self.traindf['Intent'].apply(lambda x :self.label2id[x])
        testdf['classes'] = testdf['Intent'].apply(lambda x :self.label2id[x])
        self.valid_df, self.test_df = train_test_split(testdf,test_size=self.testsize,random_state=self.random_state)
        self.save_file(self.unique,'unique_classes.pickle')
        self.save_file(self.label2id,'label2id.pickle')
        self.save_file(self.idx2label,'idx2label.pickle')

In [4]:
class IntentDataset(Dataset):
    
    def __init__(self, utterances, classes, tokenizer, max_len):
        self.utterances = utterances
        self.classes = classes
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=False)
        self.max_len = max_len
        
    def __len__(self):
        return len(self.utterances)
    
    def __getitem__(self, item):
        utterance = str(self.utterances[item])
        class_ = self.classes[item]
        encoding = self.tokenizer.encode_plus(utterance,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            return_token_type_ids=False,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,
                                            return_tensors='pt'
                                        ) 
        return {'utterance': utterance,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'classes': torch.tensor(class_, dtype=torch.long)}

In [5]:
class IntentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super(IntentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(pooled_output)
        return self.out(output)

In [6]:
def create_data_loader(dataframe,tokenizer, max_len, batch_size):
    
    ds = IntentDataset(utterances=dataframe['Utterances'].to_numpy(),
                        classes=dataframe['classes'].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len 
                    )
    return DataLoader(ds, batch_size=batch_size, num_workers=0)

In [7]:
def train_epoch(model,train_data_loader,loss_fn,optimizer,device,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
                        
    for d in train_data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["classes"].to(device)
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
                        
    return correct_predictions.double() / n_examples, np.mean(losses) 

In [8]:
def eval_model(model, valid_data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in valid_data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["classes"].to(device)
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)

In [9]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
BATCH_SIZE=32
EPOCHS=5
MAX_LEN=48

train_path = 'sample_data/atis_intents_train.csv'
test_path = 'sample_data/atis_intents_test.csv'
picklepath = 'sample_data/'

In [10]:
IntentData = DataPreprocessing(trainpath=train_path, testpath=test_path, picklepath=picklepath)
class_names = IntentData.unique
train_df = IntentData.traindf
valid_df = IntentData.valid_df
test_df = IntentData.test_df
tokenizer = 'bert-base-cased'

In [11]:
train_data_loader = create_data_loader(train_df,tokenizer , MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(valid_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [12]:
model = IntentClassifier(len(class_names))
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch( model, train_data_loader, loss_fn, optimizer,
                                            device, scheduler, len(train_df) )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(valid_df) )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/5
----------
Train loss 0.29115730632206815 accuracy 0.9222176251551509
Val   loss 0.04309395701918555 accuracy 0.9916666666666667

Epoch 2/5
----------
Train loss 0.03360508523397393 accuracy 0.9942076954902771
Val   loss 0.04336520217015947 accuracy 0.9933333333333334

Epoch 3/5
----------
Train loss 0.018750110584445985 accuracy 0.9964832436905254
Val   loss 0.04809804356926562 accuracy 0.9933333333333334

Epoch 4/5
----------
Train loss 0.008058311142361307 accuracy 0.9981381878361605
Val   loss 0.05217839801978124 accuracy 0.9916666666666667

Epoch 5/5
----------
Train loss 0.003495628492505363 accuracy 0.9991725279271824
Val   loss 0.05314820188428521 accuracy 0.9916666666666667



In [14]:
test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device, len(test_df) )
print(f'Test loss {val_loss} accuracy {val_acc}')

Test loss 0.05314820188428521 accuracy 0.9916666666666667
