In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# This notebook fine-tunes BERT to classify the AG News texts into 4 classes

In [None]:
import transformers 
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch 
import numpy as np 
import pandas as pd 
import seaborn as sns 
from pylab import rcParams 
import matplotlib.pyplot as plt 
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report 
from collections import defaultdict
from textwrap import wrap 


from torch import nn, optim 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Plot styling: seaborn theme, a custom colour palette and a default figure size
# that apply to every figure drawn later in the notebook.
sns.set(style='whitegrid', palette='muted', font_scale = 1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Seed NumPy and PyTorch; RANDOM_SEED is also reused for the train/validation split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
train = pd.read_csv('/kaggle/input/ag-news-classification-dataset/train.csv')
test = pd.read_csv('/kaggle/input/ag-news-classification-dataset/test.csv')
train.head()

In [None]:
#shape 
train.shape, test.shape

In [None]:
train.info()

In [None]:
test.head()

In [None]:
# The CSV stores "Class Index" as 1..4; map every original label k to the
# zero-based id k - 1.
labeling = {original: original - 1 for original in range(1, 5)}

In [None]:
# Replace the 1-based labels of both splits with their zero-based ids from `labeling`.
# NOTE: `labeling` has no key 0, so running this cell a second time raises KeyError
# once the column already holds the remapped values.
train['Class Index'] = train['Class Index'].apply(lambda x : labeling[x])
test['Class Index'] = test['Class Index'].apply(lambda x: labeling[x])


In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Class balance of the training split. The column is passed as `x=` explicitly
# because the first positional parameter of countplot is `data` in current seaborn
# releases, so a positional Series is not interpreted as the variable to count.
sns.countplot(x=train['Class Index'])

In [None]:
# DATA PREPROCESSING
# Display names for the four classes. Position i holds the name used for remapped
# label i, i.e. the original 1-based "Class Index" value as a string.
class_names = ['1', '2', '3', '4']

In [None]:
pre_trained_model_name = 'bert-base-cased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(pre_trained_model_name)

In [None]:
small_text = "This data is really really really huge but this time I will do it in detail"


In [None]:
# Manual two-step tokenization of the sample sentence: split it into tokens,
# then look up the vocabulary id of every token.
tokens = tokenizer.tokenize(small_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'Sentence: {small_text}')
print(f'Tokens : {tokens}')
print(f'Token-IDs : {token_ids}')

In [None]:
# SPECIAL TOKENS 

tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
# All of the steps above can be done with a single encode_plus() call: it tokenizes
# the text, adds the special tokens, pads to max_length and returns an attention mask.
# truncation=True is passed explicitly because padding='max_length' on its own never
# shortens a text that is longer than max_length.

encoding = tokenizer.encode_plus(
    small_text,
    max_length=32,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
encoding.keys()

In [None]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))

In [None]:
# Choosing Sequence Length 
token_lens = []
train['content'] = train['Title']+' '+train['Description']
del train['Title']
del train['Description']


In [None]:
# Encode every training text (truncated to at most 512 ids) and record the length
# of the encoded sequence.
# NOTE: this rebinds the notebook-level name `tokens` and appends to the existing
# `token_lens` list, so re-running only this cell doubles the collected lengths.
for txt in train.content:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

In [None]:
# Distribution of encoded lengths. sns.distplot is deprecated; a density-normalised
# histplot with a KDE overlay is its documented replacement for this kind of figure.
sns.histplot(token_lens, kde=True, stat="density")
plt.xlim([0, 256])
plt.xlabel('Token COunt')

In [None]:
MAX_LEN = 100

In [None]:
class AgNewsData(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        content: indexable collection of texts; each item is passed through ``str``.
        targets: indexable collection of labels, aligned with ``content``.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, content, targets, tokenizer, max_len):
        self.content, self.targets = content, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text.
        return len(self.content)

    def __getitem__(self, item):
        """Return the raw text, its encoded ids, attention mask and label tensor."""
        text = str(self.content[item])
        label = self.targets[item]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        sample = {'content_text': text}
        # The tokenizer is asked for 'pt' tensors; flatten them to 1-D per sample.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
df_train, df_val = train_test_split(train, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
df_train.shape, test.shape, df_val.shape

In [None]:
# Same title + description merge as for the training frame, applied to the test split.
# NOTE: the `del` statements make this cell fail if it is run a second time.
test['content'] = test['Title']+" "+test['Description']
del test['Title']
del test['Description']

In [None]:
df_test= test
del test

In [None]:
df_test.head()

In [None]:
# helper function to create dataloaders 

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a DataFrame in an ``AgNewsData`` dataset and return a DataLoader over it.

    Args:
        df: frame with a ``content`` column (texts) and a ``Class Index`` column (labels).
        tokenizer: tokenizer handed to ``AgNewsData``.
        max_len: padded / truncated sequence length handed to ``AgNewsData``.
        batch_size: number of samples per batch.
        shuffle: reshuffle the samples every epoch. Defaults to False, which keeps
            the frame's row order.
        num_workers: number of loader worker processes. Defaults to 4.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dicts built by ``AgNewsData``.
    """
    ds = AgNewsData(
        content=df.content.to_numpy(),
        targets=df['Class Index'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [None]:
# Number of samples per batch for all three loaders.
BATCH_SIZE = 16

# One loader per split; all share the same tokenizer, MAX_LEN and batch size.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

In [None]:
# Load the bare pre-trained BERT encoder and run it once on the sample `encoding`
# built earlier, so the following cells can inspect the structure of its output.
bert_model = BertModel.from_pretrained(pre_trained_model_name)

o=bert_model(
    input_ids = encoding['input_ids'],
    attention_mask = encoding['attention_mask']
)

In [None]:
o.keys()

In [None]:
o.get('last_hidden_state').shape


In [None]:
o.get('pooler_output').shape


In [None]:
bert_model.config.hidden_size

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer that emits class logits.

    The encoder is loaded from the notebook-level ``pre_trained_model_name``
    checkpoint.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        encoder = BertModel.from_pretrained(pre_trained_model_name)
        self.bert = encoder
        self.drop = nn.Dropout(p=0.45)
        # The head maps the encoder's hidden size to one logit per class.
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch of encoded texts."""
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = bert_outputs.get('pooler_output')
        return self.out(self.drop(pooled))

In [None]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)

In [None]:
F.softmax(model(input_ids, attention_mask), dim=1)

In [None]:
# TRAINING 

# Number of full passes over the training loader.
EPOCHS = 1 
# AdamW here is the class imported from `transformers` at the top of the notebook.
# NOTE(review): that class is reportedly deprecated upstream in favour of
# torch.optim.AdamW - confirm it still exists in the installed transformers version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps = total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores per sample.
        data_loader: iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to before the forward pass.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a 0-d double tensor on the CPU
        (so it can be plotted directly), ``mean_loss`` is the mean of the per-batch
        losses, or NaN when the loader yields no batches.
    """
    model = model.train()

    losses = []
    # Plain Python int so an empty loader still produces a valid result.
    correct_predictions = 0

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # The predicted class is the index of the highest score in each row.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets).item()
        losses.append(loss.item())

        loss.backward()

        # In-place variant; the un-suffixed clip_grad_norm is deprecated.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
    mean_loss = np.mean(losses) if losses else np.nan
    return accuracy, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one row of class scores per sample.
        data_loader: iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'targets'``.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss tensor.
        device: device every batch tensor is moved to before the forward pass.
        n_examples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a 0-d double tensor on the CPU
        (so it can be plotted directly), ``mean_loss`` is the mean of the per-batch
        losses, or NaN when the loader yields no batches.
    """
    model = model.eval()

    losses = []
    # Plain Python int so an empty loader still produces a valid result.
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # The predicted class is the index of the highest score in each row.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets).item()
            losses.append(loss.item())

    accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
    mean_loss = np.mean(losses) if losses else np.nan
    return accuracy, mean_loss
        

In [None]:
%%time 
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc' and 'val_loss'.
history = defaultdict(list)
# Best validation accuracy seen so far; decides when a checkpoint is written.
best_accuracy = 0

for epoch in range(EPOCHS):
    
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-'*10)
    
    
    # One optimisation pass over the training loader.
    train_acc, train_loss = train_epoch(
         model, 
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    
    print(f'Train loss {train_loss} accuracy {train_acc}')
    
    # Score the updated weights on the held-out validation split.
    val_acc, val_loss = eval_model(
        model, 
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    
    print(f'Val loss {val_loss} val accuracy {val_acc}')
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    # Save the weights only when validation accuracy improves on the best so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc


In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

## EVALUATION

In [None]:
# Accuracy on the test split; the mean loss returned alongside it is discarded.
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Unwrap the 0-d tensor into a plain Python number for display.
test_acc.item()

In [None]:
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect texts, predictions and labels.

  Batches are moved to the device that holds the model's parameters (CPU when the
  model has none), so the function does not depend on a notebook-level ``device``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns one row of class scores per sample.
    data_loader: iterable of dict batches with the keys ``'content_text'``,
      ``'input_ids'``, ``'attention_mask'`` and ``'targets'``.

  Returns:
    ``(content_texts, predictions, prediction_probs, real_values)``: a list of the
    raw texts, then three CPU tensors - predicted class per sample, softmax
    probabilities per sample, and the true labels. The tensors are empty when the
    loader yields no batches.
  """
  model = model.eval()

  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  content_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(run_device)
      attention_mask = d["attention_mask"].to(run_device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      probs = F.softmax(outputs, dim=1)

      content_texts.extend(d["content_text"])
      # Keep whole batches and concatenate once at the end instead of growing
      # Python lists one tensor row at a time.
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      target_batches.append(d["targets"].cpu())

  if not pred_batches:
    # torch.cat / torch.stack reject an empty list, so return empty tensors.
    empty_labels = torch.empty(0, dtype=torch.long)
    return content_texts, empty_labels, torch.empty(0, 0), empty_labels.clone()

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(target_batches)
  return content_texts, predictions, prediction_probs, real_values

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
print(classification_report(y_test, y_pred, target_names=class_names))


In [None]:

def show_confusion_matrix(confusion_matrix):
  """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes.

  Note: inside this function the parameter name shadows the
  ``confusion_matrix`` function imported from sklearn.
  """
  heat = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  # Re-apply the tick labels: horizontal on the y axis, tilted 30 degrees on x.
  for axis, angle in ((heat.yaxis, 0), (heat.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')

# Confusion matrix of the test predictions, wrapped in a DataFrame so that the
# heatmap rows and columns are labelled with class_names.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
idx = 2

content_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx]
})

In [None]:
print("\n".join(wrap(content_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

In [None]:
sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

# PREDICTING WITH RAW TEXT 

In [None]:
content_text = 'Woah what a match! .. I love the way they played but my team failed'

In [None]:
# Encode the raw text with the same settings AgNewsData uses for the dataset:
# the shared MAX_LEN instead of a hard-coded 100, and explicit truncation so an
# over-long text cannot exceed the padded length.
encoded_review = tokenizer.encode_plus(
    content_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)


In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: put the model in eval mode explicitly (the classifier contains a
# dropout layer) rather than relying on an earlier cell having done so, and skip
# gradient tracking.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {content_text}')
print(f'Sentiment  : {class_names[prediction]}')