In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Single seed used for the data splits and for the numpy / torch random generators.
RANDOM_SEED = 42
# Seed the global generators as well, so that dropout and any other random draws
# are repeatable across "Restart & Run All".
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the reviews CSV; later cells read its 'ReviewStar' and 'ReviewBody' columns.
df = pd.read_csv('../input/amazonearphonesreviews/AllProductReviews.csv')
# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts, to spot missing values before modelling.
df.info()

In [None]:
def grouping_review(rating):
    """Collapse a star rating into one of three class ids.

    The rating is converted with ``int()`` first.

    Returns:
        0 when the rating is 2 or lower, 1 when it is exactly 3, 2 otherwise.
    """
    stars = int(rating)
    if stars <= 2:
        return 0
    return 1 if stars == 3 else 2

In [None]:
# Keep the original star ratings in a separate column the first time this cell runs.
# Re-running the cell then re-derives the labels from the raw stars instead of
# re-mapping already-mapped labels (which would turn 1 and 2 into 0).
if 'ReviewStarRaw' not in df.columns:
    df['ReviewStarRaw'] = df['ReviewStar']
# Replace the star rating with the 3-class label produced by grouping_review.
df['ReviewStar'] = df['ReviewStarRaw'].apply(grouping_review)
df.head()

In [None]:
# Class balance of the grouped labels. The Series is passed as `x=` (the documented
# keyword) rather than positionally; the trailing `;` hides the Axes repr.
sns.countplot(x=df['ReviewStar']);

In [None]:
# Checkpoint name shared by the tokenizer and by the BertModel.from_pretrained calls below.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"
# Tokenizer that is later handed to create_data_loader / ProductDataset.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
def count_word(x):
    """Return the number of whitespace-separated words in ``x``.

    ``x`` is passed through ``str()`` first, so a non-string cell (for example a
    float NaN) is counted instead of raising ``AttributeError``. ProductDataset
    stringifies reviews the same way.
    """
    return len(str(x).split())

# Per-review word count, stored next to the text; the helper is applied element-wise.
df['count'] = df['ReviewBody'].map(count_word)
df.head()

In [None]:
# Density of the per-review word counts computed above.
# NOTE(review): these are whitespace-separated words, not tokenizer tokens — presumably
# this plot is what motivated MAX_LEN below; confirm the margin is sufficient.
sns.kdeplot(df['count'])

In [None]:
# Sequence length handed to create_data_loader and, through ProductDataset, to the
# tokenizer as `max_length`.
MAX_LEN = 150

In [None]:
class ProductDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: indexable sequence of review texts; each item is passed through ``str()``.
        tokenizer: object exposing ``encode_plus`` (the BertTokenizer in this notebook).
        max_len: forwarded to the tokenizer as ``max_length``; sequences are padded
            and truncated to this length.
        targets: indexable sequence of integer class labels, aligned with ``reviews``.
    """

    def __init__(self, reviews, tokenizer, max_len, targets):
        self.reviews = reviews
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.targets = targets

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label for item ``idx``."""
        review = str(self.reviews[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        return {
            'review_txt': review,
            # The tokenizer output is flattened so each item is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
 df_train, df_test = train_test_split(
  df,
  test_size=0.3,
  random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
  df_test,
  test_size=0.5,
  random_state=RANDOM_SEED,
)

In [None]:
# Class balance inside the test split. The Series is passed as `x=` (the documented
# keyword) rather than positionally; the trailing `;` hides the Axes repr.
sns.countplot(x=df_test['ReviewStar']);

In [None]:
# (rows, columns) of the train / validation / test frames.
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers, shuffle=False):
    """Wrap a reviews frame in a ProductDataset and return a DataLoader over it.

    Args:
        df: frame with ``ReviewBody`` (text) and ``ReviewStar`` (label) columns.
        tokenizer: tokenizer forwarded to ProductDataset.
        max_len: sequence length forwarded to ProductDataset.
        batch_size: number of items per batch.
        num_workers: number of DataLoader worker processes.
        shuffle: forwarded to DataLoader. Defaults to False, which keeps the
            previous behaviour; pass True for a training loader so the batch
            order changes between epochs.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    ds = ProductDataset(
        reviews=df.ReviewBody.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
        targets=df.ReviewStar.to_numpy(),
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
    )

In [None]:
# Batch size shared by the train, validation and test loaders.
BATCH_SIZE = 16

In [None]:
# One loader per split; the splits differ only in their frame and worker count.
# NOTE(review): no shuffling is requested for the training loader here — confirm that
# a fixed batch order across epochs is intended.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE, num_workers=worker_count)
    for split_frame, worker_count in ((df_train, 4), (df_val, 1), (df_test, 1))
)

In [None]:
# Pull a single batch from the training loader to inspect its contents.
data = next(iter(train_data_loader))

In [None]:
# The keys come from ProductDataset.__getitem__:
# 'review_txt', 'input_ids', 'attention_mask', 'targets'.
data.keys()

In [None]:
# Stand-alone copy of the pretrained encoder.
# NOTE(review): nothing in the cells shown here reads `bert_model`; SentimentClassifier
# loads its own BertModel, so this looks like an extra copy held in memory — confirm
# whether it can be dropped.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Display names of the classes (despite the name, this is a list, not a count).
# Position i names label i from grouping_review: 0 = rating <= 2, 1 = rating == 3,
# 2 = any other rating. len(n_classes) is what gets passed to SentimentClassifier.
n_classes = ['negative','neutral','positive']

In [None]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: number of output scores produced by the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.2)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch of token ids and attention masks."""
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Take the second element by index instead of tuple-unpacking the result.
        # Recent transformers versions return a dict-like ModelOutput by default, and
        # unpacking that yields its string keys rather than tensors. Integer indexing
        # works for both the tuple and the ModelOutput form.
        pooled_output = bert_outputs[1]

        return self.out(self.drop(pooled_output))

In [None]:
# One output score per entry of the class-name list.
model = SentimentClassifier(len(n_classes))
# Move the parameters to the selected device (GPU when available).
model = model.to(device)

In [None]:
# Fine-tuning setup: epoch count, optimizer, learning-rate schedule and loss.
EPOCHS = 5
# NOTE: AdamW here is the class imported from `transformers` in the import cell,
# not `torch.optim.AdamW`.
optimizer = AdamW(model.parameters(),lr=2e-5,correct_bias=False)
# train_epoch steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS
# NOTE: the variable is spelled `schedular` (sic); the training loop passes it under
# this exact name, so rename both together if at all.
schedular = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=0,
                num_training_steps=total_steps
            )
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
from tqdm import tqdm

In [None]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    schedular,
    len_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: sized iterable of batches with 'input_ids', 'attention_mask'
            and 'targets' tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        schedular: learning-rate scheduler stepped once per batch.
        len_examples: number of examples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-dim float64 tensor on the CPU
        and mean_loss is the mean of the per-batch losses (NaN if there were no batches).
    """
    model = model.train()
    # Kept as a plain int so the returned accuracy never lives on the GPU
    # (a CUDA tensor cannot be plotted) and an empty loader does not crash.
    n_correct = 0
    losses = []

    for d in tqdm(data_loader, total=len(data_loader)):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        n_correct += torch.sum(preds == targets).item()
        losses.append(loss.item())

        loss.backward()
        # In-place variant; the name without the trailing underscore is deprecated.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        schedular.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return torch.tensor(n_correct / len_examples, dtype=torch.double), mean_loss


In [None]:
def eval_model(
    model,
    data_loader,
    loss_fn,
    device,
    len_examples):
    """Score ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: sized iterable of batches with 'input_ids', 'attention_mask'
            and 'targets' tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: device the batch tensors are moved to.
        len_examples: number of examples used as the accuracy denominator.

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is a 0-dim float64 tensor on the CPU
        and mean_loss is the mean of the per-batch losses (NaN if there were no batches).
    """
    model = model.eval()
    # Kept as a plain int so the returned accuracy never lives on the GPU
    # (a CUDA tensor cannot be plotted) and an empty loader does not crash.
    n_correct = 0
    losses = []

    with torch.no_grad():
        for d in tqdm(data_loader, total=len(data_loader)):
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            n_correct += torch.sum(preds == targets).item()
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float('nan')
    return torch.tensor(n_correct / len_examples, dtype=torch.double), mean_loss


In [None]:
%%time
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training split.
  train_acc, train_loss = train_epoch(
    model, train_data_loader, loss_fn, optimizer, device, schedular, len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the updated weights on the validation split.
  val_acc, val_loss = eval_model(
    model, val_data_loader, loss_fn, device, len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  epoch_metrics = {
    'train_acc': train_acc,
    'train_loss': train_loss,
    'val_acc': val_acc,
    'val_loss': val_loss,
  }
  for metric_name, metric_value in epoch_metrics.items():
    history[metric_name].append(metric_value)

  # Checkpoint the weights whenever validation accuracy improves.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
import matplotlib.pyplot as plt

In [None]:
# The accuracies stored in `history` are tensors that may live on the GPU, which
# matplotlib cannot convert; float() extracts a plain Python number in either case.
plt.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
plt.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
plt.plot(history['train_loss'], label='train loss')
plt.plot(history['val_loss'], label='validation loss')
plt.title('Training history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
# Cross-entropy is not bounded by 1, so only the lower limit is pinned; a fixed
# [0, 1] range would cut off any epoch whose loss is above 1.
plt.ylim(bottom=0);

In [None]:
# The training loop checkpoints the weights with the best validation accuracy to
# 'best_model_state.bin'. Restore them before scoring the test set, so the reported
# number belongs to the best epoch rather than to whichever epoch ran last.
if os.path.exists('best_model_state.bin'):
  model.load_state_dict(torch.load('best_model_state.bin', map_location=device))

test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

test_acc.item()

In [None]:
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect its predictions.

  Batch tensors are moved to the notebook-level ``device`` before the forward pass.

  Args:
      model: module called as ``model(input_ids=..., attention_mask=...)``.
      data_loader: iterable of batches with 'review_txt', 'input_ids',
          'attention_mask' and 'targets' entries.

  Returns:
      ``(texts, predictions, prediction_probs, real_values)``: the review texts as a
      list, the predicted class ids, the per-class probabilities (softmax of the
      model scores, each row summing to 1) and the true labels. The three tensors
      are returned on the CPU.
  """
  model = model.eval()

  review_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )

      _, preds = torch.max(outputs, dim=1)

      review_texts.extend(d['review_txt'])
      pred_batches.append(preds)
      # The model emits raw scores; normalise them so the returned values really
      # are probabilities, as the name promises.
      prob_batches.append(torch.softmax(outputs, dim=1))
      target_batches.append(targets)

  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  real_values = torch.cat(target_batches).cpu()
  return review_texts, predictions, prediction_probs, real_values


In [None]:
# Collect texts, predicted labels, per-class model outputs and true labels for the test split.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
# Per-class metrics on the test split; rows are labelled with the names in n_classes.
print(classification_report(y_test, y_pred, target_names = n_classes))

In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes.

  Note that inside this function the parameter name hides the
  ``confusion_matrix`` function imported from sklearn.
  """
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  # Same tick-label treatment for both axes, differing only in rotation angle.
  for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')
# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)
# Label rows and columns with the class names before plotting.
df_cm = pd.DataFrame(cm, index=n_classes, columns=n_classes)
show_confusion_matrix(df_cm)