### HuggingFace | BERT | PyTorch
- [Code Source](https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/)

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Plot theme: seaborn whitegrid with enlarged fonts, then swap in a custom
# six-colour palette and make 12x8 the default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = (12, 8)

# Seed NumPy and PyTorch from one shared constant; RANDOM_SEED is also reused
# for the train/validation/test splits later in the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f' Computations will utilize: {device}')
print('#####  Inital Program Setup is Complete  #####')

In [None]:
# Import data
# The CSV is read from a hard-coded absolute Windows path (raw string, so the
# backslashes are not treated as escapes); adjust it when running elsewhere.
df = pd.read_csv(r'C:\Data Sciences\Data\all_data_files\reviews.csv')
print(f' Dataframe Shape: {df.shape}')
# Print the column summary, then leave `head()` as the cell's last expression
# so the notebook displays the first rows.
df.info()
df.head()

In [None]:
# Distribution of the `thumbsUpCount` column.
# NOTE(review): `distplot` is reported as deprecated in newer seaborn releases
# (histplot/displot are the suggested replacements) - confirm against the
# installed version before upgrading.
sns.distplot(df['thumbsUpCount'])

In [None]:
def thumbs_up(thumbs):
    """Bucket a thumbs-up count into one of five labelled ranges.

    The value is converted with ``int()`` first. Counts of 1 or less map to
    ``'0-1'``, then ``'2-3'``, ``'4-5'``, ``'6-7'``, and everything from 8
    upwards maps to ``'Over-8'``.
    """
    count = int(thumbs)

    # Inclusive upper bounds in ascending order; the first bound that the
    # count does not exceed decides the label.
    buckets = ((1, '0-1'), (3, '2-3'), (5, '4-5'), (7, '6-7'))
    for upper_bound, label in buckets:
        if count <= upper_bound:
            return label

    return 'Over-8'

# Derive the bucket label for every row from its thumbs-up count.
df['thumbs_up_grouping'] = df['thumbsUpCount'].apply(thumbs_up)

In [None]:
# Row counts per thumbs-up bucket, drawn in ascending bucket order.
order = ['0-1','2-3','4-5','6-7','Over-8']
# Pass the column as `x=` rather than positionally: newer seaborn releases
# treat the first positional argument as `data`, and older ones warn about
# positional use, so the keyword form is the one that behaves the same
# everywhere.
sns.countplot(x=df['thumbs_up_grouping'], order=order)

In [None]:
# Review score per thumbs-up bucket, split by the `sortOrder` column and
# coloured with a two-colour palette.
sns.boxplot(
    data=df,
    x="thumbs_up_grouping",
    y="score",
    hue="sortOrder",
    palette=["m", "g"],
)
# Offset and trim the axis spines.
sns.despine(offset=10, trim=True)

In [None]:
# Review distribution of review score:
# The column is passed as `x=` because newer seaborn releases interpret the
# first positional argument as `data` (and older ones warn on positional use).
sns.countplot(x=df.score)
plt.title('Count All Review Scores')
plt.ylabel('Count of Reviews')
plt.xlabel('Review Score')
# Written to the current working directory.
plt.savefig('Pre-Processed Review Scores.png')

In [None]:
# Re-assign review scores into 0 (negative), 1 (neutral), and 2 (positive):
def to_sentiment(rating):
    """Collapse a review score into a three-way sentiment class.

    The score is converted with ``int()``; 2 or lower yields 0, exactly 3
    yields 1, and anything higher yields 2.
    """
    stars = int(rating)
    if stars == 3:
        return 1
    return 0 if stars <= 2 else 2

# Add the three-way sentiment label for every review.
df['sentiment'] = df.score.apply(to_sentiment)
# Index in this list == sentiment id produced by to_sentiment.
class_names = ['negative', 'neutral', 'positive']
# `x=` keyword: newer seaborn releases treat the first positional argument as
# `data`, and older ones warn on positional use.
ax = sns.countplot(x=df.sentiment)
plt.title('Count All Sentiment Scores')
plt.ylabel('Count of Reviews')
plt.xlabel('Review Sentiment')
# Relabel the numeric ticks with class names; assumes all three classes occur
# in the data so that the ticks line up with `class_names` - TODO confirm.
ax.set_xticklabels(class_names)
plt.savefig('Post-Processed Sentiment Scores.png')

In [None]:
# Constant variables:
# Name of the pretrained checkpoint; used here for the tokenizer and again
# when the BERT model itself is loaded.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Sample sentence used by the following cells to demonstrate tokenization.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [None]:
# Split the sample sentence into tokens, map them to vocabulary ids, and print
# the three representations with aligned labels.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
# Encode the sample sentence into model inputs with a requested length of 32;
# `encoding` is reused later as the input for the BERT forward pass.
# NOTE(review): `pad_to_max_length` is reported as deprecated in newer
# transformers releases (replaced by `padding='max_length'` plus
# `truncation=True`) - confirm against the installed version before changing.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
# Last expression of the cell: shows which fields the encoding contains.
encoding.keys()

In [None]:
# Print how many ids the first (only) encoded sequence holds, then display
# the ids themselves as the cell's last expression.
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
# Print the length of the first attention mask, then display the whole mask
# tensor as the cell's last expression.
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
# Map the encoded ids back to token strings to inspect what was produced.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
# Number of token ids each review encodes to (encoded with max_length=512).
# Built as a comprehension so the cell no longer overwrites the global
# `tokens` from the sample-sentence cell, which would otherwise stop matching
# `token_ids`.
token_lens = [
    len(tokenizer.encode(txt, max_length=512))
    for txt in df.content
]

In [None]:
# Distribution of per-review token counts, with the x-axis limited to 0-512
# (the max_length used when the counts were computed).
# NOTE(review): `distplot` is reported as deprecated in newer seaborn
# releases - confirm against the installed version before upgrading.
sns.distplot(token_lens)
plt.title('Token Count Distribution')
plt.xlim([0, 512]);
plt.xlabel('Token count')

In [None]:
# Sequence length handed to the data loaders (and from there to the
# tokenizer as `max_length`). Presumably chosen from the token-count
# distribution plotted above - confirm.
MAX_LEN = 160

In [None]:
class GPReviewDataset(Dataset):
    """Dataset that pairs each review text with its target label.

    Every item is tokenized on access through ``tokenizer.encode_plus``, so
    the tokenizer must provide that method and return a mapping containing
    ``'input_ids'`` and ``'attention_mask'`` tensors.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        """Store the indexable reviews/targets, the tokenizer and the length."""
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return one sample as a dict.

        Keys: ``'review_text'`` (the review coerced to ``str``),
        ``'input_ids'`` and ``'attention_mask'`` (flattened tensors taken
        from the tokenizer output) and ``'targets'`` (the label as a
        ``torch.long`` tensor).
        """
        text = str(self.reviews[item])
        label = self.targets[item]

        # Options forwarded verbatim to the tokenizer for this review.
        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'pad_to_max_length': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        # Flatten so each sample contributes one-dimensional tensors.
        input_ids = encoded['input_ids'].flatten()
        attention_mask = encoded['attention_mask'].flatten()

        return {
            'review_text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'targets': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Hold out 10% of the rows, then halve that hold-out into validation and test
# sets (5% each of the full data). Both splits reuse RANDOM_SEED.
df_train, df_test = train_test_split(df, test_size = 0.1, random_state=RANDOM_SEED)

# `df_test` is deliberately reused: it is the 10% hold-out on input and the
# final test half on output.
df_val, df_test = train_test_split(df_test, test_size = 0.5, random_state=RANDOM_SEED)

In [None]:
# Report the (rows, columns) shape of each split.
print('### Shape Describtion ###')
print(f'      Train Shape: {df_train.shape}')
print(f'       Test Shape: {df_test.shape}')
print(f' Validation Shape: {df_val.shape}')

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=4, shuffle=False):
    """Wrap a dataframe of reviews in a batched ``DataLoader``.

    Args:
        df: DataFrame with a ``content`` column (review text) and a
            ``sentiment`` column (class labels).
        tokenizer: Tokenizer forwarded to ``GPReviewDataset``.
        max_len: Sequence length forwarded to ``GPReviewDataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes. Defaults to 4, the
            value that used to be hard-coded. Pass 0 to load in the main
            process.
            NOTE(review): worker processes are a known source of hangs or
            pickling errors for notebook-defined datasets on Windows -
            confirm against the PyTorch DataLoader docs and use 0 there.
        shuffle: Whether the loader reshuffles the data each epoch.
            Defaults to ``False``, matching the previous behaviour.

    Returns:
        A ``DataLoader`` over a ``GPReviewDataset`` built from ``df``.
    """
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
  
# One loader per split, all sharing the same tokenizer, sequence length and
# batch size.
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Pull the first batch from the training loader and display its keys.
data = next(iter(train_data_loader))
data.keys()

In [None]:
# Print the tensor shapes of the sampled batch.
# Presumably (BATCH_SIZE, MAX_LEN) for the first two and (BATCH_SIZE,) for
# the targets - confirm from the output.
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

In [None]:
# Load the pretrained BERT weights for the same checkpoint name that was used
# for the tokenizer.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Run the encoded sample sentence through BERT.
outputs = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)
# Index the result instead of tuple-unpacking it: when the model returns a
# dict-like output object, unpacking iterates over its field names and yields
# strings, whereas positional indexing works for both a plain tuple and that
# output object.
last_hidden_state, pooled_output = outputs[0], outputs[1]