In [0]:
!pip install -qq transformers
!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
!gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA

[K     |████████████████████████████████| 675kB 2.8MB/s 
[K     |████████████████████████████████| 1.1MB 14.0MB/s 
[K     |████████████████████████████████| 890kB 17.0MB/s 
[K     |████████████████████████████████| 3.8MB 21.8MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
Downloading...
From: https://drive.google.com/uc?id=1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
To: /content/apps.csv
100% 134k/134k [00:00<00:00, 33.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
To: /content/reviews.csv
7.17MB [00:00, 109MB/s]
Downloading...
From: https://drive.google.com/uc?id=1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA
To: /content/best_model_state.bin
433MB [00:03, 113MB/s]


In [0]:

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [0]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [0]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…


 Sentence: When was I last outside? I am stuck at home for 2 weeks.
   Tokens: ['When', 'was', 'I', 'last', 'outside', '?', 'I', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.']
Token IDs: [1332, 1108, 146, 1314, 1796, 136, 146, 1821, 5342, 1120, 1313, 1111, 123, 2277, 119]


In [0]:
df = pd.read_csv("reviews.csv")
def to_sentiment(rating):
  rating = int(rating)
  if rating <= 2:
    return 0
  elif rating == 3:
    return 1
  else:
    return 2
df['sentiment'] = df.score.apply(to_sentiment)

In [0]:
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [0]:
token_lens = []
for txt in df.content:
  tokens = tokenizer.encode(txt, max_length=512)
  token_lens.append(len(tokens))
MAX_LEN = 160

In [0]:
class GPReviewDataset(Dataset):
  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.reviews)
  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [0]:
class GPSentenceDataset(Dataset):
  def __init__(self, sentences, tokenizer, max_len):
    self.sentences = sentences
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.sentences)
  def __getitem__(self, item):
    sentence = str(self.sentences[item])
    encoding = self.tokenizer.encode_plus(
      sentence,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'sentence_text': sentence,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten()
    }

In [0]:
df_train, df_test = train_test_split(
  df,
  test_size=0.1,
  random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
  df_test,
  test_size=0.5,
  random_state=RANDOM_SEED
)

In [0]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GPReviewDataset(
    reviews=df.content.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

In [0]:
def create_sentence_loader(df, tokenizer, max_len, batch_size):
  ds = GPSentenceDataset(
    sentences=df.content.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

In [0]:
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [0]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
last_hidden_state, pooled_output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [0]:
class SentimentClassifier(nn.Module):
  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)

In [0]:
class_names = ['negative', 'neutral', 'positive']
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [0]:
data = next(iter(train_data_loader))
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
F.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.4375, 0.2383, 0.3241],
        [0.2520, 0.4861, 0.2619],
        [0.4884, 0.2184, 0.2931],
        [0.2863, 0.3287, 0.3850],
        [0.5602, 0.2412, 0.1986],
        [0.3963, 0.2457, 0.3580],
        [0.4776, 0.2633, 0.2592],
        [0.3752, 0.4103, 0.2145],
        [0.3156, 0.3849, 0.2995],
        [0.3891, 0.4174, 0.1935],
        [0.3036, 0.3517, 0.3447],
        [0.3658, 0.4040, 0.2303],
        [0.4383, 0.2594, 0.3023],
        [0.2524, 0.3622, 0.3855],
        [0.4013, 0.1803, 0.4184],
        [0.3995, 0.2637, 0.3368]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [0]:
EPOCHS = 10
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [0]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()
  losses = []
  correct_predictions = 0
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double() / n_examples, np.mean(losses)

In [0]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())
  return correct_predictions.double() / n_examples, np.mean(losses)

In [0]:
print(device)

cuda:0


Epoch 1/10
----------


KeyboardInterrupt: ignored

In [0]:
!gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA
model = SentimentClassifier(len(class_names))
model.load_state_dict(torch.load('best_model_state.bin'))
model = model.to(device)

Downloading...
From: https://drive.google.com/uc?id=1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA
To: /content/best_model_state.bin
433MB [00:03, 111MB/s]


In [0]:
def get_predictions(model, data_loader):
  model = model.eval()
  sentence_texts = []
  predictions = []
  prediction_probs = []
  with torch.no_grad():
    for d in data_loader:
      texts = d["sentence_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      sentence_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  return sentence_texts, predictions, prediction_probs



In [0]:
from google.colab import files
uploaded = files.upload()

Saving inputs4.tsv to inputs4.tsv


In [0]:
sentence_df = pd.read_csv("inputs4.tsv", sep = "\t", header = None)
sentence_df = sentence_df.rename(columns = {0: "content"})
sentence_data_load = create_sentence_loader(sentence_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [0]:
y_sentence_texts, y_pred, y_pred_probs = get_predictions(
  model,
  sentence_data_load
)

In [0]:
y_sentence_texts


['you looking for a special kind of food?',
 'there are five choices of food you like to eat.',
 'you looking for a special treat?',
 'what kind of food would you like?',
 'sorry, i can not find a place to eat in this restaurant.',
 "no, i haven't found any restaurants downtown yet.",
 'sorry, no place to eat with these qualities.',
 "your search results don't match the results you have searched. would you like to refine the search results?",
 "i see graffiti, it's a moderately priced place to eat around the center of town serving european food.",
 'is there a particular part of the city you are looking for? if not, i recommend graffiti in the west.',
 "i see cambridge, it's the place to dine in, the place to dine in, the place to dine in, the place to dine in.",
 'sure, how about zizzi cambridge, offering cheap italian food in the heart of the city?',
 'sorry, i can not find a place to put the dish.',
 "unfortunately, i couldn't find any restaurants that serve that kind of food in thi

In [0]:
sentence_df['sentiment'] = y_pred
sentence_df['pos_probs'] = y_pred_probs[:,2]
sentence_df['pos_probs'].to_csv("pos_probs4.tsv", index = False)
sentence_df['sentiment'].to_csv("sentiment4.tsv", index = False)
files.download('pos_probs4.tsv')
files.download('sentiment4.tsv')

In [0]:
sentence_df

Unnamed: 0,0,content,sentiment,pos_probs
0,ESNG0494.json/1-synth,are you looking for a specific type of food?,1,-3.870654
1,ESNG0494.json/1,"there are 5 places, which type of food do you ...",2,2.373385
2,ESNG01172.json/1-synth,are you looking for a specific food?,1,-3.453883
3,ESNG01172.json/1,what type of food would you like?,0,-1.540136
4,ESNG01172.json/3-synth,"sorry, i cannot find any place to eat like that.",0,-4.389456
...,...,...,...,...
91,ESNG0475.json/1,yes i there are 5 restaurants that match what ...,1,-3.029067
92,ESNG0554.json/1-synth,"i am sorry, there are no restaurant like that....",0,-3.601641
93,ESNG0554.json/1,"i am sorry, there are no german restaurants in...",1,-4.227464
94,ESNG0554.json/2-synth,what part of town do you want?,0,-3.261457
