<a href="https://colab.research.google.com/github/seonmia/Yelp_comments_Sentimental_Classfication/blob/main/bert_base_uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameters

In [None]:
# Pretrained checkpoint to fine-tune; text is lower-cased unless the name lacks 'uncased'.
used_model = 'bert-base-uncased'
cased = 'uncased' not in used_model

# Batch sizes for the train / validation / test loaders.
train_batch_size = 32
eval_batch_size = 32
test_batch_size = 32

# Optimisation settings.
learning_rate = 5e-5
train_epoch = 3
weight_decay = 0.001

# Weights & Biases destination.
wandb_project = "final_project1"  # project name used on W&B
wandb_team = "goorm-project-nlp-team-1"  # W&B team (entity) name

# Import requirements

In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

In [None]:
!pip install wandb

Installing collected packages: smmap, gitdb, shortuuid, setproctitle, sentry-sdk, pathtools, GitPython, docker-pycreds, wandb
Successfully installed GitPython-3.1.27 docker-pycreds-0.4.0 gitdb-4.0.9 pathtools-0.1.2 sentry-sdk-1.7.2 setproctitle-1.2.3 shortuuid-1.0.9 smmap-5.0.0 wandb-0.12.21


In [None]:
import os
import pdb
import argparse
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict
import wandb
from time import time

import torch
from torch.nn.utils.rnn import pad_sequence

import numpy as np
from tqdm import tqdm, trange
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    AutoConfig,
    AdamW,
)

In [None]:
# Authenticate this session with Weights & Biases before any run is started.
wandb.login() 

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%cd /content/drive/MyDrive/GoormProject1/goorm-project-1-text-classification

/content/drive/.shortcut-targets-by-id/1ovgSHdL_LDsDV-KWBQ2NNEs2v8Mpi0fm/GoormProject1/goorm-project-1-text-classification


In [None]:
!ls

pytorch_model.bin  sentiment.train.0	  submission.csv
sentiment.dev.0    sentiment.train.1	  test_no_label.csv
sentiment.dev.1    submission_albert.csv  wandb


# 1. Preprocess

In [None]:
def make_id_file(task, tokenizer, cased, data_dir='.'):
    """Encode the four sentiment train/dev files into strings of token ids.

    Args:
        task: Unused; kept so existing call sites keep working.
        tokenizer: Object exposing ``encode(text)`` that returns an iterable
            of token ids.
        cased: When false, each line is lower-cased before it is encoded.
        data_dir: Directory holding ``sentiment.train.{0,1}`` and
            ``sentiment.dev.{0,1}``. Defaults to the current working
            directory, which is where the files were always read from.

    Returns:
        ``(train_pos, train_neg, dev_pos, dev_neg)``; each is a list with one
        space-separated string of token ids per line of the matching file.
    """
    def make_data_strings(file_name, cased):
        # Stream the file line by line rather than loading it with readlines()
        # and building a second, intermediate list of encoded ids.
        path = os.path.join(data_dir, file_name)
        with open(path, 'r', encoding='utf-8') as f:
            return [
                ' '.join(str(token_id)
                         for token_id in tokenizer.encode(line if cased else line.lower()))
                for line in f
            ]

    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1', cased)
    train_neg = make_data_strings('sentiment.train.0', cased)
    dev_pos = make_data_strings('sentiment.dev.1', cased)
    dev_neg = make_data_strings('sentiment.dev.0', cased)

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

In [None]:
# Load the tokenizer that belongs to the checkpoint named in used_model.
tokenizer = BertTokenizer.from_pretrained(used_model)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Encode the four train/dev files; the 'yelp' task argument is not read by make_id_file.
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer, cased)

it will take some times...
make id file finished!


In [None]:
class SentimentDataset(object):
    """Labelled dataset built from space-separated token-id strings.

    Positive sentences are stored first with label 1, followed by negative
    sentences with label 0. Indexing returns ``(token_ids, label)`` as two
    numpy arrays; the label array holds a single element.
    """

    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.data = []
        self.label = []

        # Walk positives, then negatives, tagging each with its class id.
        for sentences, target in ((pos, 1), (neg, 0)):
            for sentence in sentences:
                self.data.append(self._cast_to_int(sentence.strip().split()))
                self.label.append([target])

    def _cast_to_int(self, sample):
        """Convert a sequence of numeric strings to a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        token_ids = self.data[index]
        target = self.label[index]
        return np.array(token_ids), np.array(target)

In [None]:
# Wrap the encoded strings as datasets; positives get label 1 and negatives label 0.
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
dev_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)

In [None]:
# Print the first 11 (token_ids, label) pairs as a quick sanity check of the encoding.
preview_count = 11
for sample_index in range(min(preview_count, len(train_dataset))):
    print(train_dataset[sample_index])

In [None]:
def collate_fn_style(samples):
    """Collate ``(token_ids, label)`` pairs into padded batch tensors.

    Args:
        samples: Sequence of ``(token_ids, label)`` pairs, as produced by
            indexing the training/validation dataset.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, position_ids, labels)``.
        The first four share the shape ``(batch, longest_sequence)``.
        ``input_ids`` is right-padded with 0, ``attention_mask`` is 1 on real
        tokens and 0 on padding, ``token_type_ids`` is all zeros and
        ``position_ids`` counts 0..longest-1 on every row. ``labels`` stacks
        the per-sample label arrays along a new leading axis.
    """
    sequences, targets = zip(*samples)
    lengths = [len(sequence) for sequence in sequences]
    longest = max(lengths)

    attention_mask = torch.tensor(
        [[1] * length + [0] * (longest - length) for length in lengths]
    )
    padded_ids = pad_sequence(
        [torch.tensor(sequence) for sequence in sequences],
        batch_first=True,
    )

    # Every row spans the padded width, so these two depend only on `longest`.
    token_type_ids = torch.tensor([[0] * longest for _ in sequences])
    position_ids = torch.tensor([list(range(longest)) for _ in sequences])
    labels = torch.tensor(np.stack(targets, axis=0))

    return padded_ids, attention_mask, token_type_ids, position_ids, labels

In [None]:
# Settings shared by both loaders: dynamic padding per batch, two worker processes.
shared_loader_kwargs = dict(collate_fn=collate_fn_style, num_workers=2)

# Training batches are reshuffled every epoch and placed in pinned memory.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    pin_memory=True,
    **shared_loader_kwargs,
)
# Validation keeps the dataset order.
dev_loader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=eval_batch_size,
    shuffle=False,
    **shared_loader_kwargs,
)

# 2. Train

In [None]:
# Seed numpy and torch so repeated runs start from the same random state.
random_seed=42
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained checkpoint with a sequence-classification head and move it to the device.
# The trailing expression is the cell's last value, so the notebook displays the model.
model = BertForSequenceClassification.from_pretrained(used_model)
model.to(device)

In [None]:
# Put the model in training mode before optimisation starts.
model.train()

# Optimise every model parameter with the configured learning rate.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Alternative kept for reference: pass weight_decay=weight_decay to apply the
# `weight_decay` hyperparameter, which is otherwise not used in the cells shown here.
# optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)



In [None]:
def compute_acc(predictions, target_labels):
    """Return the fraction of positions at which predictions match target_labels."""
    predicted = np.array(predictions)
    expected = np.array(target_labels)
    matches = predicted == expected
    return matches.mean()

In [None]:
# Start a W&B run named "<model> <last three digits of the current Unix timestamp>"
# under the configured project and team.
wandb.init(project=wandb_project, name=used_model+' '+str(int(time()))[-3:], entity=wandb_team)

In [None]:

def _predict_labels(logits):
    """Turn a batch of two-column logits into a list of 0/1 predictions.

    A row is labelled 0 only when its first logit is strictly greater than its
    second; every other row (ties included) is labelled 1. The comparison is
    done on the whole batch and converted to Python ints in one call instead
    of testing each example's logits one at a time.
    """
    return (1 - (logits[:, 0] > logits[:, 1]).long()).tolist()


def _run_validation(model, loader, device):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    The model is switched to eval mode and left there; the caller must switch
    it back to train mode before resuming optimisation.

    Returns:
        ``(losses, accuracies)``: two lists holding one value per batch.
    """
    losses, accuracies = [], []
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, position_ids, labels in tqdm(
                loader, desc='Eval', position=1, leave=None):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            losses.append(output.loss.item())
            accuracies.append(compute_acc(_predict_labels(output.logits),
                                          labels.view(-1).tolist()))
    return losses, accuracies


init_time = time()
lowest_valid_loss = 9999.

# Running history: one entry per validation checkpoint.
train_acc = []
train_loss = []
valid_acc = []
valid_loss = []

# Per-batch training metrics accumulated since the previous validation checkpoint.
curr_train_loss = []
curr_train_acc = []

report_to = "wandb"  # not read anywhere in this cell; kept in case other cells rely on it

# Validate five times per epoch. max(1, ...) keeps the modulo below from
# dividing by zero when the training loader has fewer than five batches.
eval_interval = max(1, len(train_loader) // 5)

# Make the cell safe to re-run: start in train mode regardless of earlier cells.
model.train()

for epoch in range(train_epoch):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):
            tepoch.set_description(f"Epoch {epoch}")
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()

            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)
            loss = output.loss

            # Training accuracy for this batch, measured before the weight update.
            acc = compute_acc(_predict_labels(output.logits),
                              labels.view(-1).tolist())

            loss.backward()
            optimizer.step()

            curr_train_loss.append(loss.item())
            curr_train_acc.append(acc)

            tepoch.set_postfix(loss=loss.item())
            if iteration != 0 and iteration % eval_interval == 0:
                curr_valid_loss, curr_valid_acc = _run_validation(model, dev_loader, device)
                # Validation leaves the model in eval mode. Without this call,
                # all training after the first checkpoint would run with
                # dropout disabled.
                model.train()

                # Average the metrics gathered since the last checkpoint.
                mean_train_acc = sum(curr_train_acc) / len(curr_train_acc)
                mean_train_loss = sum(curr_train_loss) / len(curr_train_loss)
                mean_valid_acc = sum(curr_valid_acc) / len(curr_valid_acc)
                mean_valid_loss = sum(curr_valid_loss) / len(curr_valid_loss)

                train_acc.append(mean_train_acc)
                train_loss.append(mean_train_loss)
                valid_acc.append(mean_valid_acc)
                valid_loss.append(mean_valid_loss)

                curr_train_acc = []
                curr_train_loss = []

                # Send the checkpoint metrics to W&B.
                wandb.log({
                    "Train Loss": mean_train_loss,
                    "Train Accuracy": mean_train_acc,
                    "Valid Loss": mean_valid_loss,
                    "Valid Accuracy": mean_valid_acc,
                })

                # Keep only the weights with the lowest validation loss seen so far.
                if lowest_valid_loss > mean_valid_loss:
                    lowest_valid_loss = mean_valid_loss
                    print('Acc for model which have lower valid loss: ', mean_valid_acc)
                    torch.save(model.state_dict(), "./pytorch_model.bin")

fin_time = time()
print('Time:',fin_time-init_time)

Epoch 0:  20%|█▉        | 2770/13852 [04:47<19:03,  9.69batch/s, loss=0.0239]
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:14,  8.62it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 24.49it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 29.75it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 31.96it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 33.65it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 33.52it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.14it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.32it/s][A
Eval:  26%|██▋       | 33/125 [00:01<00:02, 35.28it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 36.31it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.76it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 36.17it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 35.98it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:02, 35.38it/s][A
Eval:  46%|████▌     | 57/125 [00:01<00:01, 36.08it/s][A
Eval: 

Acc for model which have lower valid loss:  0.97


Epoch 0:  40%|███▉      | 5540/13852 [09:36<14:27,  9.59batch/s, loss=0.031] 
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:13,  8.94it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 25.58it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 31.04it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 32.36it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 34.15it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 33.97it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.54it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.62it/s][A
Eval:  26%|██▋       | 33/125 [00:01<00:02, 35.05it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 36.27it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.87it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 36.37it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 36.01it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:02, 35.19it/s][A
Eval:  46%|████▌     | 57/125 [00:01<00:01, 35.69it/s][A
Eval: 

Acc for model which have lower valid loss:  0.9765


Epoch 0:  60%|█████▉    | 8310/13852 [14:26<09:31,  9.71batch/s, loss=0.0894]
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:14,  8.85it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 25.27it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 30.55it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 32.68it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 34.33it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 34.04it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.69it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.42it/s][A
Eval:  26%|██▋       | 33/125 [00:01<00:02, 34.98it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 36.23it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.97it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 36.51it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 36.19it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:02, 35.15it/s][A
Eval:  46%|████▌     | 57/125 [00:01<00:01, 35.73it/s][A
Eval: 

Acc for model which have lower valid loss:  0.97925


Epoch 0: 100%|█████████▉| 13849/13852 [24:01<00:00,  8.96batch/s, loss=0.00467]
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:13,  9.34it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 24.97it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 30.32it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 32.39it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 34.14it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 33.84it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.51it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.46it/s][A
Eval:  26%|██▋       | 33/125 [00:01<00:02, 35.02it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 35.12it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.09it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 35.10it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 35.25it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:02, 34.72it/s][A
Eval:  46%|████▌     | 57/125 [00:01<00:01, 35.67it/s][A
Eval

Acc for model which have lower valid loss:  0.98075


Epoch 0: 100%|██████████| 13852/13852 [24:09<00:00,  9.56batch/s, loss=0.00371]
Epoch 1:  20%|█▉        | 2770/13852 [04:43<18:49,  9.81batch/s, loss=0.00107]
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:13,  9.45it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 25.57it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 30.96it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 32.53it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 34.22it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 33.63it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.38it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.75it/s][A
Eval:  26%|██▋       | 33/125 [00:00<00:02, 35.28it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 36.20it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.95it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 36.64it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 35.70it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:0

Acc for model which have lower valid loss:  0.98125


Epoch 1:  80%|███████▉  | 11080/13852 [19:02<04:42,  9.82batch/s, loss=0.113]  
Eval:   0%|          | 0/125 [00:00<?, ?it/s][A
Eval:   1%|          | 1/125 [00:00<00:13,  9.06it/s][A
Eval:   4%|▍         | 5/125 [00:00<00:04, 25.45it/s][A
Eval:   7%|▋         | 9/125 [00:00<00:03, 29.49it/s][A
Eval:  10%|█         | 13/125 [00:00<00:03, 31.63it/s][A
Eval:  14%|█▎        | 17/125 [00:00<00:03, 33.46it/s][A
Eval:  17%|█▋        | 21/125 [00:00<00:03, 33.27it/s][A
Eval:  20%|██        | 25/125 [00:00<00:02, 34.07it/s][A
Eval:  23%|██▎       | 29/125 [00:00<00:02, 35.04it/s][A
Eval:  26%|██▋       | 33/125 [00:01<00:02, 34.35it/s][A
Eval:  30%|██▉       | 37/125 [00:01<00:02, 35.58it/s][A
Eval:  33%|███▎      | 41/125 [00:01<00:02, 35.28it/s][A
Eval:  36%|███▌      | 45/125 [00:01<00:02, 35.20it/s][A
Eval:  39%|███▉      | 49/125 [00:01<00:02, 35.01it/s][A
Eval:  42%|████▏     | 53/125 [00:01<00:02, 34.67it/s][A
Eval:  46%|████▌     | 57/125 [00:01<00:01, 35.40it/s][A
Eval

Time: 4324.959753751755





# 3. Test

In [None]:
# Read the unlabelled test set from the current working directory.
import pandas as pd
test_df = pd.read_csv('test_no_label.csv')

In [None]:
# Select the 'Id' column; each entry is later lower-cased/encoded as a sentence,
# so this column presumably holds the review text — TODO confirm against the CSV.
# NOTE: `test_dataset` is rebound further down to a SentimentTestDataset, so this
# cell must be re-run before the tokenisation cell is run again.
test_dataset = test_df['Id']

In [None]:
def make_id_file_test(tokenizer, test_dataset, cased):
    """Encode each test sentence as a space-separated string of token ids.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns an iterable
            of token ids.
        test_dataset: Iterable of sentences (strings).
        cased: When false, each sentence is lower-cased before it is encoded.

    Returns:
        A list with one string per sentence, in input order.
    """
    encoded = [tokenizer.encode(sent if cased else sent.lower()) for sent in test_dataset]
    return [' '.join(map(str, token_ids)) for token_ids in encoded]

In [None]:
# Encode every test sentence into a space-separated string of token ids.
test = make_id_file_test(tokenizer, test_dataset, cased)

In [None]:
class SentimentTestDataset(object):
    """Unlabelled dataset built from space-separated token-id strings.

    Indexing returns the token ids of one sentence as a numpy array.
    """

    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.data = [self._cast_to_int(sent.strip().split()) for sent in test]

    def _cast_to_int(self, sample):
        """Convert a sequence of numeric strings to a list of ints."""
        return list(map(int, sample))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return np.array(self.data[index])

In [None]:
# NOTE: rebinds `test_dataset` from the raw pandas column to the encoded dataset.
test_dataset = SentimentTestDataset(tokenizer, test)

In [None]:
def collate_fn_style_test(samples):
    """Collate unlabelled token-id sequences into padded batch tensors.

    Args:
        samples: Sequence of token-id sequences, as produced by indexing the
            test dataset.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, position_ids)``, all of
        shape ``(batch, longest_sequence)``. ``input_ids`` is right-padded
        with 0, ``attention_mask`` is 1 on real tokens and 0 on padding,
        ``token_type_ids`` is all zeros and ``position_ids`` counts
        0..longest-1 on every row.
    """
    lengths = [len(sequence) for sequence in samples]
    longest = max(lengths)

    attention_mask = torch.tensor(
        [[1] * length + [0] * (longest - length) for length in lengths]
    )
    padded_ids = pad_sequence(
        [torch.tensor(sequence) for sequence in samples],
        batch_first=True,
    )

    # Every row spans the padded width, so these two depend only on `longest`.
    token_type_ids = torch.tensor([[0] * longest for _ in samples])
    position_ids = torch.tensor([list(range(longest)) for _ in samples])

    return padded_ids, attention_mask, token_type_ids, position_ids

In [None]:
# Keep the dataset order so predictions line up with the rows of the test file.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False,
    collate_fn=collate_fn_style_test,
    num_workers=2,
)

In [None]:
# Training saves the weights with the lowest validation loss to this path, but
# the in-memory model holds whatever the last optimisation step produced.
# Reload the saved weights so the submission comes from the best checkpoint.
# NOTE(review): the file lives in the working directory; make sure it was
# written by this run and not by another model sharing the folder.
best_checkpoint_path = "./pytorch_model.bin"
if os.path.isfile(best_checkpoint_path):
    model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))
else:
    print('No saved checkpoint found; predicting with the in-memory weights.')

model.eval()
predictions = []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, position_ids in tqdm(test_loader,
                                                                        desc='Test',
                                                                        position=1,
                                                                        leave=None):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        position_ids = position_ids.to(device)

        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       position_ids=position_ids)

        logits = output.logits
        # Class 0 only when the first logit is strictly larger; otherwise class 1.
        # Computed for the whole batch at once and converted to Python ints.
        batch_predictions = (1 - (logits[:, 0] > logits[:, 1]).long()).tolist()
        predictions += batch_predictions


Test:   0%|          | 0/32 [00:00<?, ?it/s][A
Test:   3%|▎         | 1/32 [00:00<00:03,  9.24it/s][A
Test:  12%|█▎        | 4/32 [00:00<00:01, 20.78it/s][A
Test:  25%|██▌       | 8/32 [00:00<00:00, 25.99it/s][A
Test:  38%|███▊      | 12/32 [00:00<00:00, 29.56it/s][A
Test:  50%|█████     | 16/32 [00:00<00:00, 31.05it/s][A
Test:  62%|██████▎   | 20/32 [00:00<00:00, 32.83it/s][A
Test:  75%|███████▌  | 24/32 [00:00<00:00, 33.82it/s][A
Test:  88%|████████▊ | 28/32 [00:00<00:00, 34.44it/s][A
                                                     [A

In [None]:
# Store the predictions in a 'Category' column (modifies test_df in place);
# `predictions` must hold one entry per row of the test file.
test_df['Category'] = predictions

In [None]:
# Write the submission file without the DataFrame index column.
test_df.to_csv('submission_uncased.csv', index=False)

In [None]:
# Sanity check: how many test rows were assigned to each class.
test_df['Category'].value_counts()

1    504
0    496
Name: Category, dtype: int64