# import

In [1]:
from model.bert import bert_ATE, bert_ABSA
from data.dataset import dataset_ATM, dataset_ABSA

In [2]:
from torch.utils.data import DataLoader, ConcatDataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import time
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Select the compute device. `torch.has_mps` is deprecated; the supported check
# is `torch.backends.mps.is_available()`. CUDA is preferred when present so the
# notebook also runs on non-Apple GPUs; otherwise fall back to MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# Name of the pretrained checkpoint shared by the tokenizer and both models.
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# Learning rate used by both Adam optimizers below.
lr = 2e-5

# Aspect Term Extraction model and its optimizer.
model_ATE = bert_ATE(pretrain_model_name).to(DEVICE)
optimizer_ATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)

# Aspect Based Sentiment Analysis model and its optimizer.
model_ABSA = bert_ABSA(pretrain_model_name).to(DEVICE)
optimizer_ABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [4]:
def evl_time(t):
    """Split a duration in seconds into whole hours, minutes and seconds.

    Args:
        t: Duration in seconds (int or float).

    Returns:
        A tuple ``(hours, minutes, seconds)`` of ints; fractional seconds are
        truncated by ``int()``.
    """
    # Local names deliberately avoid shadowing the builtin `min`.
    minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(minutes, 60)
    return int(hours), int(minutes), int(seconds)

def load_model(model, path):
    """Load a saved state dict from `path` into `model` and return the model.

    Args:
        model: A ``torch.nn.Module`` whose parameters should be overwritten.
        path: File previously written with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The same `model` object, with the loaded weights.
    """
    import warnings

    # NOTE(security): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    # map_location="cpu" lets a checkpoint saved on one accelerator (e.g. MPS)
    # be read on a machine without it; load_state_dict then copies the values
    # into the model's parameters on whatever device they already live on.
    state_dict = torch.load(path, map_location="cpu")

    # strict=False is kept for backward compatibility, but mismatched keys are
    # reported instead of being dropped silently.
    result = model.load_state_dict(state_dict, strict=False)
    if result.missing_keys or result.unexpected_keys:
        warnings.warn(
            "load_model(%r): missing keys %s, unexpected keys %s"
            % (path, list(result.missing_keys), list(result.unexpected_keys))
        )
    return model
    
def save_model(model, name):
    """Serialize the model's state dict to the file `name` with ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, name)

In [5]:
# Read the laptops training CSV into a DataFrame for ad-hoc inspection.
# NOTE(review): `df` is not referenced again in the cells shown here.
df = pd.read_csv("data/laptops_train.csv")

In [6]:
# Tokenizer demo: split a sample sentence into word-piece tokens.
x = tokenizer.tokenize("Hello how are you.")

In [7]:
# Map the tokens to vocabulary ids; as the output shows, no special tokens are added here.
tokenizer.convert_tokens_to_ids(x)

[7592, 2129, 2024, 2017, 1012]

In [8]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask;
# in the output the same ids are wrapped in 101 ... 102.
tokenizer("Hello how are you.")

{'input_ids': [101, 7592, 2129, 2024, 2017, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

# Aspect Term Extraction

In [9]:
# Build one dataset_ATM per corpus (laptops, restaurants, twitter) and split
# (train, test), each from its CSV file and the shared tokenizer.
laptops_train_ds = dataset_ATM(pd.read_csv("data/laptops_train.csv"), tokenizer)
laptops_test_ds = dataset_ATM(pd.read_csv("data/laptops_test.csv"), tokenizer)
restaurants_train_ds = dataset_ATM(pd.read_csv("data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = dataset_ATM(pd.read_csv("data/restaurants_test.csv"), tokenizer)
twitter_train_ds = dataset_ATM(pd.read_csv("data/twitter_train.csv"), tokenizer)
twitter_test_ds = dataset_ATM(pd.read_csv("data/twitter_test.csv"), tokenizer)

In [10]:
# Concatenate the three corpora into a single train set and a single test set.
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

In [11]:
def create_mini_batch(samples):
    """Collate a list of samples into zero-padded batch tensors.

    Each sample is read at index 1 (token ids), index 2 (tag labels) and
    index 3 (polarity labels); index 0 is not used here. The three sequences
    are right-padded with zeros to the longest sequence in the batch.

    Returns:
        ``(ids_tensors, tags_tensors, pols_tensors, masks_tensors)`` where the
        mask is a long tensor holding 1 wherever the token id is non-zero.
    """
    id_seqs, tag_seqs, pol_seqs = [], [], []
    for sample in samples:
        id_seqs.append(sample[1])
        tag_seqs.append(sample[2])
        pol_seqs.append(sample[3])

    ids_tensors = pad_sequence(id_seqs, batch_first=True)
    tags_tensors = pad_sequence(tag_seqs, batch_first=True)
    pols_tensors = pad_sequence(pol_seqs, batch_first=True)

    # Padding uses id 0, so every non-zero id marks a real token.
    masks_tensors = (ids_tensors != 0).to(torch.long)

    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [12]:
# Batch the ATE datasets with create_mini_batch: 5 samples per training batch,
# 50 per test batch. Both loaders shuffle.
# NOTE(review): shuffling the test loader is not required for evaluation.
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle = True)

In [13]:
# Sanity check: print the four tensors of the first training batch, in the
# order create_mini_batch returns them (ids, tags, polarities, masks), then stop.
for batch in train_loader:
    a,b,c,d = batch
    print(a)
    print(b)
    print(c)
    print(d)
    break

tensor([[ 2096,  1045,  3262,  2224,  2009,  2005, 10373,  1010,  4274,  1998,
         10355,  1010,  1045,  1000,  1049,  1000,  9657,  2035,  2060,  5097,
          2444,  2039,  2000,  1996,  2152,  3115,  1045,  1000,  2310,  1000,
          2272,  2000,  9120,  2013,  6097, 12191,  2015,  1012,     0,     0,
             0,     0,     0],
        [ 1045,  2165,  2009,  2000,  2767,  2040,  8184,  4964,  2009,  1998,
          1045,  2633,  3825,  2055, 10347,  2005,  1996,  3668, 10943,  2100,
          1012,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 3492,  2172,  2589,  2026, 19453,  1012,  2085,  1045,  2031,  1017,
          2847,  1998,  2570,  2781,  2000,  3191,  4302, 10693,  1012,  1024,
          1011, 25269,  2497,  1011,  3398,  1010,  1045,  1000,  1049,  1000,
          1037, 11265,  4103,  1012,  3066,  2007,  2009,  1012,  884

In [16]:
def train_model_ATE(loader, epochs):
    """Fine-tune the global `model_ATE` on batches from `loader`.

    Uses the notebook globals `model_ATE`, `optimizer_ATE` and `DEVICE`.
    After every epoch the mean batch loss and the elapsed time are printed and
    the weights are saved to 'bert_ATE.pkl'.

    Args:
        loader: Iterable of ``(ids, tags, polarities, masks)`` batches; the
            polarity tensor is ignored here.
        epochs: Number of passes over `loader`.
    """
    # Ensure training-mode layers (e.g. dropout) are active even if the model
    # was switched to eval mode before this call.
    model_ATE.train()

    for epoch in range(epochs):
        losses = []
        t0 = time.time()
        for ids_tensors, tags_tensors, _, masks_tensors in loader:
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            # Clear gradients before the forward/backward pass so stale
            # gradients from an earlier run can never leak into this step.
            optimizer_ATE.zero_grad()
            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ATE.step()

        elapsed = round(time.time() - t0, 3)
        hours, minutes, seconds = evl_time(elapsed)
        print('epoch:', epoch, " loss:", np.mean(losses), " hr:", hours, " min:", minutes," sec:", seconds)

        # Checkpoint after every epoch so an interrupted run keeps its progress.
        save_model(model_ATE, 'bert_ATE.pkl')
        
def test_model_ATE(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)

            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred



In [None]:
# Fine-tune the ATE model for 3 epochs; %time reports how long the call takes.
%time train_model_ATE(train_loader, 3)

epoch: 0  loss: 0.04603916781836714  hr: 211  min: 12  sec: 58


In [11]:
# Restore the ATE weights from the checkpoint that train_model_ATE writes.
model_ATE = load_model(model_ATE, 'bert_ATE.pkl')

In [12]:
# Evaluate on the test loader: x holds the ground-truth tags, y the predictions
# (the order test_model_ATE returns them), then report per-class metrics for
# the three tag classes 0, 1 and 2.
%time x, y = test_model_ATE(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

CPU times: user 9.58 s, sys: 14.7 s, total: 24.3 s
Wall time: 50.1 s
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    143669
           1       0.87      0.88      0.87      6486
           2       0.89      0.77      0.83      3837

    accuracy                           0.98    153992
   macro avg       0.92      0.88      0.90    153992
weighted avg       0.98      0.98      0.98    153992



# Aspect Based Sentiment Analysis

In [5]:
# Build one dataset_ABSA per corpus and split from the same CSV files.
# NOTE: this rebinds the *_ds names that the ATE section assigned to
# dataset_ATM objects, so the cells of the two sections must not be interleaved.
laptops_train_ds = dataset_ABSA(pd.read_csv("data/laptops_train.csv"), tokenizer)
laptops_test_ds = dataset_ABSA(pd.read_csv("data/laptops_test.csv"), tokenizer)
restaurants_train_ds = dataset_ABSA(pd.read_csv("data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = dataset_ABSA(pd.read_csv("data/restaurants_test.csv"), tokenizer)
twitter_train_ds = dataset_ABSA(pd.read_csv("data/twitter_train.csv"), tokenizer)
twitter_test_ds = dataset_ABSA(pd.read_csv("data/twitter_test.csv"), tokenizer)

In [6]:
# Inspect sample 121 of the laptops ABSA training set: print each of the first
# three fields followed by its length, then the fourth field on its own.
w, x, y, z = laptops_train_ds[121]
print(w, len(w), sep="\n")
print(x, len(x), sep="\n")
print(y, len(y), sep="\n")
print(z)

['[cls]', 'the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.', '[sep]', 'battery', 'life']
21
tensor([ 100, 1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031,
        2018, 2053, 3314, 2007, 2009, 1012,  100, 6046, 2166])
21
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
21
tensor(2)


In [7]:
def create_mini_batch2(samples):
    """Collate a list of ABSA samples into batch tensors.

    Each sample is read at index 1 (token ids), index 2 (segment ids) and
    index 3 (a scalar label tensor); index 0 is not used here. Token and
    segment sequences are right-padded with zeros to the longest sequence in
    the batch, and the labels are stacked into one tensor.

    Returns:
        ``(ids_tensors, segments_tensors, masks_tensors, label_ids)`` where the
        mask is a long tensor holding 1 wherever the token id is non-zero.
    """
    id_seqs, segment_seqs, labels = [], [], []
    for sample in samples:
        id_seqs.append(sample[1])
        segment_seqs.append(sample[2])
        labels.append(sample[3])

    ids_tensors = pad_sequence(id_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)
    label_ids = torch.stack(labels)

    # Padding uses id 0, so every non-zero id marks a real token.
    masks_tensors = (ids_tensors != 0).to(torch.long)

    return ids_tensors, segments_tensors, masks_tensors, label_ids

In [8]:
# Concatenate the three ABSA corpora into a single train set and test set.
# NOTE: train_ds/test_ds and the loaders below reuse the names from the ATE section.
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

# Batch with create_mini_batch2: 4 samples per training batch, 50 per test batch; both shuffle.
train_loader = DataLoader(train_ds, batch_size=4, collate_fn=create_mini_batch2, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch2, shuffle = True)

In [9]:
# Sanity check: print each tensor of the first ABSA training batch with its
# size, in the order create_mini_batch2 returns them (ids, segments, masks,
# labels), then stop.
for batch in train_loader:
    w,x,y,z = batch
    print(w)
    print(w.size())
    print(x)
    print(x.size())
    print(y)
    print(y.size())
    print(z)
    print(z.size())
    break

tensor([[  100,  2026,  2069, 10520,  2007,  1996,  2410,  1000,  1000,  2944,
          2003,  2008,  2027,  1000,  2128,  1000,  1996,  2069,  3924,  1999,
          1996,  6097,  8654,  4013,  2240,  2039,  2302,  2019,  1045,  2629,
          2030,  1045,  2581, 13151,  1998,  7037,  8389,  5329,  1010,  2174,
          1996,  3119,  1011,  2125,  2003,  2008,  2017,  6162,  1037,  2936,
          6046,  2166,  1011,  1048, 15185,  1011,  1997,  2055,  1016,  2062,
          2847,  1011, 25269,  2497,  1011,  1012,   100, 13151],
        [  100,  1045,  4299,  9733,  1012,  4012,  2052,  1000, 23961,  1000,
          1043, 10626,  8757,  9465, 10272,  1998,  4532, 28619,  2078,  1012,
          2175,  2185,  2175,  2361,  1012,  2017,  1000,  2128,  1000,  2025,
          2359,  1012,   100,  4532, 28619,  2078,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0, 

In [10]:
def train_model_ABSA(loader, epochs):
    """Fine-tune the global `model_ABSA` on batches from `loader`.

    Uses the notebook globals `model_ABSA`, `optimizer_ABSA` and `DEVICE`.
    After every batch it prints the running mean loss and an estimate of the
    remaining time; after every epoch it saves the weights to 'bert_ABSA2.pkl'.

    Args:
        loader: Sized iterable of ``(ids, segments, masks, labels)`` batches.
        epochs: Number of passes over `loader`.
    """
    all_data = len(loader)

    # Ensure training-mode layers (e.g. dropout) are active even if the model
    # was switched to eval mode before this call.
    model_ABSA.train()

    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []

        for ids_tensors, segments_tensors, masks_tensors, label_ids in loader:
            t0 = time.time()
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            label_ids = label_ids.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            # Clear gradients before the forward/backward pass so stale
            # gradients from an earlier run can never leak into this step.
            optimizer_ABSA.zero_grad()
            # `lable_tensors` (sic) is the keyword this notebook passes to the model.
            loss = model_ABSA(ids_tensors=ids_tensors, lable_tensors=label_ids, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ABSA.step()

            finish_data += 1
            current_times.append(round(time.time() - t0, 3))
            # Remaining time = mean batch time * (batches left in this epoch
            # + batches in all the epochs still to come).
            current = np.mean(current_times)
            hours, minutes, seconds = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hours, " min:", minutes," sec:", seconds)

        # NOTE(review): other cells in this notebook load 'bert_ABSA.pkl', not
        # this file name - confirm which checkpoint is meant to be used.
        save_model(model_ABSA, 'bert_ABSA2.pkl')
        
def test_model_ABSA(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ABSA(ids_tensors, None, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            
            _, predictions = torch.max(outputs, dim=1)

            pred += list([int(i) for i in predictions])
            trueth += list([int(i) for i in label_ids])

    return trueth, pred



In [11]:
# Fine-tune the ABSA model for 6 epochs; %time reports how long the call takes.
%time train_model_ABSA(train_loader, 6)

AttributeError: 'tuple' object has no attribute 'size'

In [None]:
# Restore ABSA weights from disk.
# NOTE(review): train_model_ABSA saves to 'bert_ABSA2.pkl', but this loads
# 'bert_ABSA.pkl' - confirm which checkpoint is intended.
model_ABSA = load_model(model_ABSA, 'bert_ABSA.pkl')

In [None]:
# Evaluate on the test loader: x holds the ground-truth labels, y the
# predictions (the order test_model_ABSA returns them), then report per-class
# metrics for the three classes 0, 1 and 2.
%time x, y = test_model_ABSA(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

# ATE + ABSA

In [None]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    """Predict the sentiment class of `aspect` within `sentence`.

    Uses the notebook globals `model_ABSA` and `DEVICE`. The model is put in
    eval mode for the call, so dropout is disabled, and its previous mode is
    restored afterwards.

    Args:
        sentence: Raw review text.
        aspect: Aspect term to classify.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``(word_pieces, predictions, outputs)``: the token list fed to the
        model, the index of the highest-scoring class (tensor of shape
        ``(1,)``), and the raw model outputs.
    """
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    # NOTE(review): the lowercase '[cls]'/'[sep]' markers are kept on purpose.
    # The dataset sample printed earlier in this notebook uses the same
    # markers (they appear as id 100 there), so inference must match that
    # input format. Presumably they map to the unknown token rather than the
    # real special tokens - confirm before changing.
    word_pieces = ['[cls]'] + t1 + ['[sep]'] + t2

    # Segment 0 covers the markers and the sentence, segment 1 the aspect.
    segment_ids = [0] * (len(t1) + 2) + [1] * len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    # Give the segment ids the same (1, seq_len) batch shape as input_tensor
    # rather than relying on broadcasting of a 1-D tensor.
    segment_tensor = torch.tensor([segment_ids]).to(DEVICE)

    was_training = model_ABSA.training
    model_ABSA.eval()
    try:
        with torch.no_grad():
            # Positional Nones: no labels (return outputs, not a loss), no mask.
            outputs = model_ABSA(input_tensor, None, None, segments_tensors=segment_tensor)
            _, predictions = torch.max(outputs, dim=1)
    finally:
        model_ABSA.train(was_training)

    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    """Tag every word piece of `sentence` with the ATE model.

    Uses the notebook globals `model_ATE` and `DEVICE`. The model is put in
    eval mode for the call, so dropout is disabled, and its previous mode is
    restored afterwards. No special tokens are added to the input.

    Args:
        sentence: Raw text to tag.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``(word_pieces, predictions, outputs)``: the token list, one predicted
        tag (int) per token, and the raw model outputs.
    """
    word_pieces = list(tokenizer.tokenize(sentence))

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    was_training = model_ATE.training
    model_ATE.eval()
    try:
        with torch.no_grad():
            # Positional Nones: no tags (return outputs, not a loss), no mask.
            outputs = model_ATE(input_tensor, None, None)
            _, predictions = torch.max(outputs, dim=2)
    finally:
        model_ATE.train(was_training)

    # Drop the batch dimension: one tag per word piece.
    predictions = predictions[0].tolist()

    return word_pieces, predictions, outputs

def _tags_to_terms(tokens, tags):
    """Group word pieces into aspect terms: tag 1 starts a term, tag 2 extends it."""
    terms = []
    pieces = []
    for token, tag in zip(tokens, tags):
        if tag == 1:
            # A new term begins; close the one in progress, if any.
            if pieces:
                terms.append(pieces)
            pieces = [token]
        elif tag == 2:
            # Continue the current term. A continuation tag with no open term
            # simply starts one instead of yielding a leading-space string.
            pieces.append(token)
        else:
            # Any other tag ends the current term, so a later continuation tag
            # cannot be glued onto a term several tokens back.
            if pieces:
                terms.append(pieces)
            pieces = []
    if pieces:
        terms.append(pieces)
    # Join the pieces with spaces, then merge '##' continuation pieces back
    # into the preceding piece.
    return [" ".join(term).replace(" ##", "") for term in terms]


def ATE_ABSA(text):
    """Extract aspect terms from `text` and print the sentiment of each one.

    Runs `predict_model_ATE` to tag the tokens, groups the tagged tokens into
    terms, then runs `predict_model_ABSA` once per term. Uses the notebook
    global `tokenizer`. Everything is printed; nothing is returned.

    Args:
        text: Raw review text.
    """
    x, y, z = predict_model_ATE(text, tokenizer)
    terms = _tags_to_terms(x, y)

    print("tokens:", x)
    print("ATE:", terms)

    for term in terms:
        _, c, p = predict_model_ABSA(text, term, tokenizer)
        print("term:", [term], "class:", [int(c)], "ABSA:", [float(p[0][0]), float(p[0][1]), float(p[0][2])])


In [None]:
# Restore both models from their checkpoints before running the demo cells.
# NOTE(review): train_model_ABSA saves to 'bert_ABSA2.pkl', not 'bert_ABSA.pkl' - confirm.
model_ABSA = load_model(model_ABSA, 'bert_ABSA.pkl')
model_ATE = load_model(model_ATE, 'bert_ATE.pkl')

In [None]:
# Demo: run the ATE + ABSA pipeline on a two-sentence product review.
text = "For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro."
ATE_ABSA(text)

In [None]:
# Demo: a comparative sentence that names two brands.
text = "I think Apple is better than Microsoft."
ATE_ABSA(text)

# Cyberpunk 2077 - Xbox One

https://www.amazon.com/-/zh_TW/Cyberpunk-2077-Xbox-One/product-reviews/B07DJW4WZC/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber=2

In [None]:
# Game-review demo: a short complaint about update downloads.
text = "Spent 5 hours downloading updates."
ATE_ABSA(text)

In [None]:
# Game-review demo: one long sentence about installation problems.
text = "Install is buggy, so after downloading a day one patch that's nearly 3 times the size of the game, it glitched on the CDs and had to reinstall the game from scratch."
ATE_ABSA(text)

In [None]:
# Game-review demo: several complaints (freezes, frame rates) in one sentence.
text = "Cyberpunk 2077 freezes constantly, frame rates are terrible, and it's extremely frustrating to try to play."
ATE_ABSA(text)

In [None]:
# Game-review demo: two sentences about playability on the console.
text = "Cyberpunk 2077 is completely unplayable on xbox one. They should have never released this for current gen."
ATE_ABSA(text)

In [None]:
# Game-review demo: a multi-sentence review; note the typographic apostrophes (’) in the text.
text = "It’s just a cash grab, the game crashes constantly, runs at like 20 fps, half the environment and characters only load when you’re three feet away from them. Unless you’re in a small space the game looks awful. The worst game i’ve ever played in years visually. It looks worse than later xbox 360 games."
ATE_ABSA(text)

In [None]:
# Game-review demo: a sentence criticising the publisher rather than a product feature.
text = "CD Projekt Red should have just abandoned the current gen consoles instead of cheating people out of their money."
ATE_ABSA(text)