In [None]:
# uninstall
!pip uninstall -y wandb

# download
!pip install transformers
!pip install simplet5

# import
import re
import json
import torch

import random
import pandas as pd
from tqdm import tqdm
from simplet5 import SimpleT5
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

In [2]:
cd '/content/drive/MyDrive/Sentiment Analysis'

/content/drive/MyDrive/Sentiment Analysis


In [3]:
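# Earlier draft of the cleaning function, kept for reference only (unfinished: it never returns a value).
# def preprocess(string):
#     # preprocessing
#     string = string.lower()
#     string = string.split('-- rated')[0]
#     string = re.sub(r'\.{4,}', '', string) # remove runs of four or more dots
#     string = re.sub(r'[\n\r\t]', ' ', string) # replace escape sequences (\n, \r, \t) with a space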

def preprocess_sentence(sentence):
    """Normalise one raw review string before it is fed to the model.

    The steps run in this order:

    1. lowercase the text;
    2. drop every parenthesised span, e.g. ``"(draft)"``;
    3. drop runs of two or more Hangul jamo / punctuation characters
       (``. , ? !``), where the last character of the run may also be ``/``;
    4. replace every character outside ``a-z 0-9 - . , ! ? # @`` with a space;
    5. collapse repeated spaces into one;
    6. strip leading and trailing spaces.

    Args:
        sentence: Raw review text. A value that is not a ``str`` (for example
            ``None`` or the float NaN pandas produces for an empty CSV cell)
            is treated as an empty review instead of raising.

    Returns:
        str: The cleaned text, or ``""`` when nothing usable remains.
    """
    # Missing values have no ``.lower()``; map them to the empty string so the
    # caller can filter them out together with reviews that clean to nothing.
    if not isinstance(sentence, str):
        return ""

    sentence = sentence.lower()  # lowercase the text
    sentence = re.sub(r'\([^)]*\)', '', sentence)  # remove parenthesised spans
    # remove runs of consonant/vowel jamo and punctuation (two or more characters)
    sentence = re.sub(r'[ㄱ-ㅎㅏ-ㅣ.,?!]+[/ㄱ-ㅎㅏ-ㅣ.,?!]', '', sentence)
    sentence = re.sub("[^a-z0-9-.,!?#@]", " ", sentence)  # everything else becomes a space
    # Collapse repeated spaces. The previous step already turned every other
    # character (including double quotes) into a space, so a plain space run suffices.
    sentence = re.sub(r' +', " ", sentence)

    return sentence.strip()  # trim spaces on both ends

# Load the labelled reviews and report the incoming row count.
df = pd.read_csv('/content/drive/MyDrive/Sentiment Analysis/MultinomialNB_label.csv')
print(len(df))

# Clean every review text; the function is passed directly, no wrapper needed.
df['Review'] = df['Review'].apply(preprocess_sentence)

# Keep only the rows whose cleaned review is non-empty, then report the new count.
df = df.loc[df['Review'].ne('')]
print(len(df))

# Persist the cleaned frame (relative path: lands in the current working
# directory) and show its column summary.
df.to_csv('beer_preprocess.csv', index=False)
df.info()

111925
111914
<class 'pandas.core.frame.DataFrame'>
Int64Index: 111914 entries, 0 to 111924
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Rating               111914 non-null  float64
 1   Review               111914 non-null  object 
 2   Beer_name            111914 non-null  object 
 3   label                111914 non-null  object 
 4   MultinomialNB_label  111914 non-null  object 
dtypes: float64(1), object(4)
memory usage: 5.1+ MB


In [4]:
# Reload the cleaned reviews from disk. The path is relative, so this depends
# on the current working directory being the folder the CSV was written to.
df = pd.read_csv('beer_preprocess.csv')
df

Unnamed: 0,Rating,Review,Beer_name,label,MultinomialNB_label
0,0.5,this is like budlight but with slight corn tas...,Milwaukee's Best Light,Negative,Negative
1,1.0,strong corn flavor. highly carbonated and no h...,Milwaukee's Best Light,Negative,Negative
2,0.5,it just doesn t get worse than this. brings me...,Milwaukee's Best Light,Negative,Negative
3,0.6,beast bleu. i shutter to think how many cans o...,Milwaukee's Best Light,Negative,Negative
4,1.2,i wouldn t wish this beer on anyones glass. th...,Milwaukee's Best Light,Negative,Negative
...,...,...,...,...,...
111909,3.8,"yes, is till needed to taste this one. i taste...",Hoegaarden,Neutral,Positive
111910,1.6,had this at a local all you can eat buffet. no...,Tsingtao,Negative,Negative
111911,1.8,pours a pale golden yellow color with a one fi...,Corona Light,Neutral,Negative
111912,3.7,"tall can. cloudy yellow, with a thin white hea...",Toppling Goliath King Sue,Positive,Positive


In [5]:
# Class balance of the column that load_review_dataset uses as the target label.
df['MultinomialNB_label'].value_counts()

Positive    65957
Negative    41884
Neutral      4073
Name: MultinomialNB_label, dtype: int64

In [6]:
# Data load function
def load_review_dataset(random_seed = 1, file_path="/content/drive/MyDrive/Sentiment Analysis/beer_preprocess.csv"):
    """Read the preprocessed reviews and split them 80 / 10 / 10.

    Both splits are shuffled and stratified on the label, and both use
    ``random_seed`` as their random state. The sizes of the six resulting
    lists are printed (texts first, then labels).

    Args:
        random_seed: Random state passed to both ``train_test_split`` calls.
        file_path: CSV file with ``Review`` and ``MultinomialNB_label`` columns.

    Returns:
        tuple: ``(train_data, val_data, test_data)`` DataFrames, each with a
        ``source_text`` column (review) and a ``target_text`` column (label).
    """
    reviews = pd.read_csv(file_path)
    labels = reviews['MultinomialNB_label']

    # First cut: 80 % for training, 20 % held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        reviews['Review'].tolist(),
        labels.tolist(),
        shuffle=True,
        test_size=0.2,
        random_state=random_seed,
        stratify=labels,
    )

    # Second cut: halve the held-out part into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        shuffle=True,
        test_size=0.5,
        random_state=random_seed,
        stratify=y_holdout,
    )

    for part in (X_train, X_val, X_test, y_train, y_val, y_test):
        print(len(part))

    def to_frame(texts, targets):
        # transform to pandas dataframe
        return pd.DataFrame({'source_text': texts, 'target_text': targets})

    return to_frame(X_train, y_train), to_frame(X_val, y_val), to_frame(X_test, y_test)

In [7]:
import shutil
from glob import glob

# Fine-tuning settings shared by the training call and the checkpoint lookup,
# so the two cannot drift apart when one of them is changed.
MAX_EPOCHS = 2
OUTPUT_DIR = "outputs"

for trial_no in range(1):
    # create data (the trial number doubles as the random seed of the split)
    train_df, val_df, test_df = load_review_dataset(trial_no)
    # load model
    model = SimpleT5()
    model.from_pretrained(model_type="t5", model_name="t5-base")
    # train model
    model.train(train_df=train_df,
                eval_df=val_df,
                source_max_token_len=300,
                target_max_token_len=200,
                batch_size=8,
                max_epochs=MAX_EPOCHS,
                outputdir=OUTPUT_DIR,
                use_gpu=True
               )
    # fetch the path to the last model
    # NOTE(review): assumes checkpoint names contain "epoch-<n>" with n counted
    # from 0, as the original 'epoch-1' lookup for a 2-epoch run implies -- confirm.
    last_epoch_marker = f"epoch-{MAX_EPOCHS - 1}"
    last_epoch_model = None
    for file in glob(f"./{OUTPUT_DIR}/*"):
        if last_epoch_marker in file:
            last_epoch_model = file
    # Fail here with a clear message instead of handing None to load_model.
    if last_epoch_model is None:
        raise FileNotFoundError(
            f"no checkpoint containing '{last_epoch_marker}' found in ./{OUTPUT_DIR}"
        )
    # load the last model
    model.load_model("t5", last_epoch_model, use_gpu=True)
    # test and save
    # for each test data perform prediction (predict returns a list; keep its first entry)
    # NOTE(review): the recorded run logged a tokenizer warning about inputs
    # longer than 512 tokens; presumably predict() does not truncate -- confirm.
    predictions = [model.predict(text)[0] for text in test_df['source_text']]
    df = test_df.copy()
    df['predicted'] = predictions
    df['original'] = df['target_text']
    print(f1_score(df['original'], df['predicted'], average='macro'))
    df.to_csv(f"result_run_{trial_no}.csv", index=False)
    # clean the output (pure-Python equivalent of `rm -rf ./outputs`)
    shutil.rmtree(f"./{OUTPUT_DIR}", ignore_errors=True)

89531
11191
11192
89531
11191
11192


Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.seed:Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (817 > 512). Running this sequence through the model will result in indexing errors


0.747234619965282


In [12]:
from sklearn.metrics import f1_score, accuracy_score

# Re-read the predictions saved for trial 0 and pull out both label columns.
df_result = pd.read_csv('result_run_0.csv')
y_true, y_pred = df_result['original'], df_result['predicted']

# Micro-averaged F1 next to plain accuracy, both shown with three decimals.
f1 = f1_score(y_true, y_pred, average='micro')
acc = accuracy_score(y_true, y_pred)
print(f"f1_score: {f1:.3f}, accuracy: {acc:.3f}")

f1_score: 0.908, accuracy: 0.908


In [11]:
from sklearn.metrics import f1_score, accuracy_score

# Re-read the predictions saved for trial 0 and pull out both label columns.
df_result = pd.read_csv('result_run_0.csv')
y_true, y_pred = df_result['original'], df_result['predicted']

# Displayed as a (macro F1, accuracy) tuple.
(
    f1_score(y_true, y_pred, average='macro'),
    accuracy_score(y_true, y_pred),
)
# y_true.value_counts()

(0.747234619965282, 0.9077912794853467)

In [10]:
# Inspect the saved per-review predictions next to their reference labels.
df_result

Unnamed: 0,source_text,target_text,predicted,original
0,hazy yellow orange colour. big white long last...,Positive,Positive,Positive
1,i ordered this beer while having lunch at a te...,Negative,Negative,Negative
2,the aroma is malty and floral with hints of fr...,Positive,Positive,Positive
3,"bottle at sidewalk cafe, venice beach, los ang...",Positive,Negative,Positive
4,nice hops. the malt is a little watery in flav...,Positive,Positive,Positive
...,...,...,...,...
11187,too thin and too much rice influence,Negative,Negative,Negative
11188,"malty , metallic and a sweet taste with a sour...",Negative,Negative,Negative
11189,"pours a light yellow, almost white color, hazy...",Positive,Positive,Positive
11190,tap at the euston tap. middle yellowish brown ...,Positive,Positive,Positive


In [13]:
# How often each label was predicted in the saved results.
df_result['predicted'].value_counts()

Positive    6445
Negative    4543
Neutral      204
Name: predicted, dtype: int64