## Import packages

In [1]:
!pip install -Uq emoji==1.7
!pip install -Uq optuna
!pip install -Uq flashtext
!pip install -Uq underthesea
!pip install -Uq scikit-learn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/175.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/175.4 kB[0m [31m21.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
import re
import unicodedata
import numpy as np
import pandas as pd
from functools import partial
from emoji import get_emoji_regexp
from flashtext import KeywordProcessor
from sklearn.base import BaseEstimator, TransformerMixin

## TextCleaner class

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

HASHTAG = 'hashtag'

class TextCleanerBase(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

        # Find emojis
        emoji = get_emoji_regexp()

        # Create preprocessing function
        self.remove_emoji      = partial(emoji.sub, '')
        self.normalize_unicode = partial(unicodedata.normalize, 'NFC')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.Series):
            X = pd.Series(X)

        return X.apply(str.lower) \
                .apply(self.remove_emoji) \
                .apply(self.normalize_unicode)
        

class TextCleaner(TextCleanerBase):
    def __init__(self):
        super().__init__()

        # Find hashtag
        hashtag = re.compile('#\S+')

        # Find price tags
        pricetag = '((?:(?:\d+[,\.]?)+) ?(?:nghìn đồng|đồng|k|vnd|d|đ))'
        pricetag = re.compile(pricetag)

        # Find special characters
        specialchar = r"[\"#$%&'()*+,\-.\/\\:;<=>@[\]^_`{|}~\n\r\t]"
        specialchar = re.compile(specialchar)

        # Spelling correction
        rules = {
            "òa":["oà"], "óa":["oá"], "ỏa":["oả"], "õa":["oã"], "ọa":["oạ"],
            "òe":["oè"], "óe":["oé"], "ỏe":["oẻ"], "õe":["oẽ"], "ọe":["oẹ"],
            "ùy":["uỳ"], "úy":["uý"], "ủy":["uỷ"], "ũy":["uỹ"], "ụy":["uỵ"],
            "ùa":["uà"], "úa":["uá"], "ủa":["uả"], "ũa":["uã"], "ụa":["uạ"],
            "xảy":["xẩy"], "bảy":["bẩy"], "gãy":["gẫy"],
            "không":["k", "hông", "ko", "khong"]}

        kp = KeywordProcessor(case_sensitive=False)
        kp.add_keywords_from_dict(rules)

        # Create preprocessing functions
        self.autocorrect          = kp.replace_keywords
        self.normalize_pricetag   = partial(pricetag.sub, 'giá_tiền')
        self.normalize_hashtag    = partial(hashtag.sub, HASHTAG)
        self.remove_specialchar   = partial(specialchar.sub, '')

    def transform(self, X):
        X = super().transform(X)

        return X.apply(self.autocorrect) \
                .apply(self.normalize_pricetag) \
                .apply(self.normalize_hashtag) \
                .apply(self.remove_specialchar)


## mo2ml - Multioutput to multilabel

In [4]:
aspects = ['FOOD#PRICES',
           'FOOD#QUALITY',
           'FOOD#STYLE&OPTIONS',
           'DRINKS#PRICES',
           'DRINKS#QUALITY',
           'DRINKS#STYLE&OPTIONS',
           'RESTAURANT#PRICES',
           'RESTAURANT#GENERAL',
           'RESTAURANT#MISCELLANEOUS',
           'SERVICE#GENERAL',
           'AMBIENCE#GENERAL',
           'LOCATION#GENERAL']

sentiments = ['-', 'o', '+']

def mo2ml(y):
    """Convert multi-output to multi-label data
    """
    newcols = [f'{a} {s}' for a in aspects for s in sentiments]

    nrows, ncols = len(y), len(newcols)
    ml = pd.DataFrame(np.zeros((nrows, ncols), dtype='bool'),
                      columns=newcols)
    
    for i, a in enumerate(aspects):
        for j in range(1, 4):
            indices = y[a] == j
            ml.iloc[indices, i * 3 + j - 1] = True

    return ml

## mo2df - Multioutput to DataFrame

In [5]:
def mo2df(y):
    if isinstance(y, pd.DataFrame):
        return y
    return pd.DataFrame(y, columns=aspects)

## Reading csv files

In [6]:
!git clone https://github.com/qhle2001/Corpus-Lingistic-CS321.git

Cloning into 'Corpus-Lingistic-CS321'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 78 (delta 37), reused 54 (delta 22), pack-reused 0[K
Unpacking objects: 100% (78/78), 15.09 MiB | 8.73 MiB/s, done.


In [7]:
train_data = pd.read_csv('/content/Corpus-Lingistic-CS321/file csv/Train.csv')
dev_data = pd.read_csv('/content/Corpus-Lingistic-CS321/file csv/Dev.csv')
test_data = pd.read_csv('/content/Corpus-Lingistic-CS321/file csv/Test.csv')

In [8]:
test_data.head()

Unnamed: 0,review,FOOD#PRICES,FOOD#QUALITY,FOOD#STYLE&OPTIONS,DRINKS#PRICES,DRINKS#QUALITY,DRINKS#STYLE&OPTIONS,RESTAURANT#PRICES,RESTAURANT#GENERAL,RESTAURANT#MISCELLANEOUS,SERVICE#GENERAL,AMBIENCE#GENERAL,LOCATION#GENERAL
0,"❤ Bánh đc bao gói bao bì khá lạ, là túi nilong...",,,,,,,,,positive,,,
1,* soda and lemon thì cũng tạm được.,,,,,neutral,,,,,,,
2,"Mình sẽ ko baoh order ship thế này nữa, làm ăn...",,,,,,,,,,negative,,
3,Cơ sở Kichi Kichi này đã hạn chế về không gian...,,,,,,,,positive,,,negative,
4,Nhắc 2 3 lần mới thấy đem tới.,,,,,,,,,,negative,,


In [9]:
sentences_review = test_data['review']
print(sentences_review)

0       ❤ Bánh đc bao gói bao bì khá lạ, là túi nilong...
1                     * soda and lemon thì cũng tạm được.
2       Mình sẽ ko baoh order ship thế này nữa, làm ăn...
3       Cơ sở Kichi Kichi này đã hạn chế về không gian...
4                          Nhắc 2 3 lần mới thấy đem tới.
                              ...                        
1933    Nước sốt, mình thử vị BBQ và Tiêu:cả hai cái đ...
1934                           Phục vụ tệ, đồ ăn siêu tệ.
1935    Lần 1 thì order 5 ly có hết 4 ly là cf trứng c...
1936        Vừa sánh vừa béo ngậy mà thơm đậm mùi phomai.
1937    Giá cả bình thg, đầy đủ mấy loại đồ ăn vặt mìn...
Name: review, Length: 1938, dtype: object


In [None]:
# train_data = pd.read_csv('/content/Train.csv')
# dev_data = pd.read_csv('/content/Dev.csv')
# test_data = pd.read_csv('/content/Test.csv')

In [10]:
def ConvertOutput(df):
    X = df.pop('review')
    y = df.replace({np.nan: 0, 
                    'negative': 1, 
                    'neutral': 2, 
                    'positive': 3}).astype(np.uint8)

    print('X.shape:', X.shape, 'y.shape:', y.shape)
    return X, y

In [11]:
Xtrain, ytrain = ConvertOutput(train_data)
Xdev,   ydev   = ConvertOutput(dev_data)
Xtest,  ytest  = ConvertOutput(test_data)

X.shape: (7028,) y.shape: (7028, 12)
X.shape: (771,) y.shape: (771, 12)
X.shape: (1938,) y.shape: (1938, 12)


In [12]:
# Advanced text cleanup
cleaner       = TextCleaner()

xtrain        = cleaner.transform(Xtrain)
xdev          = cleaner.transform(Xdev)
xtest         = cleaner.transform(Xtest)

  emoji = get_emoji_regexp()


In [13]:
# y target for evaluation
ytrain_ml = mo2ml(ytrain)
ydev_ml   = mo2ml(ydev)
ytest_ml  = mo2ml(ytest)

## Class distribution

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_pie(df, name):
    count = df.sum(axis=0)
    return go.Pie(labels=count.index, values=count, 
                  textposition='inside', name=name)


names = ('Train', 'Dev', 'Test')
phaseA = (ytrain, ydev, ytest)

fig = make_subplots(cols=3, subplot_titles=names,
                    specs=[[{'type': 'pie'}] * 3])

for i, (df, name) in enumerate(zip(phaseA, names), 1):
    fig.add_trace(get_pie(df, name), row=1, col=i)

fig.update_layout(title='# of samples per aspect')

In [15]:
phaseML = (ytrain_ml, ydev_ml, ytest_ml)

fig = make_subplots(cols=3, subplot_titles=names,
                    specs=[[{'type': 'pie'}] * 3])

for i, (df, name) in enumerate(zip(phaseML, names), 1):
    fig.add_trace(get_pie(df, name), row=1, col=i)

fig.update_layout(title='# of samples per class (entity, sentiment)')

# Feature extraction (Convert reviews to vectors)

##  Basic features (1, 2, 3 grams)


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# x data using advanced clean up class and basic features extrator
xtrain_basef = vectorizer.fit_transform(xtrain)
xdev_basef   = vectorizer.transform(xdev)
xtest_basef  = vectorizer.transform(xtest)

In [16]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(ngram_range=(1, 3),
#                              min_df=2, max_df=0.9)

# # x data using advanced clean up class and basic features extrator
# xtrain_basef = vectorizer.fit_transform(xtrain)
# xdev_basef   = vectorizer.transform(xdev)
# xtest_basef  = vectorizer.transform(xtest)

In [31]:
xtrain_basef

<7028x4768 sparse matrix of type '<class 'numpy.int64'>'
	with 103177 stored elements in Compressed Sparse Row format>

In [32]:
xtrain_basef.shape

(7028, 4768)

# Model Architectures

## End-to-End Architecture

In [19]:
from sklearn.multioutput import MultiOutputClassifier as MOC

# Evaluation functions

In [20]:
from sklearn.metrics import f1_score, classification_report

def quick_f1(y_true, y_pred):
    y_pred = mo2ml(mo2df(y_pred))
    return round(f1_score(y_true, y_pred, average='micro', zero_division=0), 4)

def evaluate(model, X, y):
    yb_true  = mo2ml(y)

    yb_pred  = mo2df(model.predict(X))
    yb_pred  = mo2ml(yb_pred)

    return classification_report(yb_true, yb_pred, zero_division=0)

# Compare models

## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

lr = MOC(LogisticRegression())
lr.fit(xtrain_basef, ytrain)

print('Micro F1 score on train:',quick_f1(ytrain_ml  , lr.predict(xtrain_basef)))
print('Micro F1 score on dev',quick_f1(ydev_ml  , lr.predict(xdev_basef)))
print('Micro F1 score on test',quick_f1(ytest_ml , lr.predict(xtest_basef)))
print(evaluate(lr, xtest_basef, ytest))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

Micro F1 score on train: 0.914
Micro F1 score on dev 0.558
Micro F1 score on test 0.5571
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.60      0.41      0.49        71
           2       0.47      0.29      0.36        31
           3       0.42      0.22      0.29        78
           4       0.49      0.33      0.40        75
           5       0.74      0.74      0.74       401
           6       0.48      0.28      0.35        58
           7       0.52      0.45      0.48       173
           8       0.62      0.47      0.53       206
           9       0.20      0.20      0.20         5
          10       0.23      0.09      0.13        32
          11       0.50      0.10      0.17        10
          12       0.47      0.31      0.37        26
          13       0.60      0.25      0.35        48
          14       0.58      0.59      0.58       129
          15       0.00      0.00      0.00   

## Support Vector Machine

In [34]:
from sklearn.svm import SVC

svc = MOC(SVC())
svc.fit(xtrain_basef, ytrain)

print('Micro F1 score on train:',quick_f1(ytrain_ml  , svc.predict(xtrain_basef)))
print('Micro F1 score on dev',quick_f1(ydev_ml  , svc.predict(xdev_basef)))
print('Micro F1 score on test',quick_f1(ytest_ml , svc.predict(xtest_basef)))
print(evaluate(svc, xtest_basef, ytest))

Micro F1 score on train: 0.8145
Micro F1 score on dev 0.4589
Micro F1 score on test 0.4868
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.78      0.10      0.17        71
           2       0.00      0.00      0.00        31
           3       0.67      0.05      0.10        78
           4       0.93      0.17      0.29        75
           5       0.78      0.72      0.75       401
           6       0.76      0.22      0.35        58
           7       0.75      0.27      0.40       173
           8       0.77      0.33      0.46       206
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00        32
          11       0.00      0.00      0.00        10
          12       0.00      0.00      0.00        26
          13       0.00      0.00      0.00        48
          14       0.69      0.46      0.55       129
          15       0.00      0.00      0.00 

## Unicode normalization

In [None]:
# import unicodedata

# dựng_sẵn = '\u1EA0'
# tổ_hợp   = '\u0041\u0323'

# print(dựng_sẵn, tổ_hợp)
# print(dựng_sẵn == tổ_hợp)

# print('-'*10)

# dựng_sẵn = unicodedata.normalize('NFC', dựng_sẵn)
# tổ_hợp   = unicodedata.normalize('NFC', tổ_hợp)

# print(dựng_sẵn, tổ_hợp)
# print(dựng_sẵn == tổ_hợp)

## Test TextCleaner class

In [None]:
# texts = ['K khí trong lành. đồ ăn hong ngon, thức uống  K tồi; 🥙🌮',
#          'khung cảnh xinh đẹp',
#          'khuyến mãi cực sốc giả chỉ 1000 đồng',
#          '200k quá mắc',
#          'món ăn này mắc quá tới 200k lận. ngày 23/3/2000 😴',
#          'mua 100.000vnd',
#          'bán 1,000,000 d. 5 cái bành xèo tốn 500k',
#          'bán 1.000.000 d. 5 cái bành xèo tốn 500k %^^4',
#          'món ăn này có giá 10 lít',
#          'món ăn này tận 100 nghìn đồng',
#          'bán 1.000đ',
#          'quán này có giá trung bình từ 100k-200k 😛',
#          'quán này có giá trung bình từ 100-200k 😫',
#          '#mắc #food',
#          'bàn ghế sạch đẹp, thái độ nhân viên ok#restaurant 😍',
#          '#tiktok ở nhà vẫn vui',
#          '# birthday ngày mai có tiệc ^^',
#          'aslkdhlakd#tiktok#learn asljdalskjd',
#          '#tiktok   #learn',
#          '#hastag alskjdlasjd #hastag asdsadas #hastag 😁',
#          '#123&456',
#          '#!?@!']

# cleaner = TextCleaner()
# for t in cleaner.fit_transform(texts):
#     print(t.strip())

#Export file

In [35]:
testpredict = lr.predict(xtest_basef)
# for i in testpredict:
#   print(i)

In [36]:
number = list()
f = open("/content/Corpus-Lingistic-CS321/file text/Test.txt", 'r', encoding ='UTF8')
tem = f.readlines()
f.close()
for i in range(0, len(tem), 4):
  number.append(tem[i])
# print(number)

In [25]:
value = ['negative', 'neutral', 'positive']

##Export csv file

In [37]:
f = open('/content/test.txt', 'w', encoding='UTF-8')
for i in range(0,len(sentences_review)):
  f.write(number[i])
  f.write(sentences_review[i])
  f.write('\n')
  countlabel = 0
  count = 0
  for j in range(0,len(testpredict[i])):
    if testpredict[i][j] != 0:
      countlabel += 1
  if countlabel != 0:
    temp = ''
    for j in range(0, len(testpredict[i])):
      if testpredict[i][j] == 1:
        temp = '{'+ aspects[j] + ', ' + value[0] + '}'
        count += 1
      elif testpredict[i][j] == 2:
        temp = '{'+ aspects[j] + ', ' + value[1] + '}'
        count += 1
      elif testpredict[i][j] == 3:
        temp = '{'+ aspects[j] + ', ' + value[2] + '}'
        count += 1
      if count < countlabel:
        temp += ','
    f.write(temp)
  f.write('\n')
  f.write('\n')
f.close()

##Export 10 text files

In [28]:
from google.colab import files

In [38]:
f = open('/content/test.txt', 'r', encoding='UTF-8')
test_file = f.readlines()
f.close()
num = 1
tem = []
count = 0
for idx in test_file:
  tem.append(idx)
  count += 1
  if count == 800:
    with open('{}.txt'.format(num),'w', encoding='utf-8') as f:
      for i in tem:
        f.write(i)
    files.download('{}.txt'.format(num))
    num += 1
    count = 0
    tem = []
    continue
with open('{}.txt'.format(num),'w', encoding='utf-8') as f:
    for i in tem:
       f.write(i)
files.download('{}.txt'.format(num))
f.close()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>