In [1]:
import pandas as pd
import numpy as np
from underthesea import word_tokenize, pos_tag, sent_tokenize
import regex
import string



### Read and overview

In [2]:
df = pd.read_csv('./input_data/data/Products_ThoiTrangNam_comments_20K.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    20000 non-null  int64 
 1   category      20000 non-null  object
 2   sub_category  20000 non-null  object
 3   user          20000 non-null  object
 4   rating        20000 non-null  int64 
 5   comment       20000 non-null  object
dtypes: int64(2), object(4)
memory usage: 937.6+ KB


In [4]:
### Preprocess data

In [5]:
from mds5.analyzer.processor.text import TextProcessor

In [6]:
processor = TextProcessor()

In [7]:
df_negative = df[df.rating < 3].sample(n=1000)
df_positive = df[df.rating > 4].sample(n=1000)
df = pd.concat([df_negative, df_positive])

### Lower text

In [8]:
df.comment = df.comment.str.lower()

### Check duplicate

In [9]:
df.comment.duplicated().sum()

491

In [10]:
df.drop_duplicates(subset='comment', inplace=True)

TypeError: drop_duplicates() got an unexpected keyword argument 'subset'

In [None]:
df.info()

In [None]:
df.reset_index(drop=True, inplace=True)

In [None]:
df.head()

### Handle mising value

In [None]:
df.info()

* No mising value

### Remove `’` in text

In [None]:
df.comment = df.comment.str.replace('’','')

### Remove multiple dot to 1 dot

In [None]:
df.comment = df.comment.apply(lambda x: regex.sub(r'\.+','.',x))

### Clean text

In [None]:
df.comment = df.comment.apply(lambda x: processor.process_text(x))

### Convert to unicode

In [None]:
df.comment = df.comment.apply(lambda x: processor.covert_unicode(x))

## Filter postagging

In [None]:
df.comment = df.comment.apply(lambda x: processor.process_postag_thesea(x))

### Remove number

In [17]:
df.comment = df.comment.apply(lambda x: regex.sub(r'\d+','',x))

### Feature Enginerring

In [31]:
df.head()

Unnamed: 0,product_id,category,sub_category,user,rating,comment
16870,554,Thời Trang Nam,Đồ Bộ,meo_moon12345,2,", chất lượng sản phẩm kém"
19318,349,Thời Trang Nam,Áo Vest và Blazer,vungocnha12,1,chất lượng sản phẩm rất kém
18358,286,Thời Trang Nam,Vớ/ Tất,klt_80,1,", chất lượng sản phẩm rất kém"
16582,45,Thời Trang Nam,"Áo Hoodie, Áo Len & Áo Nỉ",nakut3vn,2,đặt xanh dương gửi màu này. dẫu biết thực tế s...
18602,15,Thời Trang Nam,Áo,heopic,1,"mua c đóng gói thành c , có cái bịch th cũng t..."


In [32]:
df['label'] = df.rating.apply(lambda x: 1 if x > 3 else 0)

### Visualize

In [33]:
## Trực quan hóa cho label = 1 sử dụng wordcloud

In [None]:
## Trực quan hóa cho label = 0 sử dụng wordcloud

### Vectorize

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
vectorizer = TfidfVectorizer()


In [20]:
vectorized_data = vectorizer.fit_transform(df['comment']).toarray()


In [21]:
vectorizer.get_feature_names_out()

array(['_lúc', '_shop', '_đánh', ..., 'ủng', 'ức', 'ứng'], dtype=object)

In [22]:
vectorizer_df = pd.DataFrame(vectorized_data, 
                             columns=vectorizer.get_feature_names_out())

In [23]:
vectorizer_df

Unnamed: 0,_lúc,_shop,_đánh,aaaa,abcefghacvxhvghffyfjfhguhghfygugtyf,about,ace,actually,admin,ae,...,ốm,ống,ổn,ổng,ổnnn,ủa,ủi,ủng,ức,ứng
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.267993,0.0,0.0
1506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173742,...,0.0,0.0,0.134385,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


### Build model

In [29]:
from sklearn.model_selection import train_test_split

In [34]:
X = vectorizer_df
y = df['label']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
model = LogisticRegression()

In [41]:
model.fit(X_train, y_train)

In [42]:
result = model.predict(X_test)

### Evaluation model

In [43]:
from sklearn.metrics import confusion_matrix, classification_report

In [45]:
train_acc = model.score(X_train,y_train)
test_acc = model.score(X_test,y_test)
print(f'Train accurracy: {train_acc}')
print(f'Train accurracy: {test_acc}')

Train accurracy: 0.9461474730737366
Train accurracy: 0.8509933774834437


In [46]:
confusion_matrix(y_test, result)

array([[147,  18],
       [ 27, 110]], dtype=int64)

In [49]:
print(classification_report(y_test, result))

              precision    recall  f1-score   support

           0       0.84      0.89      0.87       165
           1       0.86      0.80      0.83       137

    accuracy                           0.85       302
   macro avg       0.85      0.85      0.85       302
weighted avg       0.85      0.85      0.85       302

