In [1]:
import pandas as pd
import numpy as np
from underthesea import word_tokenize, pos_tag, sent_tokenize
import regex
import string



### Read and overview

In [2]:
# Load the product-comment dataset from its CSV file into a DataFrame.
df = pd.read_csv(
    './input_data/data/Products_ThoiTrangNam_comments_20K.csv'
)

In [3]:
# Print column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    20000 non-null  int64 
 1   category      20000 non-null  object
 2   sub_category  20000 non-null  object
 3   user          20000 non-null  object
 4   rating        20000 non-null  int64 
 5   comment       20000 non-null  object
dtypes: int64(2), object(4)
memory usage: 937.6+ KB


In [4]:
### Preprocess data

In [5]:
from mds5.analyzer.processor.text import TextProcessor

In [6]:
# Instantiate the project's text-processing helper with its default arguments;
# the cleaning cells further down call its methods on every comment.
processor = TextProcessor()

In [7]:
# Build a balanced working set: 1000 low-rated comments (rating < 3) and
# 1000 top-rated comments (rating > 4); ratings 3 and 4 are left out.
# A fixed random_state makes the sample -- and every count and metric derived
# from it further down -- reproducible after a kernel restart.
df_negative = df[df.rating < 3].sample(n=1000, random_state=42)
df_positive = df[df.rating > 4].sample(n=1000, random_state=42)
df = pd.concat([df_negative, df_positive])

### Lowercase text

In [8]:
# Normalise case so identical words are not counted as different tokens.
df['comment'] = df['comment'].str.lower()

### Check duplicates

In [9]:
# Number of rows whose comment text already appeared in an earlier row.
df.duplicated(subset=['comment']).sum()

492

In [10]:
# Keep only the first occurrence of each distinct comment text (modifies df
# in place).
df.drop_duplicates(inplace=True, subset=['comment'])

In [11]:
# Re-check row count and dtypes after removing duplicated comments.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1508 entries, 17075 to 5614
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    1508 non-null   int64 
 1   category      1508 non-null   object
 2   sub_category  1508 non-null   object
 3   user          1508 non-null   object
 4   rating        1508 non-null   int64 
 5   comment       1508 non-null   object
dtypes: int64(2), object(4)
memory usage: 82.5+ KB


In [12]:
# Replace the leftover original row labels with a fresh 0..n-1 index and
# discard the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)

In [13]:
# Preview the first rows of the de-duplicated, re-indexed frame.
df.head()

Unnamed: 0,product_id,category,sub_category,user,rating,comment
0,74,Thời Trang Nam,Quần jeans,ndduc190192,2,"size 34 mà quá nhỏ, ko đúng size. shop ko nhiệ..."
1,687,Thời Trang Nam,Quần Short,baotuyen123456789,2,", chất lượng sản phẩm kém"
2,1030,Thời Trang Nam,Áo,t*****1,1,"size l thì 58kg mặc vừa, tôi 53 kg đây các bạn..."
3,31,Thời Trang Nam,Áo,nguyenthanhnam18,1,", chất lượng sản phẩm rất kém"
4,1208,Thời Trang Nam,Áo,pngan2109,1,vải chán thức sự luôn ý mình bỏ tiền ra mua mà...


### Handle missing values

In [14]:
# Compare the non-null count of each column with the row count to spot
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1508 entries, 0 to 1507
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_id    1508 non-null   int64 
 1   category      1508 non-null   object
 2   sub_category  1508 non-null   object
 3   user          1508 non-null   object
 4   rating        1508 non-null   int64 
 5   comment       1508 non-null   object
dtypes: int64(2), object(4)
memory usage: 70.8+ KB


* No missing values

### Remove `’` from text

In [15]:
# Strip the typographic apostrophe (’) from every comment. regex=False makes
# this an explicit literal replacement, independent of the default value of
# `regex` in the installed pandas version.
df.comment = df.comment.str.replace('’', '', regex=False)

### Collapse repeated dots into a single dot

In [16]:
# Turn every run of one or more dots ("...", "..") into a single ".".
df['comment'] = df['comment'].map(
    lambda text: regex.sub(r'\.+', '.', text)
)

### Clean text

In [17]:
# Run the project's general text-cleaning routine on each comment.
# NOTE(review): the exact cleaning rules live in TextProcessor.process_text,
# which is not visible here -- confirm against its implementation.
df['comment'] = df['comment'].apply(processor.process_text)

### Convert to unicode

In [18]:
# Apply the processor's Unicode conversion to each comment.
# NOTE(review): `covert_unicode` is the method name as spelled in the project
# API; presumably it normalises Vietnamese characters -- confirm.
df['comment'] = df['comment'].apply(processor.covert_unicode)

### Filter by POS tagging

In [19]:
# Pass each comment through the processor's POS-tag based filter.
# NOTE(review): which tags are kept is decided inside
# TextProcessor.process_postag_thesea -- confirm against its implementation.
df['comment'] = df['comment'].apply(processor.process_postag_thesea)

### Remove number

In [20]:
# Delete every run of digits from the comment text.
df['comment'] = df['comment'].map(
    lambda text: regex.sub(r'\d+', '', text)
)

### Feature Engineering

In [21]:
# Preview the comments after all cleaning steps have been applied.
df.head()

Unnamed: 0,product_id,category,sub_category,user,rating,comment
0,74,Thời Trang Nam,Quần jeans,ndduc190192,2,quá nhỏ đúng ko nhiệt_tình lắm
1,687,Thời Trang Nam,Quần Short,baotuyen123456789,2,chất_lượng kém
2,1030,Thời Trang Nam,Áo,t*****1,1,l mặc vừa cũng chẳng cạn cho chuyên đi xin
3,31,Thời Trang Nam,Áo,nguyenthanhnam18,1,chất_lượng rất kém
4,1208,Thời Trang Nam,Áo,pngan2109,1,chán thức luôn ý bỏ ra mua mỏng dễ xù không_hà...


In [22]:
# Binary sentiment target: 1 when the rating is above 3, otherwise 0.
df['label'] = df['rating'].gt(3).astype('int64')

### Visualize

In [23]:
## Visualize label = 1 with a word cloud

In [24]:
## Visualize label = 0 with a word cloud

### Vectorize

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()


In [27]:
# Learn the TF-IDF vocabulary from the comments and convert the resulting
# sparse matrix into a dense array.
# NOTE(review): the vectorizer is fitted on every comment before the
# train/test split performed later, so held-out comments influence the
# learned vocabulary and IDF weights.
vectorized_data = (
    vectorizer
    .fit_transform(df['comment'])
    .toarray()
)


In [28]:
# Show the vocabulary terms learned by the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['_shop', 'a_giao', 'adidas', ..., 'ủi', 'ủng_hộ', 'ứng'],
      dtype=object)

In [29]:
# Wrap the dense TF-IDF array in a DataFrame with one column per vocabulary
# term.
vectorizer_df = pd.DataFrame(
    data=vectorized_data,
    columns=vectorizer.get_feature_names_out(),
)

In [30]:
# Display the TF-IDF feature table.
vectorizer_df

Unnamed: 0,_shop,a_giao,adidas,ae,aff,ah,ai_ai,an_tâm,and,anh_em,...,ống_tay_áo,ổn,ổn_áp,ổn_đáng,ỗn,ới,ủa,ủi,ủng_hộ,ứng
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297589,0.0
1505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


### Build model

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Feature matrix (TF-IDF columns) and binary target used by the model below.
X, y = vectorizer_df, df['label']

In [33]:
# Hold out 20% of the rows for evaluation. random_state makes the split
# repeatable across runs and stratify=y keeps the 0/1 label ratio the same
# in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [36]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

In [37]:
# Predicted labels for the held-out rows; reused by the evaluation cells.
result = model.predict(X_test)

### Evaluate model

In [38]:
from sklearn.metrics import confusion_matrix, classification_report

In [39]:
# Mean accuracy of the fitted model on the training split versus the
# held-out split; a large gap between the two points to over-fitting.
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f'Train accuracy: {train_acc}')
# The second line reports the held-out score, so it is labelled "Test".
print(f'Test accuracy: {test_acc}')

Train accurracy: 0.9054726368159204
Train accurracy: 0.8377483443708609


In [40]:
# Confusion matrix of true test labels (rows) against predicted labels
# (columns).
confusion_matrix(y_test, result)

array([[156,  12],
       [ 37,  97]], dtype=int64)

In [41]:
# Per-class precision, recall and F1 on the held-out split. print() is
# needed because the report is returned as a pre-formatted string.
print(classification_report(y_test, result))

              precision    recall  f1-score   support

           0       0.81      0.93      0.86       168
           1       0.89      0.72      0.80       134

    accuracy                           0.84       302
   macro avg       0.85      0.83      0.83       302
weighted avg       0.84      0.84      0.84       302



### Build model with pipeline

In [43]:
from sklearn.pipeline import Pipeline


In [48]:
# Named pipeline steps: TF-IDF vectorisation followed by logistic regression.
_input = [
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression()),
]

In [49]:
# Chain the steps into a single estimator so the vectorizer is fitted only
# on whatever data is passed to fit(); the bare name displays the pipeline.
model_auto = Pipeline(_input)
model_auto

In [50]:
# Split the cleaned comment text (not the TF-IDF matrix) so the pipeline's
# vectorizer only ever sees the training portion. random_state makes the
# split repeatable and stratify=y keeps the 0/1 label ratio equal in both
# parts. test_size is left at scikit-learn's default.
# Note: this rebinds X_train / X_test / y_train / y_test from the earlier
# vectorised split.
X_train, X_test, y_train, y_test = train_test_split(
    df['comment'], y, stratify=y, random_state=42
)

In [51]:
# Fit the vectorizer and the classifier together on the training comments.
model_auto.fit(X_train, y_train)

In [52]:
# Predict labels for the held-out comments; the pipeline vectorises the raw
# strings itself before classifying them.
model_auto.predict(X_test)

array([0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,