In [1]:
!pip install pythainlp



In [2]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1iqQN_PkvIpVVYYtDsW1uqP42s10wPzOu' -O shopping-comment.csv

--2024-07-24 18:03:41--  https://docs.google.com/uc?export=download&id=1iqQN_PkvIpVVYYtDsW1uqP42s10wPzOu
Resolving docs.google.com (docs.google.com)... 2404:6800:4001:801::200e, 216.58.221.206
Connecting to docs.google.com (docs.google.com)|2404:6800:4001:801::200e|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1iqQN_PkvIpVVYYtDsW1uqP42s10wPzOu&export=download [following]
--2024-07-24 18:03:42--  https://drive.usercontent.google.com/download?id=1iqQN_PkvIpVVYYtDsW1uqP42s10wPzOu&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 2404:6800:4001:807::2001, 142.250.199.1
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|2404:6800:4001:807::2001|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 62645 (61K) [application/octet-stream]
Saving to: ‘shopping-comment.csv’


2024-07-24 18:03:45 (851 KB/s) - ‘shopping-comment.csv’

# Logistic Regression

In [3]:
import pandas as pd
import numpy as np
from pythainlp.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Load the data
# Reads the CSV saved as 'shopping-comment.csv'; change the path to use another dataset.
data = pd.read_csv('shopping-comment.csv')
X = data['text'].astype(str)  # comment text from the 'text' column, coerced to str
y = data['class']  # labels from the 'class' column


# Step 2: Create bag-of-words representation
# Each comment is first split into a list of tokens, then CountVectorizer turns
# every token list into a row of token counts (bag-of-words, "bow").

# Tokenize Thai text; keep_whitespace=False is passed through to word_tokenize
X_tokenized = X.apply(lambda comment: word_tokenize(comment, keep_whitespace=False))

# The documents are already token lists, so the analyzer returns them unchanged
vectorizer = CountVectorizer(analyzer=lambda tokens: tokens)
X_bow = vectorizer.fit_transform(X_tokenized)  # sparse document-term count matrix
print(X_bow.shape)  # (documents, vocab)

# Vocabulary as an array, so a column index of X_bow can be mapped back to its token
vocab = np.array(vectorizer.get_feature_names_out())
print(vocab.shape)
print(vocab[250:270])  # peek at a slice of the learned vocabulary


(1190, 1642)
(1642,)
['งุด' 'ง่วง' 'ง่าย' 'จ' 'จง' 'จดจำ' 'จน' 'จนกว่า' 'จนถึง' 'จบ' 'จม'
 'จรจัด' 'จริง' 'จริงใจ' 'จริงๆ' 'จอ' 'จอมปลอม' 'จะ' 'จัง' 'จังหวะ']


In [4]:
# Inspect the document at index 2: its token list, then its sparse bag-of-words row
print(X_tokenized[2])
print(X_bow[2])
# Map two column indices that appear in that row back to their vocabulary tokens
print(vocab[1021], vocab[1069], sep="\n")

['สินค้า', 'หมด', 'ทำไม', 'ไม่', 'แจ้ง', 'ขึ้น', 'ว่า', 'หมด', 'อะ', 'กด', 'ใส่', 'ตะกร้า', 'ไป', 'เถอะ', 'เซ็ง', 'เรย']
  (0, 1615)	1
  (0, 1021)	1
  (0, 1069)	2
  (0, 483)	1
  (0, 1464)	1
  (0, 121)	1
  (0, 934)	1
  (0, 1159)	1
  (0, 8)	1
  (0, 1589)	1
  (0, 387)	1
  (0, 1611)	1
  (0, 1283)	1
  (0, 1263)	1
  (0, 1353)	1
สินค้า
หมด


In [5]:
# Step 3: Split the dataset (20% held out for testing; fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Train the logistic regression classifier (default hyperparameters)
logreg_classifier = LogisticRegression().fit(X_train, y_train)

# Step 5: Evaluate the model on the held-out test split
y_pred = logreg_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8487394957983193


In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Classification report (precision, recall, F1-score, support) for the test-split predictions
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix of the true test labels against the predictions
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       135
           1       0.94      0.70      0.80       103

    accuracy                           0.85       238
   macro avg       0.87      0.83      0.84       238
weighted avg       0.86      0.85      0.84       238

Confusion Matrix:
[[130   5]
 [ 31  72]]


In [7]:
# Predict New Data: Example 1
# An unseen comment goes through the same steps as the training data:
# tokenize -> bag-of-words with the already-fitted vectorizer -> predict.

# Alternative input to try:
#new_text = ["สินค้าหมด ทำไมไม่แจ้ง ขึ้นว่าหมดอ่ะ"]
new_text = pd.Series(["ของไม่ตรงปก ส่งช้า แพ็คไม่ดี"])
new_text_tokenized = new_text.apply(lambda comment: word_tokenize(comment, keep_whitespace=False))
# transform (not fit_transform): reuse the vocabulary learned when the vectorizer was fitted
new_text_bow = vectorizer.transform(new_text_tokenized)

predicted_class = logreg_classifier.predict(new_text_bow)
print("Predicted Class: ", predicted_class[0])

print(new_text_tokenized)
print(new_text_bow)
# Uncomment to see the dense count vector instead of the sparse listing:
#print(new_text_bow.toarray())

Predicted Class:  0
0    [ของ, ไม่, ตรง, ปก, ส่ง, ช้า, แพ็ค, ไม่, ดี]
dtype: object
  (0, 88)	1
  (0, 325)	1
  (0, 351)	1
  (0, 374)	1
  (0, 604)	1
  (0, 1045)	1
  (0, 1494)	1
  (0, 1615)	2


In [8]:
# Predict New Data: Example 2
# Classify another unseen comment with the fitted logistic-regression model.

# Alternative input to try:
#new_text = ["ส่งรวดเร็ว แม่ค้าให้ข้อมูลครบดี"]
new_text = pd.Series(["สินค้าดีมีคุณภาพ ชอบค่ะ แนะนำเลย"])
new_text_tokenized = new_text.apply(lambda comment: word_tokenize(comment, keep_whitespace=False))
# transform (not fit_transform): reuse the vocabulary learned when the vectorizer was fitted
new_text_bow = vectorizer.transform(new_text_tokenized)

predicted_class = logreg_classifier.predict(new_text_bow)
print("Predicted Class: ", predicted_class[0])

print(new_text_tokenized)
print(new_text_bow)


Predicted Class:  1
0    [สินค้า, ดี, มี, คุณภาพ, ชอบ, ค่ะ, แนะนำ, เลย]
dtype: object
  (0, 227)	1
  (0, 236)	1
  (0, 299)	1
  (0, 351)	1
  (0, 762)	1
  (0, 1021)	1
  (0, 1363)	1
  (0, 1477)	1


# Naive Bayes

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from pythainlp.tokenize import word_tokenize

# Step 1: Load the data
# Reads the CSV saved as 'shopping-comment.csv'; change the path to use another dataset.
# NOTE: this cell rebinds data, X, y, X_tokenized, X_bow, the train/test splits, y_pred and accuracy.
data = pd.read_csv('shopping-comment.csv')
X = data['text'].astype(str)  # comment text from the 'text' column, coerced to str
y = data['class']  # labels from the 'class' column

# Tokenize Thai text; keep_whitespace=False is passed through to word_tokenize
X_tokenized = X.apply(lambda comment: word_tokenize(comment, keep_whitespace=False))

# Step 2: Create bag-of-words representation
# The documents are already token lists, so the analyzer returns them unchanged
vectorizer2 = CountVectorizer(analyzer=lambda tokens: tokens)
X_bow = vectorizer2.fit_transform(X_tokenized)
print(X_bow.shape)  # (documents, vocab)

# Step 3: Split the dataset (20% held out for testing; fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Train the multinomial Naive Bayes classifier on the token counts
nb_classifier = MultinomialNB().fit(X_train, y_train)

# Step 5: Evaluate the model on the held-out test split
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


(1190, 1642)
Accuracy: 0.8529411764705882


In [10]:
# Classify several unseen comments in one batch with the Naive Bayes model
new_text = pd.Series([
    "ของไม่ตรงปก ส่งช้า แพ็คไม่",
    "สินค้าหมด ทำไมไม่แจ้ง ขึ้นว่าหมดอ่ะ",
    "ส่งรวดเร็ว แม่ค้าให้ข้อมูลครบดี",
    "สินค้าดีราคาไม่แพง ชอบค่ะ แนะนำเลย",
])
new_text_tokenized = new_text.apply(lambda comment: word_tokenize(comment, keep_whitespace=False))
# transform (not fit_transform): reuse the vocabulary learned when vectorizer2 was fitted
new_text_bow = vectorizer2.transform(new_text_tokenized)

predicted_class = nb_classifier.predict(new_text_bow)  # one predicted label per comment
print("Predicted Class:", predicted_class)

print(new_text_tokenized)
print(new_text_bow)

Predicted Class: [0 0 1 1]
0             [ของ, ไม่, ตรง, ปก, ส่ง, ช้า, แพ็ค, ไม่]
1    [สินค้า, หมด, ทำไม, ไม่, แจ้ง, ขึ้น, ว่า, หมด,...
2         [ส่ง, รวดเร็ว, แม่ค้า, ให้, ข้อมูล, ครบ, ดี]
3    [สินค้า, ดี, ราคา, ไม่, แพง, ชอบ, ค่ะ, แนะนำ, ...
dtype: object
  (0, 88)	1
  (0, 325)	1
  (0, 374)	1
  (0, 604)	1
  (0, 1045)	1
  (0, 1494)	1
  (0, 1615)	2
  (1, 121)	1
  (1, 483)	1
  (1, 934)	1
  (1, 1021)	1
  (1, 1069)	2
  (1, 1196)	1
  (1, 1464)	1
  (1, 1615)	1
  (2, 351)	1
  (2, 811)	1
  (2, 1045)	1
  (2, 1594)	1
  (3, 236)	1
  (3, 299)	1
  (3, 351)	1
  (3, 844)	1
  (3, 1021)	1
  (3, 1363)	1
  (3, 1477)	1
  (3, 1493)	1
  (3, 1615)	1
