### Feature extractions

This script extracted features from two sentimental corpora, kt4.0 (ours) and wisesight. By training from kt4.0 corpus, we expect to see an improvement in the wisesight corpus' classification performance.

For both datasets, random stratify hold-out was performed with 80:20 ratio for train and test set. Next, several feature extraction methods were applied and output as a joblib objects as follows:  

* Bag of words for unigram and bigrams
* TF-IDF for unigram and bigrams
* Word2Vec pretrained from Thai wiki. (100 dimension)
* POS_tagging with flatten dataframe for unigram and bigrams
* Dictionary-based with Thai positive and negative words  for unigram and bigrams

The output vectors will be carried out in the next experiment.  
pree.t@cmu.ac.th  

In [1]:
import pandas as pd
import numpy as np
import pythainlp
from pythainlp.ulmfit import process_thai

# for visualize
from matplotlib import pyplot as plt
plt.rcParams['font.family'] = 'tahoma'

## Load original datasets

In [2]:
import os
os.path.dirname(os.getcwd())

data_path_kt = os.path.dirname(os.getcwd()) + '\\data\kt4.0\\'
data_path_ws = os.path.dirname(os.getcwd()) + '\\data\wisesight\\'
model_path = os.path.dirname(os.getcwd()) + '\\model\\'
df_kt = pd.read_csv(data_path_kt + 'pantip_cleaned_1.csv')

# we use the original wisesight corpus and reconstruct a new dataframe
texts = []
targets = []

with open(str(data_path_ws) + '\\' + 'neg.txt', encoding='utf-8') as f:
    for line in f:
        texts.append(line.strip())
        targets.append('neg')

with open(str(data_path_ws) + '\\' + 'neu.txt', encoding='utf-8') as f:
    for line in f:
        texts.append(line.strip())
        targets.append('neu')

with open(str(data_path_ws) + '\\' + 'pos.txt', encoding='utf-8') as f:
    for line in f:
        texts.append(line.strip())
        targets.append('pos')

with open(str(data_path_ws) + '\\' + 'q.txt', encoding='utf-8') as f:
    for line in f:
        texts.append(line.strip())
        targets.append('q')
        
df_ws = pd.DataFrame({'texts': texts, 'targets': targets})
df_ws.to_csv(os.path.dirname(os.getcwd()) + '\\' + 'wisesight.csv', index=False)
df_kt.shape, df_ws.shape

((60081, 14), (26737, 2))

In [3]:
df_kt.head(10)

Unnamed: 0,post_id,post_date,user_id,user_name,text,tag,emotion,length,num_sent,sent_length,label,label_1,label_2,vote
0,39839097,2020-04-25 13:24:00,https://pantip.com/profile/5798163,สมาชิกหมายเลข 5798163,[CR] แปังพัฟคุมมัน จัดเต็มเนื้อบางเบา,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,36,3,14,2,2,2,pos
1,39839097,2020-04-25 13:24:00,https://pantip.com/profile/5798163,สมาชิกหมายเลข 5798163,ไม่อุดตัน แต่ปกปิดแน่นมาก,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,36,3,8,2,2,2,pos
2,39839097,2020-04-25 13:24:00,https://pantip.com/profile/5798163,สมาชิกหมายเลข 5798163,รีวิวแป้ง Lady Audrey Ready All Day จ้า,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,36,3,14,2,2,1,pos
3,39838736,2020-04-25 10:52:00,https://pantip.com/profile/5730006,สมาชิกหมายเลข 5730006,ขอบตาดำมากค่ะ คอร์เล็คเตอร์ก็เอาไม่อยู่,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,15,2,13,1,3,3,neg
4,39837384,2020-04-24 20:39:00,https://pantip.com/profile/4975838,สมาชิกหมายเลข 4975838,เอาaloe Vera แช่ตู้เย็น จนกลายเป็นน้ำแข็ง,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,11,1,11,1,1,3,neu
5,39838990,2020-04-25 12:36:00,https://pantip.com/profile/5655853,chdewxx,[SR] ไอเทม #เซรั่มสิว ลดสิว สิวอุดตัน สิวผด บำ...,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,29,1,29,2,2,2,pos
6,39838619,2020-04-25 10:01:00,https://pantip.com/profile/5656639,คูจองยอนและวีรยา,รบกวนสาวๆช่วยแนะนำสกินแคร์ ที่ช่วยให้ผิวหน้าขา...,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,23,1,23,2,2,1,pos
7,39837266,2020-04-24 19:58:00,https://pantip.com/profile/632132,หมูกลมอารมณ์ดี,ทดลองใช้ แครอทวิตซีหน้าใส,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,14,2,9,1,2,2,pos
8,39837266,2020-04-24 19:58:00,https://pantip.com/profile/632132,หมูกลมอารมณ์ดี,ใน 1 สัปดาห์,เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,14,2,5,1,1,1,neu
9,39835926,2020-04-24 12:03:00,https://pantip.com/profile/3826851,สมาชิกหมายเลข 3826851,"วิธีเลือก ""รองพื้น"" และ ""คอนซีลเลอร์""",เครื่องสำอาง,ถูกใจ 0 ขำกลิ้ง 0 หลงรัก 0 ซึ้ง 0 สยอง 0 ทึ่ง 0,14,1,14,2,2,2,pos


In [4]:
df_kt.describe()

Unnamed: 0,post_id,length,num_sent,sent_length,label,label_1,label_2
count,60081.0,60081.0,60081.0,60081.0,60081.0,60081.0,60081.0
mean,39649360.0,116.994574,8.502172,13.978329,1.577304,1.362644,1.662156
std,155991.9,118.647716,7.575442,12.083572,0.777527,0.639271,0.800034
min,39172830.0,3.0,1.0,3.0,1.0,1.0,1.0
25%,39587550.0,31.0,3.0,6.0,1.0,1.0,1.0
50%,39689290.0,72.0,6.0,10.0,1.0,1.0,1.0
75%,39769470.0,159.0,11.0,17.0,2.0,2.0,2.0
max,39839700.0,499.0,44.0,301.0,3.0,3.0,3.0


In [5]:
df_ws.head(10)

Unnamed: 0,texts,targets
0,☹️,neg
1,😔,neg
2,😞,neg
3,😥,neg
4,รำ,neg
5,Noๆ,neg
6,Rip,neg
7,T_T,neg
8,กาก,neg
9,โกง,neg


In [6]:
df_ws.describe()

Unnamed: 0,texts,targets
count,26737,26737
unique,26713,4
top,อุดรมีไหมค่ะ,neu
freq,2,14561


# Train-test split 80/20

In [7]:
# random stratified split train and test set 80/20
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
all_df_kt, test_df_kt = train_test_split(df_kt, test_size=0.2, random_state=42, shuffle = True)
all_df_kt.shape, test_df_kt.shape

((48064, 14), (12017, 14))

In [8]:
# class distribution
all_df_kt.vote.value_counts() / all_df_kt.shape[0]

neu    0.632136
pos    0.206620
neg    0.161243
Name: vote, dtype: float64

In [9]:
all_df_ws, test_df_ws = train_test_split(df_ws, test_size=0.2, random_state=42, shuffle = True)
all_df_ws.shape, test_df_ws.shape

((21389, 2), (5348, 2))

In [10]:
# class distribution
all_df_ws.targets.value_counts() / all_df_ws.shape[0]

neu    0.544860
neg    0.253588
pos    0.179345
q      0.022208
Name: targets, dtype: float64

In [11]:
# clean and word tokenize
all_df_kt['processed'] = all_df_kt['text'].apply(str).apply(process_thai)
test_df_kt['processed'] = test_df_kt['text'].apply(str).apply(process_thai)

all_df_ws['processed'] = all_df_ws['texts'].apply(str).apply(process_thai)
test_df_ws['processed'] = test_df_ws['texts'].apply(str).apply(process_thai)

In [12]:
all_df_kt['vote'].to_csv(os.path.dirname(os.getcwd()) + '\\' + 'train_label_kt.csv',  index=False)
# test_df_kt['vote'].to_csv(os.path.dirname(os.getcwd()) + '\\' + 'test_label_kt.csv',  index=False)

all_df_ws['targets'].to_csv(os.path.dirname(os.getcwd()) + '\\' + 'train_label_ws.csv', index=False)
# test_df_ws['targets'].to_csv(os.path.dirname(os.getcwd()) + '\\' + 'test_label_ws.csv', index=False)

## Bag of words 

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

# BOW with unigram and bigrams
bow1 = CountVectorizer(ngram_range=(1, 1))
bow2 = CountVectorizer(ngram_range=(2, 2))

# fit kt and transfrom on train set
bow1_fit_kt = bow1.fit(all_df_kt['processed'].apply(str))
text_all_bow1_kt = bow1_fit_kt.transform(all_df_kt['processed'].apply(str))
text_all_bow1_ws = bow1_fit_kt.transform(all_df_ws['processed'].apply(str))

bow2_fit_kt = bow2.fit(all_df_kt['processed'].apply(str))
text_all_bow2_kt = bow2_fit_kt.transform(all_df_kt['processed'].apply(str))
text_all_bow2_ws = bow2_fit_kt.transform(all_df_ws['processed'].apply(str))

joblib.dump(text_all_bow1_kt, model_path+'text_all_bow1_kt.joblib')
joblib.dump(text_all_bow1_ws, model_path+'text_all_bow1_ws.joblib')
joblib.dump(text_all_bow2_kt, model_path+'text_all_bow2_kt.joblib')
joblib.dump(text_all_bow2_ws, model_path+'text_all_bow2_ws.joblib')

# bow1 = CountVectorizer(ngram_range=(1, 1))
# bow2 = CountVectorizer(ngram_range=(2, 2))

# # fit kt and transfrom on test set
# bow1_fit_kt = bow1.fit(test_df_kt['processed'].apply(str))
# text_test_bow1_kt = bow1_fit_kt.transform(test_df_kt['processed'].apply(str))
# text_test_bow1_ws = bow1_fit_kt.transform(test_df_ws['processed'].apply(str))

# bow2_fit_kt = bow2.fit(all_df_kt['processed'].apply(str))
# text_test_bow2_kt = bow2_fit_kt.transform(test_df_kt['processed'].apply(str))
# text_test_bow2_ws = bow2_fit_kt.transform(test_df_ws['processed'].apply(str))

# joblib.dump(text_test_bow1_kt, model_path+'text_test_bow1_kt.joblib')
# joblib.dump(text_test_bow1_ws, model_path+'text_test_bow1_ws.joblib')
# joblib.dump(text_test_bow2_kt, model_path+'text_test_bow2_kt.joblib')
# joblib.dump(text_test_bow2_ws, model_path+'text_test_bow2_ws.joblib')

['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_bow2_ws.joblib']

## TF-IDF

In [14]:
# TF-IDF with unigram and bigrams
tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

# fit kt and transfrom on train set
tfidf1_fit_kt = tfidf1.fit(all_df_kt['processed'].apply(str))
text_all_tfidf1_kt = tfidf1_fit_kt.transform(all_df_kt['processed'].apply(str))
text_all_tfidf1_ws = tfidf1_fit_kt.transform(all_df_ws['processed'].apply(str))

tfidf2_fit_kt = tfidf2.fit(all_df_kt['processed'].apply(str))
text_all_tfidf2_kt = tfidf2_fit_kt.transform(all_df_kt['processed'].apply(str))
text_all_tfidf2_ws = tfidf2_fit_kt.transform(all_df_ws['processed'].apply(str))

joblib.dump(text_all_tfidf1_kt, model_path+'text_all_tfidf1_kt.joblib')
joblib.dump(text_all_tfidf1_ws, model_path+'text_all_tfidf1_ws.joblib')
joblib.dump(text_all_tfidf2_kt, model_path+'text_all_tfidf2_kt.joblib')
joblib.dump(text_all_tfidf2_ws, model_path+'text_all_tfidf2_ws.joblib')

# tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
# tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

# # fit kt and transfrom on test set
# tfidf1_fit_kt = tfidf1.fit(test_df_kt['processed'].apply(str))
# text_test_tfidf1_kt = tfidf1_fit_kt.transform(test_df_kt['processed'].apply(str))
# text_test_tfidf1_ws = tfidf1_fit_kt.transform(test_df_ws['processed'].apply(str))

# tfidf2_fit_kt = tfidf2.fit(all_df_kt['processed'].apply(str))
# text_test_tfidf2_kt = tfidf2_fit_kt.transform(test_df_kt['processed'].apply(str))
# text_test_tfidf2_ws = tfidf2_fit_kt.transform(test_df_ws['processed'].apply(str))

# joblib.dump(text_test_tfidf1_kt, model_path+'text_test_tfidf1_kt.joblib')
# joblib.dump(text_test_tfidf1_ws, model_path+'text_test_tfidf1_ws.joblib')
# joblib.dump(text_test_tfidf2_kt, model_path+'text_test_tfidf2_kt.joblib')
# joblib.dump(text_test_tfidf2_ws, model_path+'text_test_tfidf2_ws.joblib')

['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_tfidf2_ws.joblib']

## Word2vec

In [15]:
from collections import Counter, defaultdict

class TfidfEmbeddingVectorizer(object):
    def __init__(self, model):
        w2v = {w: vec for w, vec in zip(model.wv.index_to_key, model.wv.vectors)}
        self.word2vec = w2v
        self.word2weight = None
        self.dim = model.vector_size
    
    
    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [16]:
import gensim
from pythainlp import word_vector
from gensim.models import Word2Vec, KeyedVectors

# create word2vec for kt corpus
w2v_kt = Word2Vec(vector_size=100, min_count=1, window = 5, workers=4)
w2v_kt.build_vocab(all_df_kt['processed'])
total_examples = w2v_kt.corpus_count

w2v_kt.train(all_df_kt['processed'], total_examples=total_examples, epochs=50)
w2v_kt.wv.most_similar("บะหมี่")

[('สำเร็จรูป', 0.6994114518165588),
 ('กึ่ง', 0.6770331263542175),
 ('หัวไชเท้า', 0.6058409214019775),
 ('ไส้กรอก', 0.586573600769043),
 ('ดอง', 0.5776714086532593),
 ('ต้มยำ', 0.5749776363372803),
 ('คะน้า', 0.5620478391647339),
 ('ยำ', 0.5589373111724854),
 ('ขนม', 0.556253969669342),
 ('หมูสับ', 0.5471652150154114)]

In [17]:
import dill # we use dill instead of joblim because the lambda and dependecie in class TfidfEmbeddingVectorizer
# create embbed vector
w2v_tfidf_emb_kt = TfidfEmbeddingVectorizer(w2v_kt)
w2v_tifdf_fit_kt = w2v_tfidf_emb_kt.fit(all_df_kt['processed'])

# transfrom on the trai and test set for both corpuses
text_all_w2v_tifdf_kt = w2v_tifdf_fit_kt.transform(all_df_kt['processed'])
text_all_w2v_tifdf_ws = w2v_tifdf_fit_kt.transform(all_df_ws['processed'])

# text_test_w2v_tifdf_kt = w2v_tifdf_fit_kt.transform(test_df_kt['processed'])
# text_test_w2v_tifdf_ws = w2v_tifdf_fit_kt.transform(test_df_ws['processed'])

dill.dump(text_all_w2v_tifdf_kt, open(model_path+'text_all_w2v_tifdf_kt.dill', 'wb'))
dill.dump(text_all_w2v_tifdf_ws, open(model_path+'text_all_w2v_tifdf_ws.dill', 'wb'))

# dill.dump(text_test_w2v_tifdf_kt, open(model_path+'text_test_w2v_tifdf_kt.dill', 'wb'))
# dill.dump(text_test_w2v_tifdf_ws, open(model_path+'text_test_w2v_tifdf_ws.dill', 'wb'))

In [18]:
# todo: use pretrained from thai2fit
#w2v_thwiki = word_vector.get_model()
#w2v_model_ws.build_vocab(w2v_thwiki.index_to_key, update=True)
#w2v_model_ws.wv.vectors_lockf = np.ones(len(w2v_model_ws.wv))
#w2v_model_ws.wv.intersect_word2vec_format('thai2vec.bin', binary=True, lockf=1.0)

## POS Tagging


In [19]:
def flatten(x):
    l = list(sum(x, ()))
    return l

In [22]:
from pythainlp.tag import pos_tag_sents
# we used a POS tag with the orchid_ud feature that represented a type of word in a sentence in one-hot vector form
# flatten the list of tuple in series was applied for feature vectors
all_df_kt['POSTags'] = pos_tag_sents(all_df_kt['processed'].tolist(), corpus='orchid_ud')
all_df_kt['POSTags'] = all_df_kt['POSTags'].apply(flatten)

# test_df_kt['POSTags'] = pos_tag_sents(test_df_kt['processed'].tolist(), corpus='orchid_ud')
# test_df_kt['POSTags'] = test_df_kt['POSTags'].apply(flatten)

all_df_ws['POSTags'] = pos_tag_sents(all_df_ws['processed'].tolist(), corpus='orchid_ud')
all_df_ws['POSTags'] = all_df_ws['POSTags'].apply(flatten)

# test_df_ws['POSTags'] = pos_tag_sents(test_df_ws['processed'].tolist(), corpus='orchid_ud')
# test_df_ws['POSTags'] = test_df_ws['POSTags'].apply(flatten)

# TODO: 1. concate word with pos via underscore (มัน_ADV)
#       2. use only tagging 
all_df_ws['POSTags'].iloc[0]

['nissan',
 'NOUN',
 'silvia',
 'NOUN',
 's',
 'NOUN',
 '14',
 'NUM',
 'หน้า',
 'NOUN',
 'หมู',
 'NOUN',
 'ที่',
 'SCONJ',
 'เท่ห์',
 'VERB',
 'ไม่',
 'PART',
 'เหมือน',
 'VERB',
 'ใคร',
 'PRON',
 'ความ',
 'NOUN',
 'เปรี้ยว',
 'VERB',
 'ของ',
 'ADP',
 'สปอร์ต',
 'NOUN',
 'คาร์',
 'NOUN',
 'ยุค',
 'VERB',
 '90',
 'NUM']

In [23]:
# create bow vectors
bow1 = CountVectorizer(ngram_range=(1, 1))
bow2 = CountVectorizer(ngram_range=(2, 2))

text_all_pos_bow1_fit_kt = bow1.fit(all_df_kt['POSTags'].apply(str))
text_all_pos_bow1_kt = text_all_pos_bow1_fit_kt.transform(all_df_kt['POSTags'].apply(str))
text_all_pos_bow1_ws = text_all_pos_bow1_fit_kt.transform(all_df_ws['POSTags'].apply(str))

text_all_pos_bow2_fit_kt = bow2.fit(all_df_kt['POSTags'].apply(str))
text_all_pos_bow2_kt = text_all_pos_bow2_fit_kt.transform(all_df_kt['POSTags'].apply(str))
text_all_pos_bow2_ws = text_all_pos_bow2_fit_kt.transform(all_df_ws['POSTags'].apply(str))

joblib.dump(text_all_pos_bow1_kt, model_path+'text_all_pos_bow1_kt.joblib')
joblib.dump(text_all_pos_bow1_ws, model_path+'text_all_pos_bow1_ws.joblib')
joblib.dump(text_all_pos_bow2_kt, model_path+'text_all_pos_bow2_kt.joblib')
joblib.dump(text_all_pos_bow2_ws, model_path+'text_all_pos_bow2_ws.joblib')

# bow1 = CountVectorizer(ngram_range=(1, 1))
# bow2 = CountVectorizer(ngram_range=(2, 2))

# text_test_pos_bow1_fit_kt = bow1.fit(test_df_kt['POSTags'].apply(str))
# text_test_pos_bow1_kt = text_test_pos_bow1_fit_kt.transform(test_df_kt['POSTags'].apply(str))
# text_test_pos_bow1_ws = text_test_pos_bow1_fit_kt.transform(test_df_ws['POSTags'].apply(str))

# text_test_pos_bow2_fit_kt = bow2.fit(test_df_kt['POSTags'].apply(str))
# text_test_pos_bow2_kt = text_test_pos_bow2_fit_kt.transform(test_df_kt['POSTags'].apply(str))
# text_test_pos_bow2_ws = text_test_pos_bow2_fit_kt.transform(test_df_ws['POSTags'].apply(str))

# joblib.dump(text_test_pos_bow1_kt, model_path+'text_test_pos_bow1_kt.joblib')
# joblib.dump(text_test_pos_bow1_ws, model_path+'text_test_pos_bow1_ws.joblib')
# joblib.dump(text_test_pos_bow2_kt, model_path+'text_test_pos_bow2_kt.joblib')
# joblib.dump(text_test_pos_bow2_ws, model_path+'text_test_pos_bow2_ws.joblib')

['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_pos_bow2_ws.joblib']

In [24]:
# create tfidf vectors
tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

text_all_pos_tfidf1_fit_kt = tfidf1.fit(all_df_kt['POSTags'].apply(str))
text_all_pos_tfidf1_kt = text_all_pos_tfidf1_fit_kt.transform(all_df_kt['POSTags'].apply(str))
text_all_pos_tfidf1_ws = text_all_pos_tfidf1_fit_kt.transform(all_df_ws['POSTags'].apply(str))

text_all_pos_tfidf2_fit_kt = tfidf2.fit(all_df_kt['POSTags'].apply(str))
text_all_pos_tfidf2_kt = text_all_pos_tfidf2_fit_kt.transform(all_df_kt['POSTags'].apply(str))
text_all_pos_tfidf2_ws = text_all_pos_tfidf2_fit_kt.transform(all_df_ws['POSTags'].apply(str))

joblib.dump(text_all_pos_tfidf1_kt, model_path+'text_all_pos_tfidf1_kt.joblib')
joblib.dump(text_all_pos_tfidf1_ws, model_path+'text_all_pos_tfidf1_ws.joblib')
joblib.dump(text_all_pos_tfidf2_kt, model_path+'text_all_pos_tfidf2_kt.joblib')
joblib.dump(text_all_pos_tfidf2_ws, model_path+'text_all_pos_tfidf2_ws.joblib')

# tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
# tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

# text_test_pos_tfidf1_fit_kt = tfidf1.fit(test_df_kt['POSTags'].apply(str))
# text_test_pos_tfidf1_kt = text_test_pos_tfidf1_fit_kt.transform(test_df_kt['POSTags'].apply(str))
# text_test_pos_tfidf1_ws = text_test_pos_tfidf1_fit_kt.transform(test_df_ws['POSTags'].apply(str))

# text_test_pos_tfidf2_fit_kt = tfidf2.fit(test_df_kt['POSTags'].apply(str))
# text_test_pos_tfidf2_kt = text_test_pos_tfidf2_fit_kt.transform(test_df_kt['POSTags'].apply(str))
# text_test_pos_tfidf2_ws = text_test_pos_tfidf2_fit_kt.transform(test_df_ws['POSTags'].apply(str))

# joblib.dump(text_test_pos_tfidf1_kt, model_path+'text_test_pos_tfidf1_kt.joblib')
# joblib.dump(text_test_pos_tfidf1_ws, model_path+'text_test_pos_tfidf1_ws.joblib')
# joblib.dump(text_test_pos_tfidf2_kt, model_path+'text_test_pos_tfidf2_kt.joblib')
# joblib.dump(text_test_pos_tfidf2_ws, model_path+'text_test_pos_tfidf2_ws.joblib')

['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_pos_tfidf2_ws.joblib']

## Dictionary-based

In [25]:
# load list of our custom positive and negative words
with open(os.path.dirname(os.getcwd()) + '\\data\\' + 'pos_words.txt', encoding='UTF-8') as f:
    pos_words = [line.rstrip('\n') for line in f]

with open(os.path.dirname(os.getcwd()) + '\\data\\' + 'neg_words.txt', encoding='UTF-8') as f:
    neg_words = [line.rstrip('\n') for line in f]
pos_words = list(set(pos_words))
neg_words = list(set(neg_words))

In [26]:
# use bow and tfidf vectorizer based on our custom dict
bow1 = CountVectorizer(ngram_range=(1, 1))
bow2 = CountVectorizer(ngram_range=(2, 2))

my_vocabs = pos_words + neg_words
print('dict size: ', len(my_vocabs))

text_all_dict_bow1_fit = bow1.fit(my_vocabs)
text_all_dict_bow1_kt = text_all_dict_bow1_fit.transform(all_df_kt['processed'].apply(str))
text_all_dict_bow1_ws = text_all_dict_bow1_fit.transform(all_df_ws['processed'].apply(str))

text_all_dict_bow2_fit = bow2.fit(my_vocabs)
text_all_dict_bow2_kt = text_all_dict_bow2_fit.transform(all_df_kt['processed'].apply(str))
text_all_dict_bow2_ws = text_all_dict_bow2_fit.transform(all_df_ws['processed'].apply(str))

joblib.dump(text_all_dict_bow1_kt, model_path+'text_all_dict_bow1_kt.joblib')
joblib.dump(text_all_dict_bow1_ws, model_path+'text_all_dict_bow1_ws.joblib')
joblib.dump(text_all_dict_bow2_kt, model_path+'text_all_dict_bow2_kt.joblib')
joblib.dump(text_all_dict_bow2_ws, model_path+'text_all_dict_bow2_ws.joblib')

# bow1 = CountVectorizer(ngram_range=(1, 1))
# bow2 = CountVectorizer(ngram_range=(2, 2))

# text_test_dict_bow1_fit = bow1.fit(my_vocabs)
# text_test_dict_bow1_kt = text_test_dict_bow1_fit.transform(test_df_kt['processed'].apply(str))
# text_test_dict_bow1_ws = text_test_dict_bow1_fit.transform(test_df_ws['processed'].apply(str))

# text_test_dict_bow2_fit = bow2.fit(my_vocabs)
# text_test_dict_bow2_kt = text_test_dict_bow2_fit.transform(test_df_kt['processed'].apply(str))
# text_test_dict_bow2_ws = text_test_dict_bow2_fit.transform(test_df_ws['processed'].apply(str))

# joblib.dump(text_test_dict_bow1_kt, model_path+'text_test_dict_bow1_kt.joblib')
# joblib.dump(text_test_dict_bow1_ws, model_path+'text_test_dict_bow1_ws.joblib')
# joblib.dump(text_test_dict_bow2_kt, model_path+'text_test_dict_bow2_kt.joblib')
# joblib.dump(text_test_dict_bow2_ws, model_path+'text_test_dict_bow2_ws.joblib')

dict size:  91


['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_dict_bow2_ws.joblib']

In [27]:
# use bow and tfidf vectorizer based on our custom dict
tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

text_all_dict_tfidf1_fit = tfidf1.fit(my_vocabs)
text_all_dict_tfidf1_kt = text_all_dict_tfidf1_fit.transform(all_df_kt['processed'].apply(str))
text_all_dict_tfidf1_ws = text_all_dict_tfidf1_fit.transform(all_df_ws['processed'].apply(str))

text_all_dict_tfidf2_fit = bow2.fit(my_vocabs)
text_all_dict_tfidf2_kt = text_all_dict_tfidf2_fit.transform(all_df_kt['processed'].apply(str))
text_all_dict_tfidf2_ws = text_all_dict_tfidf2_fit.transform(all_df_ws['processed'].apply(str))

joblib.dump(text_all_dict_tfidf1_kt, model_path+'text_all_dict_tfidf1_kt.joblib')
joblib.dump(text_all_dict_tfidf1_ws, model_path+'text_all_dict_tfidf1_ws.joblib')
joblib.dump(text_all_dict_tfidf2_kt, model_path+'text_all_dict_tfidf2_kt.joblib')
joblib.dump(text_all_dict_tfidf2_ws, model_path+'text_all_dict_tfidf2_ws.joblib')

# tfidf1 = TfidfVectorizer(ngram_range=(1, 1))
# tfidf2 = TfidfVectorizer(ngram_range=(2, 2))

# text_test_dict_tfidf1_fit = tfidf1.fit(my_vocabs)
# text_test_dict_tfidf1_kt = text_test_dict_tfidf1_fit.transform(test_df_kt['processed'].apply(str))
# text_test_dict_tfidf1_ws = text_test_dict_tfidf1_fit.transform(test_df_ws['processed'].apply(str))

# text_test_dict_tfidf2_fit = tfidf2.fit(my_vocabs)
# text_test_dict_tfidf2_kt = text_test_dict_tfidf2_fit.transform(test_df_kt['processed'].apply(str))
# text_test_dict_tfidf2_ws = text_test_dict_tfidf2_fit.transform(test_df_ws['processed'].apply(str))

# joblib.dump(text_test_dict_tfidf1_kt, model_path+'text_test_dict_tfidf1_kt.joblib')
# joblib.dump(text_test_dict_tfidf1_ws, model_path+'text_test_dict_tfidf1_ws.joblib')
# joblib.dump(text_test_dict_tfidf2_kt, model_path+'text_test_dict_tfidf2_kt.joblib')
# joblib.dump(text_test_dict_tfidf2_ws, model_path+'text_test_dict_tfidf2_ws.joblib')

['C:\\Users\\Pree\\Thai_SA_journal\\model\\text_all_dict_tfidf2_ws.joblib']

## Demonstrate usage


In [30]:
#text_all_kt = joblib.load(model_path+'text_all_tfidf2_kt.joblib')
#text_test_kt = joblib.load(model_path+'text_all_tfidf2_kt.joblib')

text_all_ws = joblib.load(model_path+'text_all_tfidf2_ws.joblib')
#text_test_ws = joblib.load(model_path+'text_test_tfidf2_ws.joblib')

text_all_ws.shape

(21389, 134146)

In [33]:
# load target class
y_train = pd.read_csv(os.path.dirname(os.getcwd()) + '\\' + 'train_label_ws.csv')['targets']
#y_test = pd.read_csv(os.path.dirname(os.getcwd()) + '\\' + 'test_label_ws.csv')['targets']

## Train-valid split

In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(text_all_ws, y_train, test_size=0.15, random_state=42)

## Test the extracted features with Logistic Regression

In [35]:
# quick test with out cv
#fit logistic regression models
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=2., penalty="l2", solver="liblinear", dual=False, multi_class="ovr")
model.fit(X_train, y_train.values.ravel())
model.score(X_valid, y_valid)
#y_pred = model.predict(X_valid)

0.6344655655967592

In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report
def build_model(model):
    scores = (cross_val_score(model, X_train, y_train, cv = 5).mean())
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc_sc = accuracy_score(y_valid, y_pred)
    pre_sc = precision_score(y_valid, y_pred, average='weighted')
    rec_sc = recall_score(y_valid, y_pred, average='weighted')
    f1_sc = f1_score(y_valid, y_pred, average='weighted')
    print('Accuracy :',acc_sc)
    print('Confusion Matrix :\n', confusion_matrix(y_valid, y_pred))
    print('Precision :', pre_sc)
    print('Recall :', rec_sc)
    print('F1-score :', f1_sc)
    print('Classification Report :\n', classification_report(y_valid, y_pred))
    print('Average accuracy of k-fold (5-fold) :', scores ,'\n')

In [37]:
build_model(model)

Accuracy : 0.6344655655967592
Confusion Matrix :
 [[ 339  432   19    0]
 [ 110 1608   39    2]
 [  41  448   89    0]
 [   5   73    4    0]]
Precision : 0.6189299634363078
Recall : 0.6344655655967592
F1-score : 0.5819361159246491
Classification Report :
               precision    recall  f1-score   support

         neg       0.68      0.43      0.53       790
         neu       0.63      0.91      0.74      1759
         pos       0.59      0.15      0.24       578
           q       0.00      0.00      0.00        82

    accuracy                           0.63      3209
   macro avg       0.48      0.37      0.38      3209
weighted avg       0.62      0.63      0.58      3209

Average accuracy of k-fold (5-fold) : 0.6265676567656766 



In [38]:
text_all_kt = joblib.load(model_path+'text_all_tfidf2_kt.joblib')
text_all_ws.shape

In [40]:
y_train = pd.read_csv(os.path.dirname(os.getcwd()) + '\\' + 'train_label_kt.csv')['vote']

In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(text_all_kt, y_train, test_size=0.15, random_state=42)

In [43]:
build_model(model)

Accuracy : 0.6963938973647712
Confusion Matrix :
 [[ 224  926   31]
 [  82 4343   85]
 [  19 1046  454]]
Precision : 0.7108881188654218
Recall : 0.6963938973647712
F1-score : 0.6422187904341828
Classification Report :
               precision    recall  f1-score   support

         neg       0.69      0.19      0.30      1181
         neu       0.69      0.96      0.80      4510
         pos       0.80      0.30      0.43      1519

    accuracy                           0.70      7210
   macro avg       0.72      0.48      0.51      7210
weighted avg       0.71      0.70      0.64      7210

Average accuracy of k-fold (5-fold) : 0.6992214936934771 

