# Import all dependencies

In [1]:
import pickle, random, time, json
from collections import OrderedDict
from string import punctuation

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, \
    GaussianNB
from sklearn.svm import LinearSVC

from preprocess import get_normalized_data
from bow import make_vocabs
from vectorizers import binary_vectorizer, \
    count_vectorizer, \
    tfidf_vectorizer
from model_utils import get_best_model, \
    do_training_testing, \
    do_validation
from smote import SMOTE as smote

In [2]:
# Wall-clock start of the whole notebook run; the final cell subtracts it from
# END to report the total elapsed time.
START = time.time()

# Load the file (Traveloka hotel comments)

In [3]:
"""
Membaca data komentar hotel traveloka
berupa file json
file json diambil 1500 komentar yang sudah diberi tag label secara manual
bentuk format dari file json: [{'class': , 'text': }]
"""
with open('data.json', 'r') as file:
    raw_data = json.load(file)

n_data = 1500
raw_data = raw_data[:n_data]
random.seed(123)
random.shuffle(raw_data)

pos_texts_normalized, neg_texts_normalized = get_normalized_data(raw_data)

n_neg = len(neg_texts_normalized)
n_pos = n_data - n_neg

print('n pos: {}, n neg: {}, total: {}'.format(n_pos, n_neg, n_data))
print('sampel data positif:', random.sample(pos_texts_normalized, 1))
print('sampel data negatif:', random.sample(neg_texts_normalized, 1))

n pos: 1058, n neg: 442, total: 1500
sampel data positif: ['suka dekat bukit bintang transportasi gol murah lagi']
sampel data negatif: ['kurang cocok jalan bisnis rombong privasi praktis privilese urus bisnis kamar tidak pandu arah sholat muslim padahal negara muslim front desak sifat kurang ramah banyak tidak lokasi tuju umum kol']


# Split the data -> training/testing and validation

In [4]:
"""
Pemisahan data menjadi data training/testing dan data validasi

Untuk data validasi diambil 300 data
Data trainig/testing adalah seluruh data setelah dipotong untuk data validasi

masing-masing disimpan setelah dikelompokkan berdasarkan kelasnya
"""
n_val_pos = 158
n_val_neg = 142

pos_tt = pos_texts_normalized[:-n_val_pos]
neg_tt = neg_texts_normalized[:-n_val_neg]

pos_val = pos_texts_normalized[-n_val_pos:]
neg_val = neg_texts_normalized[-n_val_neg:]

print('data training/testing:\t{} pos : {} neg'.format(len(pos_tt), len(neg_tt)))
print('data validasi:\t\t{} pos : {} neg'.format(len(pos_val), len(neg_val)))

data training/testing:	900 pos : 300 neg
data validasi:		158 pos : 142 neg


# Make vocabs

In [5]:
# Build the vocabulary from the training/testing texts of both classes;
# the validation texts are not passed to make_vocabs.
all_words = make_vocabs(pos_tt + neg_tt)

# Size of the vocabulary that the vectorizers below will use.
print('used n feature:\t\t', len(all_words))

n fitur awal:		 1739
used n feature:		 953


# Vectorize features (binary, tf, tfidf)
and resample the data using the SMOTE algorithm

In [6]:
# Feature schemes compared in this notebook: binary ('biner'), frequency
# ('count') and TF-IDF ('tfidf').
array_fitur = ['biner', 'count', 'tfidf']

# One vectorizer per feature scheme, keyed by the names in array_fitur.
# 'biner' and 'count' are the project's own vectorizers, which later cells call
# as plain functions with (texts, vocabulary). 'tfidf' is scikit-learn's
# TfidfVectorizer restricted to the vocabulary in all_words, which is driven
# through fit/transform instead; the project's tfidf_vectorizer is not used.
vectorizers = {
    'biner': binary_vectorizer,
    'count': count_vectorizer,
    'tfidf': TfidfVectorizer(vocabulary=all_words),
}

In [7]:
"""
vektorisasi data sebelum di-oversampling

X = array of dict {fitur: data} yang akan digunakan untuk proses training dan testing
y = label data untuk proses training dan testing
"""
# X = {fitur: vectorizers[fitur](pos_tt + neg_tt, all_words) for fitur in array_fitur}
X = {}
for fitur in array_fitur:
    if fitur == 'tfidf':
        vectorizers[fitur].fit(pos_tt + neg_tt)
        X[fitur] = vectorizers[fitur].transform(pos_tt + neg_tt).toarray()
#          X[fitur] = vectorizers[fitur](pos_tt + neg_tt, all_words, training=True)
    else:
        X[fitur] = vectorizers[fitur](pos_tt + neg_tt, all_words)
y = np.concatenate([np.ones(len(pos_tt)), np.zeros(len(neg_tt))])


"""
oversampling data yang sudah divektorisasi

data_resampled = data hasil oversampling dari X dan y
X_resampled = array of dict {fitur: data} yang akan digunakan untuk proses training dan testing
y_resampled = label data untuk proses training dan testing
"""
data_resampled = {fitur: smote(X[fitur], y, 200, k=3, random_seed=10) for fitur in array_fitur}
X_resampled = {fitur: data_resampled[fitur][0] for fitur in array_fitur}
y_resampled = data_resampled[array_fitur[0]][1]


"""
vektorisasi data murni untuk validasi, tanpa pengaruh oversampling

X_val = array of dict {fitur: data} yang akan digunakan untuk proses validasi
y_val = label data untuk proses validasi
"""
# X_val = {fitur: vectorizers[fitur].fit_transform(pos_val + neg_val).toarray() for fitur in array_fitur}
X_val = {}
for fitur in array_fitur:
    if fitur == 'tfidf':
        X_val[fitur] = vectorizers[fitur].transform(pos_val + neg_val).toarray()
#         X_val[fitur] = vectorizers[fitur](pos_val + neg_val, all_words)
    else:
        X_val[fitur] = vectorizers[fitur](pos_val + neg_val, all_words)
y_val = np.concatenate([np.ones(len(pos_val)), np.zeros(len(neg_val))])

In [8]:
# Rebuild every per-feature mapping as an OrderedDict keyed in sorted
# (alphabetical) feature-name order, so they all iterate in the same order.
vectorizers = OrderedDict((nama, vectorizers[nama]) for nama in sorted(vectorizers))
X = OrderedDict((nama, X[nama]) for nama in sorted(X))
X_resampled = OrderedDict((nama, X_resampled[nama]) for nama in sorted(X_resampled))
X_val = OrderedDict((nama, X_val[nama]) for nama in sorted(X_val))

# Class proportions of each data set

In [9]:
def get_porsi(y):
    """Count the positive and negative labels in ``y``.

    Args:
        y: iterable of labels; entries equal to 1 count as positive and
            entries equal to 0 as negative. Any other value is ignored.

    Returns:
        Tuple ``(n_pos, n_neg)``.
    """
    positives = sum(1 for label in y if label == 1)
    negatives = sum(1 for label in y if label == 0)
    return positives, negatives

# Report the size and the (positive, negative) counts of each label vector:
# original training/testing data, resampled data, and validation data.
for template, labels in (
        ('data asli \t\t--> {} data \t\t--> pos : neg = {}', y),
        ('data resampling \t--> {} data \t\t--> pos : neg = {}', y_resampled),
        ('data validasi \t\t--> {} data \t\t--> pos : neg = {}', y_val),
):
    print(template.format(len(labels), get_porsi(labels)))

data asli 		--> 1200 data 		--> pos : neg = (900, 300)
data resampling 	--> 1800 data 		--> pos : neg = (900, 900)
data validasi 		--> 300 data 		--> pos : neg = (158, 142)


# Define classification algorithms

In [10]:
"""
array object dari model-model classifier yang akan digunakan
menjadi paramater untuk beberapa fungsi
"""

clf = {
    'multi_nb': MultinomialNB(),
    'gauss_nb': GaussianNB(),
    'svm_lin': LinearSVC(random_state=123),
    'log_reg': LogisticRegression(random_state=123),
}

# Do training-testing and validation
print the performance results for each algorithm and feature

In [11]:
"""
__main__
"""


print('===============VALIDASI===============\n')

start = time.time()
print('NO SAMPLING')
best_models_no_sampling = do_training_testing(
    clf, X, y,
    filename='kinerja_training_no_sampling.csv', show=False)
do_validation(
    clf, X_val, y_val, per_clf=best_models_no_sampling,
    filename = 'kinerja_testing_no_sampling.csv')
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))

start = time.time()
print('\nRESAMPLED')
best_models_after_sampling = do_training_testing(
    clf, X_resampled, y_resampled,
    filename='kinerja_training_with_sampling.csv', show=False)
do_validation(
    clf, X_val, y_val, per_clf=best_models_after_sampling,
    filename = 'kinerja_testing_with_sampling.csv')
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))


NO SAMPLING
multi_nb biner: 		 82.17
multi_nb count: 		 83.26
gauss_nb tfidf: 		 59.41
>> performa rata2 multi_nb: 	(74.95)
svm_lin biner: 			 76.7
svm_lin count: 			 79.38
svm_lin tfidf: 			 78.43
>> performa rata2 svm_lin: 	(78.17)
log_reg biner: 			 76.88
log_reg count: 			 79.99
log_reg tfidf: 			 72.46
>> performa rata2 log_reg: 	(76.44)
selesai dalam 1.92 detik -> 0.03 menit

RESAMPLED
multi_nb biner: 		 87.7
multi_nb count: 		 88.85
gauss_nb tfidf: 		 60.9
>> performa rata2 multi_nb: 	(79.15)
svm_lin biner: 			 82.71
svm_lin count: 			 85.47
svm_lin tfidf: 			 86.63
>> performa rata2 svm_lin: 	(84.94)
log_reg biner: 			 86.29
log_reg count: 			 86.29
log_reg tfidf: 			 88.0
>> performa rata2 log_reg: 	(86.86)
selesai dalam 3.36 detik -> 0.06 menit


In [12]:
# Stop the notebook-wide timer and report the total run time in seconds and minutes.
END = time.time()
total_seconds = END - START
print('total time {} detik -> {} menit'.format(round(total_seconds, 2),
                                               round(total_seconds / 60, 2)))

total time 51.21 detik -> 0.85 menit
