# Import all dependencies

In [1]:
from string import punctuation
from os.path import isfile
import pickle
import random
import time
import json

import numpy as np

import nltk
from nltk import FreqDist

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from stopwords import get_stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix

from imblearn.metrics import geometric_mean_score as gmean
from smote import MY_SMOTE as smote

import matplotlib.pyplot as plt
%matplotlib inline

# Load the file (Traveloka hotel comments)

In [2]:
"""
Membaca data komentar hotel traveloka
berupa file json
file json diambil 1500 komentar yang sudah diberi tag label secara manual
bentuk format dari file json: [{'class': , 'text': }]
kemudian pisah kedalam dua variabel:
    labels: array untuk label/kategori/kelas sentimen dari tiap komentar
    texts: array untuk komentar
"""
with open('data.json', 'r') as file:
    data = json.load(file)

data = data[:1500]
labels = [d['class'] for d in data]
texts = [d['text'] for d in data]


"""
Preprocessing data
1. pisahkan text positif dan negatif ke dalam variabel array masing-masing
2. untuk masing-masing text positif maupun negatif, lakukan:
    3. hapus stopwords
    4. stem tiap kata
5. simpan sebagai pickle file
"""
if isfile('data_traveloka_normalized.data'):
    with open('data_traveloka_normalized.data', 'rb') as data:
        pos_texts_normalized, neg_texts_normalized = pickle.load(data)
else:
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopwords = get_stopwords()
    
    pos_texts = [d['text'] for d in data if d['class'] == 1]
    neg_texts = [d['text'] for d in data if d['class'] == 0]

    pos_texts_normalized = []
    for text in pos_texts:
        text = [stemmer.stem(word) for word in text.split() if word not in stopwords]
        pos_texts_normalized.append(' '.join(text))
        
    neg_texts_normalized = []
    for text in neg_texts:
        text = [stemmer.stem(word) for word in text.split() if word not in stopwords]
        neg_texts_normalized.append(' '.join(text))
        
    with open('data_traveloka_normalized.data', 'wb') as data:
        pickle.dump([pos_texts_normalized, neg_texts_normalized], data)

        
n_pos, n_neg = len(pos_texts_normalized), len(neg_texts_normalized)
n_total = n_pos + n_neg
print('n pos: {}, n neg: {}, total: {}'.format(n_pos, n_neg, n_total))
print('sampel data positif:', random.sample(pos_texts_normalized, 1))
print('sampel data negatif:', random.sample(neg_texts_normalized, 1))

n pos: 1058, n neg: 442, total: 1500
sampel data positif: ['lokasi strategis kl sentral station monorail banyak restoran hotel kamar bersih besar bangun baru toiletries lengkap sedia safe deposit box kulkas hair dryer water heater']
sampel data negatif: ['tidak air panas sudah telepon resepsionist tidak direspon']


# Split the data -> training/testing and validation

In [3]:
"""
Pemisahan data menjadi data training/testing dan data validasi

Untuk data validasi diambil masing-masing 100 data positif dan negatif
Data trainig/testing adalah seluruh data setelah dipotong untuk data validasi

masing-masing disimpan setelah dikelompokkan berdasarkan kelasnya
"""
n_val = 100

pos_tt = pos_texts_normalized[:-n_val]
neg_tt = neg_texts_normalized[:-n_val]

pos_val = pos_texts_normalized[-n_val:]
neg_val = neg_texts_normalized[-n_val:]


print('data training/testing:\t{} pos : {} neg'.format(len(pos_tt), len(neg_tt)))
print('data validasi:\t\t{} pos : {} neg'.format(len(pos_val), len(neg_val)))

data training/testing:	958 pos : 342 neg
data validasi:		100 pos : 100 neg


# Make vocabs

In [4]:
"""
Proses pembuatan vocabs

vocabs ini digunakan untuk membentuk feature vector dari normalized data
beberapa perlakukan untuk membentuk vocabs, di antarnya:
(1) hapus hapax: kata yang hanya muncul sekali dari seluruh corpus
(2) seleksi hanya kata kerja
"""
all_words = [word for sentence in pos_tt + neg_tt for word in sentence.split()]

fd = FreqDist(all_words) # sebelum di-set, bentuk object freqdist

all_words = list(set(all_words))
print('n fitur awal:\t\t', len(all_words))

# (1)
hapaxes = fd.hapaxes()
# all_words = [word for word in all_words if word not in hapaxes]

# (2)
with open('./experiment/pos_tag_indo.pkl', 'rb') as file:
    jj = pickle.load(file)
all_words_adj = [word for word in all_words if word in jj]
# all_words = all_words_adj

print('used n feature:\t\t', len(all_words))

n fitur awal:		 2448
used n feature:		 2448


# Vectorized features (binary, tf, tfidf)
and do resampling, using SMOTE algorithm

In [5]:
array_fitur = ['biner', 'tfrek', 'tfidf']



"""
instansiasi object vectorizer
untuk masing-masing fitur: biner, frekuensi, dan tf-idf
kemudian dibentuk dict vectorizers
"""
vectorizers = dict(zip(array_fitur, [
    CountVectorizer(binary=True, vocabulary=all_words),
    CountVectorizer(binary=False, vocabulary=all_words),
    TfidfVectorizer(vocabulary=all_words)
]))



"""
vektorisasi data sebelum di-oversampling

X = array of dict {fitur: data} yang akan digunakan untuk proses training dan testing
y = label data untuk proses training dan testing
"""
X = {fitur: vectorizers[fitur].fit_transform(pos_tt + neg_tt).toarray() for fitur in array_fitur}
y = np.concatenate([np.ones(len(pos_tt)), np.zeros(len(neg_tt))])



"""
oversampling data yang sudah divektorisasi

data_resampled = data hasil oversampling dari X dan y
X_resampled = array of dict {fitur: data} yang akan digunakan untuk proses training dan testing
y_resampled = label data untuk proses training dan testing
"""
data_resampled = {fitur: smote(X[fitur], y, 100, k=3, random_seed=10) for fitur in array_fitur}

X_resampled = {fitur: data_resampled[fitur][0] for fitur in array_fitur}
y_resampled = {fitur: data_resampled[fitur][1] for fitur in array_fitur}



"""
vektorisasi data murni untuk validasi, tanpa pengaruh oversampling

X_val = array of dict {fitur: data} yang akan digunakan untuk proses validasi
y_val = label data untuk proses validasi
"""
X_val = {fitur: vectorizers[fitur].fit_transform(pos_val + neg_val).toarray() for fitur in array_fitur}
y_val = np.concatenate([np.ones(len(pos_val)), np.zeros(len(neg_val))])

# Information about the data portion

In [6]:
def get_porsi(y):
    n_pos = len([n for n in y if n == 1])
    n_neg = len([n for n in y if n == 0])
    return n_pos, n_neg

print('data asli \t\t--> {} data \t\t--> pos : neg = {}'
      .format(len(y), get_porsi(y)))
print('data resampling \t--> {} data \t\t--> pos : neg = {}'
      .format(len(y_resampled['biner']), get_porsi(y_resampled['biner'])))
print('data validasi \t\t--> {} data \t\t--> pos : neg = {}'
      .format(len(y_val), get_porsi(y_val)))

data asli 		--> 1300 data 		--> pos : neg = (958, 342)
data resampling 	--> 1642 data 		--> pos : neg = (958, 684)
data validasi 		--> 200 data 		--> pos : neg = (100, 100)


# Functions needed

In [7]:
def get_best_model(X, y, c, kf, show=False):
    """
    fungsi untuk mendapatkan model terbaik dari hasil k-fold
    return best_model: model terbaik, dengan tolak ukur gmean
    
    parameter:
    X = data per jenis fitur
    y = label dari data
    c = array object classifier
    kf = object K-Fold
    show = boolean untuk mencetak proses pencarian model terbaik
    """
    performance_total = 0
    best_fold_performance = -100
    best_fold_index = -1
    best_model = None
    if show: print('\t\t', end='')
    for index, (train_index, test_index) in enumerate(kf.split(X)):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        c.fit(X_train_fold, y_train_fold)
        pred = c.predict(X_test_fold)
        acc = round(accuracy_score(y_test_fold, pred) * 100, 2)
        gmean_score = round(gmean(y_test_fold, pred, average='binary') * 100, 2)
        selected_metric = gmean_score
        if selected_metric > best_fold_performance:
            best_fold_performance = selected_metric
            best_fold_index = index
            best_model = c
        if show: print(selected_metric, end=' ')
        performance_total += selected_metric
    performance_avg = round(performance_total / kf.get_n_splits(), 2)
    if show: print('\n\t\tbest index: {}, best performance: {}, performance avg: {}\n'
                      .format(best_fold_index+1, best_fold_performance, performance_avg))
    return best_model


def do_training_testing(clf, X, y, is_res=False, show=False):
    """
    fungsi untuk melakukan training dan testing
    baik itu dengan atau tanpa resampling
    return per_clf: model terbaik dari masing-masing fitur
    
    parameter:
    clf = array object classifier
    X = data per jenis fitur
    y = label dari data
    is_res = boolean, untuk mengecek apakah data yg dilewatkan adalah hasil resampling atau tidak
    show = boolean, untuk mencetak proses pencarian model terbaik
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    per_clf = {}
    for c in clf: # untuk masing-masing jenis classifier
        
        for index, fitur in enumerate(X): # untuk masing-masing jenis fitur
            y_train = y[fitur] if is_res else y
            if show: # show process
                print('\t', c, fitur)
                per_clf[(c, fitur)] = get_best_model(X[fitur], y_train, clf[c], kf, show=True)
            else:
                per_clf[(c, fitur)] = get_best_model(X[fitur], y_train, clf[c], kf)
    return per_clf



def do_validation(clf, X_val, y_val, per_clf):
    """
    fungsi untuk melakukan validasi
    baik itu dengan atau tanpa resampling
    
    parameter:
    clf = array object classifier
    X_val = data per jenis fitur khusus untuk proses validasi
    y_val = label dari data khusus untuk proses validasi
    per_clf = model terbaik untuk masing-masing fitur
    """
    for c in clf:
        performance_total = 0
        n_fitur = 0
        for index, fitur in enumerate(X_val):
            n_fitur += 1
            pred = per_clf[c, fitur].predict(X_val[fitur])
            
            # performa menggunakan akurasi atau gmean
            
            gmean_score = round(gmean(y_val, pred, average='binary') * 100, 2)
            acc = round(accuracy_score(y_val, pred) * 100, 2)
            print(confusion_matrix(y_val, pred, labels=[1,0]))
            performance_total += gmean_score
            # print('acc:gm {} {}: \t\t {} : {}'.format(c, fitur, acc, gmean_score))
            print('gm {} {}: \t\t {}'.format(c, fitur, gmean_score))
        performance_avg = round(performance_total / n_fitur, 2)
        print('>> performa rata2 {}: \t({})'.format(c, performance_avg))

# Define classification algorithms

In [8]:
"""
array object dari model-model classifier yang akan digunakan
menjadi paramater untuk beberapa fungsi
"""
clf = {
    'multi_nb': MultinomialNB(),
    'supp_vm': SVC(),
    'log_reg': LogisticRegression(),
}

# Do training-testing and validation
print the performance result for each algorithms and features

In [9]:
"""
__main__
"""

# path = '/home/satriajiwidi/Desktop/codes/python_codes/web/pyweb/models/pt/'

# with open(path + 'val_data.pkl', 'wb') as data:
#     pickle.dump([X_val, y_val], data)
#     print('validation data pickled')


best_models = {}
print('===============VALIDASI===============\n')


start = time.time()
print('NO SAMPLING')
best_models_no_sampling = do_training_testing(clf, X, y, show=False)
do_validation(clf, X_val, y_val, per_clf=best_models_no_sampling)
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))

# with open(path + 'pt/model_ns.pkl', 'wb') as model:
#     pickle.dump(best_models_no_sampling, model)
#     print('model pickled')

start = time.time()
print('\nRESAMPLED')
best_models_after_sampling = do_training_testing(clf, X_resampled, y_resampled, is_res=True, show=False)
do_validation(clf, X_val, y_val, per_clf=best_models_after_sampling)
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))

# with open(path + 'pt/model_as.pkl', 'wb') as model:
#     pickle.dump(best_models_after_sampling, model)
#     print('model pickled')


NO SAMPLING
[[100   0]
 [100   0]]
gm supp_vm tfidf: 		 0.0
[[100   0]
 [100   0]]
gm supp_vm tfrek: 		 0.0
[[100   0]
 [100   0]]
gm supp_vm biner: 		 0.0
>> performa rata2 supp_vm: 	(0.0)
[[98  2]
 [61 39]]
gm multi_nb tfidf: 		 61.82
[[97  3]
 [48 52]]
gm multi_nb tfrek: 		 71.02
[[97  3]
 [50 50]]
gm multi_nb biner: 		 69.64
>> performa rata2 multi_nb: 	(67.49)
[[100   0]
 [ 74  26]]
gm log_reg tfidf: 		 50.99
[[96  4]
 [42 58]]
gm log_reg tfrek: 		 74.62
[[97  3]
 [45 55]]
gm log_reg biner: 		 73.04
>> performa rata2 log_reg: 	(66.22)
selesai dalam 103.27 detik -> 1.72 menit

RESAMPLED
[[100   0]
 [100   0]]
gm supp_vm tfidf: 		 0.0
[[100   0]
 [100   0]]
gm supp_vm tfrek: 		 0.0
[[100   0]
 [100   0]]
gm supp_vm biner: 		 0.0
>> performa rata2 supp_vm: 	(0.0)
[[95  5]
 [35 65]]
gm multi_nb tfidf: 		 78.58
[[95  5]
 [33 67]]
gm multi_nb tfrek: 		 79.78
[[95  5]
 [32 68]]
gm multi_nb biner: 		 80.37
>> performa rata2 multi_nb: 	(79.58)
[[95  5]
 [34 66]]
gm log_reg tfidf: 		 79.18