# Import all dependencies

In [1]:
from string import punctuation
from os.path import isfile
import pickle
import random

import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score, confusion_matrix

from imblearn.metrics import geometric_mean_score as gmean
from imblearn.over_sampling import SMOTE, ADASYN

import matplotlib.pyplot as plt
%matplotlib inline

# Load the file (imdb movie reviews)

In [2]:
major, minor = 1000, 200
test_size = .2
pos = []
neg = []
stemmer = PorterStemmer()

if isfile('pos.data'):
    with open('pos.data', 'rb') as posfile:
        pos = pickle.load(posfile)
else:
    for fileid in movie_reviews.fileids('pos')[:major]:
        temp = []
        for words in movie_reviews.words(fileid):
            if words not in stopwords.words('english') and words not in punctuation:
                temp.append(stemmer.stem(words))
        pos.append(' '.join(temp))
    with open('pos.data', 'wb') as posfile:
        pickle.dump(pos, posfile)

# read full neg, then random choose 200 data
if isfile('neg_full.data'):
    with open('neg_full.data', 'rb') as negfile:
        neg = pickle.load(negfile)
else:
    for fileid in movie_reviews.fileids('neg')[:minor]:
        temp = []
        for words in movie_reviews.words(fileid):
            if words not in stopwords.words('english') and words not in punctuation:
                temp.append(stemmer.stem(words))
        neg.append(' '.join(temp))
    with open('neg.data', 'wb') as negfile:
        pickle.dump(neg, negfile)

print('pos:', len(pos))
print('neg:', len(neg))
print('total:', len(pos + neg))

pos: 1000
neg: 1000
total: 2000


# Split the data -> training/testing and validation

In [3]:
random.seed(0)
neg = random.sample(neg, 200)
n_val = 100

pos_tt = pos[:-n_val]
neg_tt = neg[:-n_val]

# pembagian data training-testing dengan data validasi (tanpa resample/asli)
pos_val = pos[-n_val:]
neg_val = neg[-n_val:]

print('data training/testing:\t{} pos : {} neg'.format(len(pos_tt), len(neg_tt)))
print('data validasi:\t\t{} pos : {} neg'.format(len(pos_val), len(neg_val)))

data training/testing:	900 pos : 100 neg
data validasi:		100 pos : 100 neg


# Make vocabs

In [4]:
# pembuatan vocabs
all_words = [word for i in pos_tt + neg_tt for word in i.split()]
# hapus hapax
fd = FreqDist(all_words)
all_words = list(set(all_words))
print('n fitur awal:\t\t', len(all_words))
hapaxes = fd.hapaxes()
all_words = [word for word in all_words if word not in hapaxes]
print('n fitur tanpa hapax:\t', len(all_words))
# all_words = [key[0] for key in fd.most_common(10000)]
# print('n fitur final:', len(all_words))

# seleksi fitur yg digunakan
tagged = nltk.pos_tag(all_words)
selection = ['JJ', 'JJR', 'JJS']
excepts = ['NN', 'NNP', 'NNPS', 'NNS']
all_words2 = [word for (word, tag) in tagged if tag in selection]
# all_words2 = [word for (word, tag) in tagged if tag not in excepts]
print('n feature selection:\t', len(all_words2))

# full vocabs/selection feature toggle
# uncomment to use selection feature
# all_words = all_words2
print('used n feature:\t\t', len(all_words))

n fitur awal:		 20124
n fitur tanpa hapax:	 13330
n feature selection:	 2465
used n feature:		 13330


# Vectorized features (binary, tf, tfidf)
and do resampling, using SMOTE algorithm

In [5]:
fitur = ['biner', 'tf', 'tfidf']

vectorizer_types = [CountVectorizer(binary=True, vocabulary=all_words),
                    CountVectorizer(binary=False, vocabulary=all_words),
                    TfidfVectorizer(vocabulary=all_words)
                   ]

vectorizer = dict(zip(fitur, vectorizer_types))

X = {f: vectorizer[f].fit_transform(pos_tt + neg_tt).toarray() for f in fitur}
y = np.concatenate([np.ones(len(pos_tt)), np.zeros(len(neg_tt))])

X_val = {f: vectorizer[f].fit_transform(pos_val + neg_val).toarray() for f in fitur}
y_val = np.concatenate([np.ones(len(pos_val)), np.zeros(len(neg_val))])

data_resampled = {f: SMOTE(random_state=0, kind='borderline2').fit_sample(X[f], y) for f in fitur}
# data_resampled = {f: ADASYN(random_state=0).fit_sample(X[f], y) for f in fitur}

X_resampled = {f: data_resampled[f][0] for f in fitur}

y_resampled = {f: data_resampled[f][1] for f in fitur}

# Information about the data portion

In [6]:
def get_porsi(y):
    pos = len([n for n in y if n == 1])
    neg = len([n for n in y if n == 0])
    return pos, neg

# cetak informasi data
print('data asli \t\t--> {} data \t\t--> pos : neg = {}'
      .format(len(y), get_porsi(y)))
print('data resampling \t--> {} data \t\t--> pos : neg = {}'
      .format(len(y_resampled['biner']), get_porsi(y_resampled['biner'])))
print('data validasi \t\t--> {} data \t\t--> pos : neg = {}'
      .format(len(y_val), get_porsi(y_val)))

data asli 		--> 1000 data 		--> pos : neg = (900, 100)
data resampling 	--> 1800 data 		--> pos : neg = (900, 900)
data validasi 		--> 200 data 		--> pos : neg = (100, 100)


# Functions needed

In [7]:
kf = KFold(n_splits=10, shuffle=True)

def get_best_model(X, y, c, verbose=False):
    acc_total = 0
    best_fold_acc = -100
    best_fold_index = -1
    best_model = None
    if verbose: print('\t\t', end='')
    for index, (train_index, test_index) in enumerate(kf.split(X)):
        XXX_train, XXX_test = X[train_index], X[test_index]
        YYY_train, YYY_test = y[train_index], y[test_index]
        c.fit(XXX_train, YYY_train)
        pred = c.predict(XXX_test)
        acc = round(accuracy_score(YYY_test, pred) * 100, 2)
        if acc > best_fold_acc:
            best_fold_acc = acc
            best_fold_index = index
            best_model = c
        if verbose: print(acc, end=' ')
        acc_total += acc
    acc_avg = round(acc_total / kf.get_n_splits(), 2)
    if verbose: print('\n\t\tbest index: {}, best acc: {}, avg acc: {}\n'
                      .format(best_fold_index+1, best_fold_acc, acc_avg))
    return best_model

In [8]:
def fnr(fn, tp):
    return fn / (fn + tp)

def fpr(fp, tn):
    return fp / (fp + tn)

def make_subplots(X):
    # X adalah fitur, len(X) = banyak fitur
    _, axs = plt.subplots(1, len(X))
    plt.setp(axs, xticks=[0,1], xticklabels=[1,0],
             yticks=[0,1], yticklabels=[1,0])
    axs = axs.ravel()
    return axs

def show_cm(y, pred, axs, index, fitur, c, tp):
    acc = round(accuracy_score(y, pred) * 100, 2)
    cm = confusion_matrix(y, pred, labels=[1,0])
    n_test_pos = len([x for x in y if x == 1])
    n_test_neg = len(y) - n_test_pos
    n_test_pos, n_test_neg = str(n_test_pos), str(n_test_neg)
    axs[index].imshow(cm, cmap=plt.cm.Blues, interpolation='nearest')
    if tp == 'res': tp = 'resampled'
    if tp == 'val': tp = 'validation'
    axs[index].set_title('{}\n{} : {}\naccuracy: {}\n{} : {}'
                         .format(tp.upper(), c, fitur, str(acc), n_test_pos, n_test_neg))
    axs[index].set_xlabel('predicted')
    axs[index].set_ylabel('actual')
    for (k, j), label in np.ndenumerate(cm):
        axs[index].text(j, k, label, ha='center', va='center',
                        color='red', fontsize=14)

def do_training_testing(clf, X, y, tp, show=None):
    per_clf = {}
    for c in clf: # untuk masing-masing jenis classifier
        if show == 'cm': axs = make_subplots(X)
        for index, fitur in enumerate(X): # untuk masing-masing jenis fitur
            y_train = y[fitur] if tp == 'res' else y
            if show == 'cm':
                pred = cross_val_predict(clf[c], X[fitur], y_train, cv=10)
                per_clf[(c, fitur)] = get_best_model(X[fitur], y_train, clf[c])
            if show == 'md': # show process to get best models
                print('\t', c, fitur)
                per_clf[(c, fitur)] = get_best_model(X[fitur], y_train, clf[c], verbose=True)
            else:
                per_clf[(c, fitur)] = get_best_model(X[fitur], y_train, clf[c])
            if show == 'cm': show_cm(y_train, pred, axs, index, fitur, c, tp)
        if show == 'cm': plt.tight_layout()
    return per_clf

def do_validation(clf, fitur, per_clf, show=None):
    for c in clf:
        if show == 'cm': axs = make_subplots(fitur)
        total_acc = 0
        for index, f in enumerate(fitur):
            pred = per_clf[c, f].predict(X_val[f])
            gmean_score = round(gmean(y_val, pred, average='binary') * 100, 2)
            acc = round(accuracy_score(y_val, pred) * 100, 2)
            total_acc += acc
            if show == 'cm':
                show_cm(y_val, pred, axs, index, f, c, tp='val')
            else:
                print('acc:gm {} {}: \t\t {} : {}'.format(c, f, acc, gmean_score))
        avg_acc = round(total_acc / len(fitur), 2)
        if show == 'cm':
            plt.tight_layout()
        else:
            print('>> acc rata2 {}: \t\t({})'.format(c, avg_acc))

# Define classification algorithms

In [9]:
clf = {
       'multi_nb': MultinomialNB(),
       'supp_vm': SVC(),
       'log_reg': LogisticRegression(),
      }

# Do training-testing and validation
print the performance result for each algorithms and features

In [10]:
best_models = {}
print('===============VALIDASI===============\n')

import time

start = time.time()
print('no sampling')
best_models = do_training_testing(clf, X, y, tp='asli')
do_validation(clf, fitur, per_clf=best_models)
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))

start = time.time()
print('\nwith sampling')
best_models = do_training_testing(clf, X_resampled, y_resampled, tp='res')
do_validation(clf, fitur, per_clf=best_models)
end = time.time()
print('selesai dalam {} detik -> {} menit'.format(round(end-start, 2),
                                                  round((end-start)/60, 2)))


no sampling
acc:gm multi_nb biner: 		 52.0 : 20.0
acc:gm multi_nb tf: 		 54.0 : 29.85
acc:gm multi_nb tfidf: 		 52.0 : 22.25
>> acc rata2 multi_nb: 		(52.67)
acc:gm log_reg biner: 		 58.0 : 40.0
acc:gm log_reg tf: 		 58.0 : 41.02
acc:gm log_reg tfidf: 		 50.0 : 0.0
>> acc rata2 log_reg: 		(55.33)
acc:gm supp_vm biner: 		 50.0 : 0.0
acc:gm supp_vm tf: 		 50.0 : 0.0
acc:gm supp_vm tfidf: 		 50.0 : 0.0
>> acc rata2 supp_vm: 		(50.0)
selesai dalam 215.41 detik -> 3.59 menit

with sampling
acc:gm multi_nb biner: 		 54.0 : 28.28
acc:gm multi_nb tf: 		 56.5 : 39.4
acc:gm multi_nb tfidf: 		 56.0 : 39.19
>> acc rata2 multi_nb: 		(55.5)
acc:gm log_reg biner: 		 70.0 : 64.99
acc:gm log_reg tf: 		 66.0 : 59.77
acc:gm log_reg tfidf: 		 50.0 : 0.0
>> acc rata2 log_reg: 		(62.0)
acc:gm supp_vm biner: 		 52.5 : 22.36
acc:gm supp_vm tf: 		 69.5 : 69.28
acc:gm supp_vm tfidf: 		 50.0 : 0.0
>> acc rata2 supp_vm: 		(57.33)
selesai dalam 1437.48 detik -> 23.96 menit
