In [1]:
import re
from string import punctuation
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from stopwords import get_stopwords
import corpus
from collections import Counter
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
import numpy as np

In [2]:
# data loading
pos_data = corpus.pos
neg_data = corpus.neg
print('n pos:' , len(pos_data))
print('n neg:', len(neg_data))

# gabung semua data (pos dan neg)
data = [] # teks dan label
X = [] # teks saja
y = [] # label/sentiment saja
for (words, sentiment) in pos_data + neg_data:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3] 
    data.append((words_filtered, sentiment))
    X.append(words_filtered)
    y.append(sentiment)
    
index_pos = [(index) for index, value in enumerate(y) if value == 'positive']
index_neg = [(index) for index, value in enumerate(y) if value == 'negative']

pos_data = [X[index] for index in index_pos]
pos_data_y = [y[index] for index in index_pos]
neg_data = [X[index] for index in index_neg]
neg_data_y = [y[index] for index in index_neg]

n pos: 11
n neg: 6


In [3]:
# tokenisasi
# mentransformasi seluruh kalimat dalam corpus menjadi array dari
# kata-kata
def get_words(data):
    all_words = []
    for (words, sentiment) in data:
    	all_words.extend(words)
    return all_words

# mendapatkan word features
# atau kamus dari seluruh korpus
# mengurutkan kata-kata dari frekuensi kemunculan tertinggi
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

# hapus stopwords dan tanda baca
def remove_stopwords(words):
    stopwords = get_stopwords()
    all_words = [re.sub(r'[^\w\s]','', x) for x in words] # remove punctuation
    all_words = [x for x in all_words if x not in stopwords]
    return all_words

# stemming
# mengubah kata-kata menjadi kata dasarnya
# menghilangkan imbuhan pada kata
def stem_words(words):
	factory = StemmerFactory()
	stemmer = factory.create_stemmer()
	all_words = [stemmer.stem(word) for word in words]
	return all_words

In [4]:
all_words = sorted(set(remove_stopwords(stem_words(get_words(data)))))

word_features = get_word_features(all_words)
# most_common = [(key, val) for key, val
#                in nltk.FreqDist(remove_stopwords(stem_words(get_words(data)))).most_common()
#                if val > 1]
# print(most_common)
print(word_features, len(word_features))

dict_keys(['indah', 'mobil', 'letih', 'sungguh', 'olahraga', 'cinta', 'konser', 'bahagia', 'pandang', 'bagus', 'jengkel', 'sahabat', 'senang', 'musik', 'cerah', 'sejuk', 'suka', 'asa', 'tarik', 'dengar', 'tidak', 'makan', 'musuh']) 23


In [5]:
# mendapatkan fitur kata dari tiap dokumen
# menjadikan array one-hot
# dictionary {word: boolean presence}
def extract_features(document):
    document_words = sorted(set(remove_stopwords(stem_words(document))))
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

In [6]:
# loading training data sets
training_set = [(extract_features(kalimat), label) for kalimat, label in data]
print(training_set[1])

({'indah': True, 'mobil': False, 'letih': False, 'sungguh': True, 'olahraga': False, 'pandang': True, 'konser': False, 'bahagia': False, 'cinta': False, 'bagus': False, 'jengkel': False, 'sahabat': False, 'senang': False, 'musik': False, 'cerah': False, 'sejuk': False, 'suka': True, 'asa': False, 'tarik': False, 'dengar': False, 'tidak': False, 'makan': False, 'musuh': False}, 'positive')


In [10]:
# khusus untuk penanganan imbalanced data
# 
# 

# mengubah fitur kata menjadi array bit (0 dan 1)
featuresets = [extract_features(kalimat) for kalimat in X]
featuresets_bit = []
temp = []
for index, i in enumerate(featuresets):
    for j, k in i.items():
        temp.append(1 if k == True else 0)
    featuresets_bit.append(temp)
    temp = []

print(featuresets[0])
print(featuresets_bit[0], end='\n\n')

# oversampling
from imblearn.over_sampling import SMOTE, ADASYN
y_bit = [1 if i == 'positive' else 0 for i in y]
X_resampled, y_resampled = SMOTE().fit_sample(featuresets_bit, y_bit)

# mengubah bilangan real menjadi 0 atau 1 dengan pembulatan
X_resampled_normalized = []
temp = []
for i in X_resampled:
    for j in i:
        temp.append(round(j))
    X_resampled_normalized.append(temp)
    temp = []
    
print('n data awal:', len(X))
print('n data hasil:', len(X_resampled_normalized), end='\n\n')

# coba mengembalikan dari vector ke kata
kamus = [(index, val) for index, val in enumerate(featuresets[0])]
print(kamus, end='\n\n')
kamus = np.array([val for index, val in enumerate(featuresets[0])])
for index, word in enumerate(X_resampled_normalized):
    indices = [index for index, value in enumerate(word) if value == 1]
    print(index+1, indices, np.take(kamus, indices), y_resampled[index])

{'indah': False, 'mobil': True, 'letih': False, 'sungguh': False, 'olahraga': False, 'pandang': False, 'konser': False, 'bahagia': False, 'cinta': False, 'bagus': False, 'jengkel': False, 'sahabat': False, 'senang': False, 'musik': False, 'cerah': False, 'sejuk': False, 'suka': True, 'asa': False, 'tarik': False, 'dengar': False, 'tidak': False, 'makan': False, 'musuh': False}
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

n data awal: 17
n data hasil: 22

[(0, 'indah'), (1, 'mobil'), (2, 'letih'), (3, 'sungguh'), (4, 'olahraga'), (5, 'pandang'), (6, 'konser'), (7, 'bahagia'), (8, 'cinta'), (9, 'bagus'), (10, 'jengkel'), (11, 'sahabat'), (12, 'senang'), (13, 'musik'), (14, 'cerah'), (15, 'sejuk'), (16, 'suka'), (17, 'asa'), (18, 'tarik'), (19, 'dengar'), (20, 'tidak'), (21, 'makan'), (22, 'musuh')]

1 [1, 16] ['mobil' 'suka'] 1
2 [0, 3, 5, 16] ['indah' 'sungguh' 'pandang' 'suka'] 1
3 [7] ['bahagia'] 1
4 [6, 18] ['konser' 'tarik'] 1
5 [11] ['sahabat'] 1
6 [16, 21

In [12]:
# training data
# dengan menggunakan algoritma naive bayes classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)

# print(classifier.show_most_informative_features())

# validasi dengan menggunakan kalimat sendiri
kalimat_tes = 'hAri Ini mEnyenAnGkan'
kalimat_tes = extract_features(kalimat_tes.split())
print(kalimat_tes)

print(classifier.classify(kalimat_tes))

# membandkan dengan SVM
LinearSVC_clf = SklearnClassifier(LinearSVC())
LinearSVC_clf.train(training_set)
print(LinearSVC_clf.classify(kalimat_tes))

{'indah': False, 'mobil': False, 'letih': False, 'sungguh': False, 'olahraga': False, 'pandang': False, 'konser': False, 'bahagia': False, 'cinta': False, 'bagus': False, 'jengkel': False, 'sahabat': False, 'senang': True, 'musik': False, 'cerah': False, 'sejuk': False, 'suka': False, 'asa': False, 'tarik': False, 'dengar': False, 'tidak': False, 'makan': False, 'musuh': False}
positive
positive
