In [36]:
import re
import nltk
import json
import pickle
import pandas as pd
import nltk.classify
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [37]:
# fungsi mengubah ke kata dasar
def getstemmer():
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    
    return stemmer

In [38]:
# fungsi memanggil stopword
def getstopword():
    fp = open(r'stopwords.txt', 'r')
    line = fp.readline()
    stopwords = []
    while line:
        word = line.strip()
        stopwords.append(word)
        line = fp.readline()
    fp.close()
    
    return stopwords

In [39]:
def getlistdaerah():
    files = ['provinsi.txt','kota.txt','kecamatan.txt','desa.txt']
    daerah = []
    
    for file in files:
        fp = open(file, 'r')
        line = fp.readline()
        
        while line:
            word = line.strip()
            daerah.append(word)
            line = fp.readline()
        fp.close()
    
    return daerah

In [40]:
def getlistberita():
    fp = open(r'list_berita.txt', 'r')
    line = fp.readline()
    berita = []
    while line:
        word = line.strip()
        berita.append(word)
        line = fp.readline()
    fp.close()
    
    return berita

In [41]:
# fungsi untuk mengolah kata sebelum proses
def preprocessing(tweet):
    stopwords = getstopword()
    stemmer   = getstemmer()
    daerah    = getlistdaerah()
    berita    = getlistberita()
    # membuat string ke huruf kecil
    tweet = tweet.lower()
    # menghapus tab
    tweet = tweet.strip()
    # menghapus new line 
    tweet = tweet.replace('\n',' ')
    # menghapus at dan hastag
    tweet = re.sub("@\w+(\s)",' ',tweet)
    tweet = re.sub("#\w+(\s)",' ',tweet)
    # menghapus character/link yang tidak digunakan dengan regex
    tweet = re.sub("http([^\s|,])+",' ', tweet)
    tweet = re.sub("www([^\s|,])+",' ', tweet)
    tweet = re.sub("^rt[\s]+", ' ', tweet)
    tweet = re.sub("[-()\"#%/@;:<>{}$`^+'”―=_~*&|.!?,]|\d",' ',tweet)
    tweet = re.sub("\d+",' ', tweet)
    # tokenize string
    tweets = word_tokenize(tweet)
    # menghapus beberapa kalimat yang tidak mengubah makna sentiment dalam kalimat dengan stopwords
    tweet = []    
    for word in tweets:
        if word not in stopwords and word not in daerah and word not in berita: 
            word = word.encode('ascii', 'ignore').decode('ascii')
            word = stemmer.stem(word)
            tweet.append(word)
        else:
            pass
            
    tweet = ' '.join(tweet)
    return tweet

In [47]:
# membuat kombinasi kata sesuai urutan
def create_ngram_features(words, n=2):
    words = nltk.word_tokenize(words)
    ngram_vocab = ngrams(words, n)
    my_dict = dict([(ng[0], True) for ng in ngram_vocab])

    return my_dict

In [48]:
# memanggil data set dan membagi menjadi data train dan data test
df = pd.read_csv('data_train_label.csv',sep=',')
x = df['text']
y = df['sentiment']

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=20)

In [50]:
x_train = x_train.to_list()
x_test = x_test.to_list()
y_train = y_train.to_list()
y_test = y_test.to_list()

train_set = []
test_set = []
list_tweet_test = []
list_kelas_test = []
list_hasil_prediksi = []

x_train_new = []
for index,tweet in enumerate (x_train):
    sentiment = y_train[index].lower()
    tweet = preprocessing(tweet)
    x_train_new.append(tweet)
    tweet = create_ngram_features(tweet)
    train_set.append((tweet,sentiment))

df = pd.DataFrame({'text': x_train_new})
df.to_csv('cek_1.csv', sep=';', encoding='utf-8', index=False)
classifier = nltk.NaiveBayesClassifier.train(train_set)

for index,tweet in enumerate (x_test):
    sentiment = y_test[index].lower()
    list_tweet_test.append(tweet)
    list_kelas_test.append(sentiment)
    tweet = preprocessing(tweet)
    tweet = create_ngram_features(tweet)
    prediksi = classifier.classify(tweet)
    list_hasil_prediksi.append(prediksi)
    test_set.append((tweet,sentiment))

df = pd.DataFrame({'text': list_tweet_test,'sentiment': list_kelas_test,'prediksi': list_hasil_prediksi})
df.to_csv('hasil_1.csv', sep=';', encoding='utf-8', index=False)

f = open('sentiment_1.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

accuracy = nltk.classify.util.accuracy(classifier, test_set)*100
print(accuracy)

57.20930232558139


In [51]:
classifier.show_most_informative_features(100)

Most Informative Features
                 syariah = True           positi : negati =     60.0 : 1.0
                 program = True           positi : negati =     31.3 : 1.0
               gravitasi = True           positi : netral =     21.5 : 1.0
                 lembaga = True           positi : netral =     17.0 : 1.0
                      si = True           negati : positi =     14.9 : 1.0
                    uang = True           positi : netral =     13.8 : 1.0
                  rahmat = True           positi : negati =     13.6 : 1.0
                    amin = True           positi : negati =     13.6 : 1.0
                    moga = True           positi : netral =     12.5 : 1.0
                     pkb = True           positi : negati =     12.4 : 1.0
                   milik = True           positi : netral =     12.3 : 1.0
                     gus = True           positi : netral =     12.1 : 1.0
                      lu = True           negati : positi =     11.7 : 1.0