In [15]:
import re
import nltk
import json
import pickle
import pandas as pd
import nltk.classify
from nltk import ngrams
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [16]:
#fungsi mengubah ke kata dasar
def getstemmer():
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    
    return stemmer

In [17]:
#fungsi memanggil stopword
def getstopword():
    fp = open(r'stopwords.txt', 'r')
    line = fp.readline()
    stopwords = []
    while line:
        word = line.strip()
        stopwords.append(word)
        line = fp.readline()
    fp.close()
    
    return stopwords

In [18]:
#fungsi untuk mengolah kata sebelum proses
def preprocessing(tweet):
    stopwords = getstopword()
    stemmer   = getstemmer()
    #membuat string ke huruf kecil
    tweet = tweet.lower()
    #menghapus character/link yang tidak digunakan dengan regex
    tweet = re.sub("\d+",' ', tweet)
    tweet = re.sub("http([^\s|,])+",' ', tweet)
    tweet = re.sub("www([^\s|,])+",' ', tweet)
    tweet = re.sub("^rt[\s]+", ' ', tweet)
    tweet = re.sub("[-()\"#%/@;:<>{}$`^+'=~*&|.!?,]|\d",' ',tweet)
    #tokenize string
    tweets = word_tokenize(tweet)
    #menghapus beberapa kalimat yang tidak mengubah makna sentiment dalam kalimat dengan stopwords
    tweet = []    
    for word in tweets:
        if word not in stopwords: 
            word = stemmer.stem(word)
            tweet.append(word)
        else:
            pass
            
    tweet = ' '.join(tweet)
    
    return tweet

In [73]:
#membuat kombinasi kata sesuai urutan
def create_ngram_features(words, n=1):
    words = nltk.word_tokenize(words)
    ngram_vocab = ngrams(words, n)
    my_dict = dict([(ng, True) for ng in ngram_vocab])
    return my_dict

In [60]:
df = pd.read_csv('dataset.csv')
tweet_list = df['text'][:5000]
sentiment_list = df['sentiment'][:5000]

for index,tweet in enumerate(tweet_list):
    tweet = preprocessing(tweet)
    tweet_list[index] = tweet

In [74]:
# membagi data train dan data test dari satu dataset
x_train, x_test, y_train, y_test = train_test_split(tweet_list, sentiment_list, test_size=0.2, random_state=10)

In [77]:
train_set = []
test_set = []

x_train = [x for x in x_train]
x_test = [x for x in x_test]
y_train = [x for x in y_train]
y_test = [x for x in y_test]

for index,tweet in enumerate (x_train): 
    sentiment = y_train[index]
    tweet = create_ngram_features(tweet)
    train_set.append((tweet,sentiment))
    
for index,tweet in enumerate (x_test): 
    sentiment = y_test[index]
    tweet = create_ngram_features(tweet)
    test_set.append((tweet,sentiment))

classifier = nltk.NaiveBayesClassifier.train(train_set)

f = open('sentiment_1.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

accuracy = nltk.classify.util.accuracy(classifier, test_set)*100
print(accuracy)

78.5


In [79]:
classifier.show_most_informative_features(10)

Most Informative Features
              ('lemah',) = True           Negati : Positi =     21.6 : 1.0
            ('sejarah',) = True           Negati : Positi =     20.3 : 1.0
             ('anjlok',) = True           Negati : Positi =     15.4 : 1.0
              ('resmi',) = True           Positi : Negati =     13.8 : 1.0
               ('buat',) = True           Positi : Negati =     13.8 : 1.0
            ('selasar',) = True           Negati : Positi =     13.7 : 1.0
           ('cikampek',) = True           Negati : Positi =     13.7 : 1.0
               ('lika',) = True           Negati : Positi =     12.2 : 1.0
               ('liku',) = True           Negati : Positi =     12.2 : 1.0
             ('ambruk',) = True           Negati : Positi =     11.5 : 1.0
