In [26]:
import re
import nltk
import json
import pickle
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Function returning the stemmer that reduces words to their root form.
_STEMMER = None  # lazily created, shared stemmer instance


def getstemmer():
    """Return a Sastrawi stemmer, creating it on first use only.

    The stemmer is built through ``StemmerFactory().create_stemmer()`` the
    first time this function is called and the same object is handed back on
    every later call. ``preprocessing`` asks for a stemmer once per tweet, so
    without this memoization the factory would rebuild the stemmer for every
    single row (presumably reloading its root-word dictionary each time).

    Returns:
        The object produced by ``StemmerFactory.create_stemmer()``; callers
        in this notebook use its ``stem(word)`` method.
    """
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER

In [3]:
# Function loading the stop-word list.
def getstopword(path=r'stopwords.txt'):
    """Read the stop-word file and return its entries as a list.

    Each line of the file becomes one entry, with surrounding whitespace
    (including the trailing newline) stripped. Blank lines are kept and show
    up as empty strings, exactly as lines are encountered in the file.

    Args:
        path: Location of the stop-word file, one word per line. Defaults to
            ``stopwords.txt`` in the current working directory.

    Returns:
        list[str]: The stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager guarantees the handle is closed even when reading fails.
    with open(path, 'r') as fp:
        return [line.strip() for line in fp]

In [4]:
def preprocessing(tweet, stopwords=None, stemmer=None):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. blank out links (``http...`` / ``www...``), a leading ``rt`` marker,
         punctuation/symbols and digits;
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in the stop-word collection and stem the rest.

    Links are removed *before* digits. Stripping digits first would cut a URL
    such as ``http://t.co/a1b2`` at its first digit, so the link pattern would
    only match the leading part and the tail (``b``) would survive as a token.

    Args:
        tweet: Raw tweet text (``str``).
        stopwords: Optional iterable of words to discard. When omitted the
            list is loaded with ``getstopword()``.
        stemmer: Optional object exposing ``stem(word)``. When omitted one is
            obtained from ``getstemmer()``.
            Passing both lets a caller that processes many tweets prepare
            them once instead of on every call.

    Returns:
        str: The remaining stemmed tokens joined by single spaces (an empty
        string when nothing is left).
    """
    if stopwords is None:
        stopwords = getstopword()
    if stemmer is None:
        stemmer = getstemmer()
    # Set membership is constant-time; the list would be scanned once per token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    # lower-case the whole string
    tweet = tweet.lower()
    # remove unwanted links/characters with regexes (raw strings so that \s and \d
    # reach the regex engine instead of being treated as string escapes)
    tweet = re.sub(r"http([^\s|,])+", ' ', tweet)
    tweet = re.sub(r"www([^\s|,])+", ' ', tweet)
    tweet = re.sub(r"^rt[\s]+", ' ', tweet)
    # punctuation/symbols and every digit become a space
    tweet = re.sub(r"[-()\"#%/@;:<>{}$`^+'=~*&|.!?,]|\d", ' ', tweet)

    # tokenize the cleaned string
    tokens = word_tokenize(tweet)

    # drop stop words (they do not change the sentiment of the sentence) and stem the rest
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stopwords)

In [5]:
# Word-level bag-of-n-grams vectorizer: features are sequences of 2 to 4 consecutive
# words (single words are not counted on their own).
ngram_counter = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 4),
)

In [42]:
# Number of leading rows of the dataset used for training and evaluation.
N_SAMPLES = 5000

df = pd.read_csv('dataset.csv')

# Clean every tweet with preprocessing(). Series.map returns a new Series that keeps
# the original index, so nothing is assigned element-by-element into a slice of `df`
# (which is chained assignment and only lines up when the index is the default 0..n-1).
tweet_list = df['text'][:N_SAMPLES].map(preprocessing)
sentiment_list = df['sentiment'][:N_SAMPLES]

In [43]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    tweet_list,
    sentiment_list,
    test_size=0.2,
    random_state=10,
)

In [44]:
# Fit the n-gram vocabulary on the training tweets only, then encode the test tweets
# with that same vocabulary. Both results are converted to dense arrays.
x_train = ngram_counter.fit_transform(x_train).toarray()
x_test = ngram_counter.transform(x_test).toarray()

In [45]:
# Train a multi-layer perceptron with scikit-learn's default hyper-parameters.
# NOTE: no random_state is passed, so the fitted weights (and the score shown below)
# can differ from one run to the next.
classifier = MLPClassifier()
model = classifier.fit(x_train, y_train)

# Persist the fitted classifier; the context manager closes the file even if dumping fails.
# NOTE(review): only the classifier is written here. ngram_counter is not part of this
# pickle, so whatever loads it needs the same fitted vectorizer from somewhere else.
with open('sentiment_2.pickle', 'wb') as f:
    pickle.dump(model, f)

result_prediction = model.predict(x_test)

# accuracy_score is documented as (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, result_prediction)

0.827