In [172]:
import re
import nltk
import json
import pickle
import pandas as pd
from sklearn import svm
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [173]:
# function that provides the stemmer used to reduce words to their root form
def getstemmer():
    """Create a new StemmerFactory and return the stemmer it builds."""
    return StemmerFactory().create_stemmer()

In [174]:
# function that loads the stop words
def getstopword(path=r'stopwords.txt'):
    """Read the stop-word file and return its lines as a list of strings.

    Parameters
    ----------
    path : str, optional
        File to read, one entry per line. Defaults to ``stopwords.txt`` in the
        current working directory.

    Returns
    -------
    list of str
        Every line of the file with surrounding whitespace stripped, in file
        order. A blank line yields an empty string.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path, 'r') as fp:
        return [line.strip() for line in fp]

In [175]:
def getlistdaerah(files=('provinsi.txt', 'kota.txt', 'kecamatan.txt', 'desa.txt')):
    """Read the region-name files and return all their lines in one list.

    Parameters
    ----------
    files : iterable of str, optional
        Paths to read, in order. Defaults to the four files ``provinsi.txt``,
        ``kota.txt``, ``kecamatan.txt`` and ``desa.txt`` in the current
        working directory.

    Returns
    -------
    list of str
        The lines of every file, concatenated in the order of ``files``, each
        with surrounding whitespace stripped. A blank line yields an empty
        string.

    Raises
    ------
    OSError
        If any of the files cannot be opened.
    """
    daerah = []
    for filename in files:
        # The context manager closes each handle even if reading fails.
        with open(filename, 'r') as fp:
            daerah.extend(line.strip() for line in fp)
    return daerah

In [176]:
def getlistberita(path=r'list_berita.txt'):
    """Read the ``list_berita.txt`` word list and return its lines.

    Parameters
    ----------
    path : str, optional
        File to read, one entry per line. Defaults to ``list_berita.txt`` in
        the current working directory.

    Returns
    -------
    list of str
        Every line of the file with surrounding whitespace stripped, in file
        order. A blank line yields an empty string.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path, 'r') as fp:
        return [line.strip() for line in fp]

In [177]:
# Substitutions applied in order by preprocessing(); every match is replaced by
# a single space. Links are stripped first so that the later, broader patterns
# (notably "wk") cannot cut a URL in half and leave part of it behind.
_CLEANUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r"http([^\s|,])+",                               # links starting with http
    r"www([^\s|,])+",                                # links starting with www
    r"\w+@\w+|@\w+",                                 # word@word tokens and @mentions
    r"\w+#\w+|#\w+",                                 # hashtags
    r"wk",                                           # "wk" anywhere, also inside longer words
    r"^rt[\s]+",                                     # leading "rt" marker
    r"""[-()"#%/@;:<>{}$`^+'”―=_~*&|.!?,]|\d""",     # punctuation and digits
    r"(\[|\])",                                      # square brackets
))

# Lazily-built resources shared by every preprocessing() call. Re-running this
# cell resets the cache, so edited word-list files are picked up again.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return ``(ignored_words, stemmer)``, loading them on first use only."""
    if not _PREPROCESSING_CACHE:
        # One set gives O(1) membership tests and is equivalent to checking the
        # three lists one after another.
        ignored_words = set(getstopword())
        ignored_words.update(getlistdaerah())
        ignored_words.update(getlistberita())
        _PREPROCESSING_CACHE['ignored_words'] = ignored_words
        _PREPROCESSING_CACHE['stemmer'] = getstemmer()
    return _PREPROCESSING_CACHE['ignored_words'], _PREPROCESSING_CACHE['stemmer']


def preprocessing(tweet):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, strip surrounding whitespace, replace
    newlines with spaces, apply ``_CLEANUP_PATTERNS``, tokenise with
    ``word_tokenize``, drop non-ASCII characters from each token, discard
    tokens found in the stop-word, region or ``list_berita`` lists, and stem
    the remainder.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        when nothing survives.
    """
    ignored_words, stemmer = _preprocessing_resources()

    tweet = tweet.lower().strip().replace('\n', ' ')
    for pattern in _CLEANUP_PATTERNS:
        tweet = pattern.sub(' ', tweet)

    kept = []
    for word in word_tokenize(tweet):
        # Strip non-ASCII characters before filtering, so a token that becomes
        # empty or turns into an ignored word is not kept.
        word = word.encode('ascii', 'ignore').decode('ascii')
        if not word or word in ignored_words:
            continue
        stemmed = stemmer.stem(word)
        if stemmed:
            kept.append(stemmed)

    return ' '.join(kept)

In [178]:
# Word-level count vectoriser over unigrams and bigrams, tokenised by NLTK.
ngram_counter = CountVectorizer(
    analyzer='word',
    tokenizer=nltk.word_tokenize,
    ngram_range=(1, 2),
)

In [179]:
# load the labelled data set; it is split into train and test data afterwards
df = pd.read_csv('data_train_label.csv', sep=',')
x, y = df['text'], df['sentiment']

In [180]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

In [None]:
# Preprocess every training tweet. New names are used throughout so that the
# globals `x`, `x_train` and `df` keep their earlier meaning and this cell can
# be re-run without re-running the cells above it.
x_train_clean = [preprocessing(tweet) for tweet in x_train]

# Save the cleaned training text next to its label for manual inspection.
train_clean_df = pd.DataFrame({'text': x_train_clean, 'sentiment': y_train})
train_clean_df.to_csv('cek_3.csv', sep=';', encoding='utf-8', index=False)

# Fit the n-gram vocabulary and the IDF weights on the training data only.
# The count matrix is passed on in sparse form: TfidfTransformer accepts sparse
# input, and densifying an n-gram matrix can exhaust memory.
x_train_counts = ngram_counter.fit_transform(x_train_clean)
tf_idf = TfidfTransformer()
# `x_train_new` and `tf_idf` keep their names because later cells read them.
x_train_new = tf_idf.fit_transform(x_train_counts)

In [None]:
# Train a Multinomial Naive Bayes classifier on the tf-idf training features.
# The matrix is passed as-is: MultinomialNB accepts sparse input, so the dense
# copy made by .toarray() is not needed.
classifier = MultinomialNB()
model = classifier.fit(x_train_new, y_train)

# Persist the fitted model; the context manager closes the file even if
# pickling fails.
with open('sentiment_2.pickle', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Materialise the held-out split as plain lists so rows line up by position.
y_test = list(y_test)
list_tweet_test = list(x_test)
list_kelas_test = [label.lower() for label in y_test]

# Preprocessed test tweets; kept as a list of strings for the cells below.
new_x_test = [preprocessing(tweet) for tweet in list_tweet_test]

# Reuse the vocabulary and the tf_idf transformer fitted on the training data
# and only call transform() here. Creating a new TfidfTransformer and fitting
# it on each single test tweet would replace the training IDF weights, so the
# model would be scored on features it was never trained on.
test_features = tf_idf.transform(ngram_counter.transform(new_x_test))
list_hasil_prediksi = list(model.predict(test_features))

# Write the original tweet, its lower-cased true label and the prediction.
df = pd.DataFrame({'text': list_tweet_test,'sentiment': list_kelas_test,'prediksi': list_hasil_prediksi})
df.to_csv('hasil_2.csv', sep=';', encoding='utf-8', index=False)


In [None]:
# Vectorise the preprocessed test tweets under new names, so `new_x_test`
# stays a list of strings and this cell can be re-run safely.
x_test_counts = ngram_counter.transform(new_x_test)
# tf_idf is expected to be the transformer fitted on the training matrix;
# re-fitting it on test data would change the features the model sees.
x_test_tfidf = tf_idf.transform(x_test_counts)
result_prediction = model.predict(x_test_tfidf)
# Accuracy in percent; scikit-learn's argument order is (y_true, y_pred).
accuracy_score(y_test, result_prediction) * 100

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(ngram_counter, 'get_feature_names_out'):
    feature_names = list(ngram_counter.get_feature_names_out())
else:
    feature_names = ngram_counter.get_feature_names()
print(feature_names)